# Overview

This is a scratch notebook where I conducted much of my data exploration: finding column data types, creating subsets of the data, combining the Texas Department of State Health Services overdose death data, etc.

In [1]:
# Imports
import os

import pandas as pd

In [2]:
# After a brief exploration, these are the datatypes per column of the
# Washington Post dataset. Making them explicit for speed here
dtypes = {
    "REPORTER_DEA_NO" : "object",
    "REPORTER_BUS_ACT" : "object",
    "REPORTER_NAME" : "object",
    "REPORTER_ADDL_CO_INFO" : "object",
    "REPORTER_ADDRESS1" : "object",
    "REPORTER_ADDRESS2" : "object",
    "REPORTER_CITY" : "object",
    "REPORTER_STATE" : "object",
    "REPORTER_ZIP" : "int64",
    "REPORTER_COUNTY" : "object",
    "BUYER_DEA_NO" : "object",
    "BUYER_BUS_ACT" : "object",
    "BUYER_NAME" : "object",
    "BUYER_ADDL_CO_INFO" : "object",
    "BUYER_ADDRESS1" : "object",
    "BUYER_ADDRESS2" : "object",
    "BUYER_CITY" : "object",
    "BUYER_STATE" : "object",
    "BUYER_ZIP" : "int64",
    "BUYER_COUNTY" : "object",
    "TRANSACTION_CODE" : "object",
    "DRUG_CODE" : "int64",
    "NDC_NO" : "object",
    "DRUG_NAME" : "object",
    "QUANTITY" : "float64",
    "UNIT" : "float64",
    "ACTION_INDICATOR" : "object",
    "ORDER_FORM_NO" : "object",
    "CORRECTION_NO" :  "float64",
    "STRENGTH" : "float64",
    "TRANSACTION_DATE" : "int64",
    "CALC_BASE_WT_IN_GM" : "float64",
    "DOSAGE_UNIT" : "float64",
    "TRANSACTION_ID" : "int64",
    "Product_Name" : "object",
    "Ingredient_Name" : "object",
    "Measure" : "object",
    "MME_Conversion_Factor" : "float64",
    "Combined_Labeler_Name" : "object",
    "Revised_Company_Name" : "object",
    "Reporter_family" : "object",
    "dos_str" : "float64"
}
wp_data = pd.read_csv("data/arcos-tx-statewide-itemized.tsv", sep='\t', dtype=dtypes)

In [4]:
# Preview the first five rows of the loaded frame
wp_data.head(n=5)

Unnamed: 0,REPORTER_DEA_NO,REPORTER_BUS_ACT,REPORTER_NAME,REPORTER_ADDL_CO_INFO,REPORTER_ADDRESS1,REPORTER_ADDRESS2,REPORTER_CITY,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,...,DOSAGE_UNIT,TRANSACTION_ID,Product_Name,Ingredient_Name,Measure,MME_Conversion_Factor,Combined_Labeler_Name,Revised_Company_Name,Reporter_family,dos_str
0,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,500.0,3156,HYDROCODONE BIT 5MG/ACETAMINOPHEN 50,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,"Actavis Pharma, Inc.","Allergan, Inc.",Bellco Drug Corp,5.0
1,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,500.0,617,HYDROCODONE BIT 5MG/ACETAMINOPHEN 50,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,"Actavis Pharma, Inc.","Allergan, Inc.",Bellco Drug Corp,5.0
2,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,400.0,671,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,Par Pharmaceutical,"Endo Pharmaceuticals, Inc.",Bellco Drug Corp,10.0
3,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,100.0,1300,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,Par Pharmaceutical,"Endo Pharmaceuticals, Inc.",Bellco Drug Corp,10.0
4,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,100.0,4521,HYDROCODONE BIT. 10MG/ACETAMINOPHEN,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,"Actavis Pharma, Inc.","Allergan, Inc.",Bellco Drug Corp,10.0


#### Checking for most common values, for nulls, etc:

In [56]:
# Per-column dtype and non-null count.
# `null_counts` was deprecated in pandas 1.2 in favour of `show_counts` and
# later removed, so try the current keyword first and fall back to the old
# one on pandas versions that predate `show_counts`.
try:
    wp_data.info(show_counts=True)
except TypeError:
    # Older pandas rejects the unknown `show_counts` keyword before printing
    wp_data.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12108468 entries, 0 to 12108467
Data columns (total 42 columns):
REPORTER_DEA_NO          12108468 non-null object
REPORTER_BUS_ACT         12108468 non-null object
REPORTER_NAME            12108468 non-null object
REPORTER_ADDL_CO_INFO    820991 non-null object
REPORTER_ADDRESS1        12108468 non-null object
REPORTER_ADDRESS2        1922155 non-null object
REPORTER_CITY            12108468 non-null object
REPORTER_STATE           12108468 non-null object
REPORTER_ZIP             12108468 non-null int64
REPORTER_COUNTY          12108468 non-null object
BUYER_DEA_NO             12108468 non-null object
BUYER_BUS_ACT            12108468 non-null object
BUYER_NAME               12108468 non-null object
BUYER_ADDL_CO_INFO       4664223 non-null object
BUYER_ADDRESS1           12108468 non-null object
BUYER_ADDRESS2           1685301 non-null object
BUYER_CITY               12108468 non-null object
BUYER_STATE              12108468 non-nul

In [5]:
# Five most frequent buyer names, by number of rows
buyer_counts = wp_data["BUYER_NAME"].value_counts()
buyer_counts.head()

WALGREEN CO.                           2386212
CVS PHARMACY, INC.                     1222795
ALBERTSONS LLC                          260251
RANDALLS FOOD AND DRUGS LP              225028
TARGET STORES A DIV.OF TARGET CORP.     132679
Name: BUYER_NAME, dtype: int64

In [6]:
# Tally the ACTION_INDICATOR codes; value_counts leaves NaN out by default,
# so only the populated rows are counted here
wp_data["ACTION_INDICATOR"].value_counts(dropna=True)

I    407325
A       722
Name: ACTION_INDICATOR, dtype: int64

In [69]:
# First step towards parsing the transaction timestamps: look at the raw values
wp_data["TRANSACTION_DATE"].head(n=10)

0     1112006
1     1042006
2    11022006
3     1042007
4     2142007
5     3282007
6     4242007
7     3122007
8     8222006
9     7172006
Name: TRANSACTION_DATE, dtype: int64

In [71]:
# Smallest raw values first (at this point the column is still integer-typed,
# so this is a numeric ordering, not a chronological one)
wp_data["TRANSACTION_DATE"].sort_values(ascending=True).head(n=5)

3308482     1012006
953863      1012006
10072495    1012006
8721814     1012006
953809      1012006
Name: TRANSACTION_DATE, dtype: int64

In [74]:
# Dates whose month has a single digit are missing their preceding zero
# (e.g. 1112006), so they are only 7 digits long. Cast the column to strings
# and left-pad with zeros so that every date has 8 digits.
wp_data["TRANSACTION_DATE"] = (
    wp_data["TRANSACTION_DATE"]
    .astype("str")
    .str.zfill(8)
)

In [75]:
# Check the zero-padded strings
wp_data["TRANSACTION_DATE"].head(n=5)

0    01112006
1    01042006
2    11022006
3    01042007
4    02142007
Name: TRANSACTION_DATE, dtype: object

In [77]:
# Parse the 8-character MMDDYYYY strings into datetimes
padded_dates = wp_data["TRANSACTION_DATE"]
wp_data["TRANSACTION_DATE"] = pd.to_datetime(padded_dates, format="%m%d%Y")

In [78]:
# Confirm the column now holds parsed dates
wp_data["TRANSACTION_DATE"].head(n=10)

0   2006-01-11
1   2006-01-04
2   2006-11-02
3   2007-01-04
4   2007-02-14
5   2007-03-28
6   2007-04-24
7   2007-03-12
8   2006-08-22
9   2006-07-17
Name: TRANSACTION_DATE, dtype: datetime64[ns]

In [85]:
# The Washington Post reported 5,432,109,643 pills supplied to TX between
# 2006 and 2012. Summing the DOSAGE_UNIT column shows that this is the column
# they used to arrive at that number.
total_dosage_units = wp_data["DOSAGE_UNIT"].sum()
total_dosage_units

5432109643.5

#### Creating smaller subsets of the Washington Post dataset:

In [8]:
# Columns left out of the smaller working copy of the dataset
columns_to_drop = [
    "REPORTER_DEA_NO",
    "REPORTER_BUS_ACT",
    "REPORTER_ADDL_CO_INFO",
    "REPORTER_ADDRESS1",
    "REPORTER_ADDRESS2",
    "REPORTER_CITY",
    "REPORTER_COUNTY",
    "BUYER_DEA_NO",
    "BUYER_ADDL_CO_INFO",
    "DRUG_CODE",
    "NDC_NO",
    "UNIT",
    "ACTION_INDICATOR",
]
subset = wp_data.drop(columns=columns_to_drop)

#### Reading in TX DSHS data to combine

In [12]:
# Load the yearly TX DSHS deaths-by-county files for 2003 through 2015.
# Every file follows the same naming pattern, so the path is built from the
# year instead of repeating thirteen near-identical read_csv calls.
dshs_by_year = {
    year: pd.read_csv("data/TXDSHS/{}TXDeaths_by_County_data.csv".format(year))
    for year in range(2003, 2016)  # 2016 is exclusive: last year read is 2015
}

# Bind the per-year names that the cells further down refer to
(dshs_2003, dshs_2004, dshs_2005, dshs_2006, dshs_2007, dshs_2008, dshs_2009,
 dshs_2010, dshs_2011, dshs_2012, dshs_2013, dshs_2014,
 dshs_2015) = [dshs_by_year[year] for year in range(2003, 2016)]

In [13]:
# A '---' entry is not a null: it indicates the count was suppressed for
# privacy purposes, and that there were between 1 and 9 deaths
dshs_2006.head(n=5)

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,---,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,---,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [14]:
# How often each reported value (including the '---' marker) occurs in 2006
deaths_2006 = dshs_2006["Number of Deaths"]
deaths_2006.value_counts()

0      149
---     88
30       2
40       1
13       1
24       1
10       1
15       1
52       1
45       1
12       1
43       1
78       1
11       1
80       1
220      1
31       1
18       1
Name: Number of Deaths, dtype: int64

In [41]:
# (rows, columns) of the 2003 frame
dshs_2003.shape

(254, 6)

In [19]:
# And now, to combine the yearly frames into one wide frame: one row per
# county, with one "Deaths_<year>" column per year. The county name and
# coordinates are taken from the 2012 frame.
#
# Column assignment lines rows up by index, not by county name, so this only
# works if every yearly frame lists the same counties in the same order. That
# assumption is checked explicitly below rather than trusted.
yearly_frames = [
    dshs_2003, dshs_2004, dshs_2005, dshs_2006, dshs_2007, dshs_2008,
    dshs_2009, dshs_2010, dshs_2011, dshs_2012, dshs_2013, dshs_2014,
    dshs_2015,
]
reference_counties = dshs_2012["County Name"]

dshs = pd.DataFrame({
    "County Name": reference_counties,
    "Latitude": dshs_2012["Latitude (generated)"],
    "Longitude": dshs_2012["Longitude (generated)"],
})

for year, yearly_frame in zip(range(2003, 2016), yearly_frames):
    # Fail loudly instead of silently pairing deaths with the wrong county
    assert yearly_frame["County Name"].equals(reference_counties), (
        "County order in the {} file differs from the 2012 file".format(year)
    )
    dshs["Deaths_{}".format(year)] = yearly_frame["Number of Deaths"]

In [20]:
# Preview the combined frame
dshs.head(n=5)

Unnamed: 0,County Name,Latitude,Longitude,Deaths_2003,Deaths_2004,Deaths_2005,Deaths_2006,Deaths_2007,Deaths_2008,Deaths_2009,Deaths_2010,Deaths_2011,Deaths_2012,Deaths_2013,Deaths_2014,Deaths_2015
0,Zavala,28.866,-99.761,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Zapata,26.971,-99.203,0,0,0,0,0,---,---,0,---,0,0,0,0
2,Young,33.175,-98.687,---,---,0,---,---,---,0,---,0,0,---,---,---
3,Yoakum,33.173,-102.829,0,0,---,0,0,0,0,0,0,0,0,0,0
4,Wood,32.783,-95.407,---,0,0,---,---,---,---,0,---,---,---,---,---


In [52]:
# Saving as a CSV.
# The file is only written when it does not exist yet: a run from a fresh
# checkout can then reach the read-back in the next cell, while an existing
# copy on disk is never overwritten.
combined_csv = "data/TXDSHS/Combined_TXDeaths_by_County_Data.csv"
if not os.path.exists(combined_csv):
    dshs.to_csv(combined_csv, index=False)

In [53]:
# Sanity check: load the combined CSV back in from disk
combined_csv_path = "data/TXDSHS/Combined_TXDeaths_by_County_Data.csv"
dshs_data = pd.read_csv(combined_csv_path)

In [54]:
# Preview the frame that was read back from the CSV
dshs_data.head(n=5)

Unnamed: 0,County Name,Latitude,Longitude,Deaths_2003,Deaths_2004,Deaths_2005,Deaths_2006,Deaths_2007,Deaths_2008,Deaths_2009,Deaths_2010,Deaths_2011,Deaths_2012,Deaths_2013,Deaths_2014,Deaths_2015
0,Zavala,28.866,-99.761,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Zapata,26.971,-99.203,0,0,0,0,0,---,---,0,---,0,0,0,0
2,Young,33.175,-98.687,---,---,0,---,---,---,0,---,0,0,---,---,---
3,Yoakum,33.173,-102.829,0,0,---,0,0,0,0,0,0,0,0,0,0
4,Wood,32.783,-95.407,---,0,0,---,---,---,---,0,---,---,---,---,---


In [55]:
# (rows, columns) of the frame read back from the combined CSV
dshs_data.shape

(254, 16)