# Overview

This is a scratch notebook where I conducted much of my data exploration: finding column data types, creating subsets of the data, combining the Texas Department of State Health Services overdose death data, etc.

## Washington Post Data

Source: [Washington Post DEA Database](https://www.washingtonpost.com/graphics/2019/investigations/dea-pain-pill-database/)

In [1]:
# Imports

# Data import and manipulation
import pandas as pd
# Math
import numpy as np
# Let's go ahead and seed the notebook, for reproducibility
np.random.seed(113)

In [3]:
# Column dtypes for the Washington Post dataset, worked out during a brief
# exploration. Spelling them out up front makes the load faster.
text_columns = [
    "REPORTER_DEA_NO", "REPORTER_BUS_ACT", "REPORTER_NAME",
    "REPORTER_ADDL_CO_INFO", "REPORTER_ADDRESS1", "REPORTER_ADDRESS2",
    "REPORTER_CITY", "REPORTER_STATE", "REPORTER_COUNTY",
    "BUYER_DEA_NO", "BUYER_BUS_ACT", "BUYER_NAME", "BUYER_ADDL_CO_INFO",
    "BUYER_ADDRESS1", "BUYER_ADDRESS2", "BUYER_CITY", "BUYER_STATE",
    "BUYER_COUNTY",
    "TRANSACTION_CODE", "NDC_NO", "DRUG_NAME", "ACTION_INDICATOR",
    "ORDER_FORM_NO",
    "Product_Name", "Ingredient_Name", "Measure", "Combined_Labeler_Name",
    "Revised_Company_Name", "Reporter_family",
]
integer_columns = [
    "REPORTER_ZIP", "BUYER_ZIP", "DRUG_CODE", "TRANSACTION_DATE",
    "TRANSACTION_ID",
]
float_columns = [
    "QUANTITY", "UNIT", "CORRECTION_NO", "STRENGTH", "CALC_BASE_WT_IN_GM",
    "DOSAGE_UNIT", "MME_Conversion_Factor", "dos_str",
]
dtypes = {
    **dict.fromkeys(text_columns, "object"),
    **dict.fromkeys(integer_columns, "int64"),
    **dict.fromkeys(float_columns, "float64"),
}
wp_data = pd.read_csv("data/arcos-tx-statewide-itemized.tsv", sep='\t',
                      dtype=dtypes)

In [4]:
wp_data.head()

Unnamed: 0,REPORTER_DEA_NO,REPORTER_BUS_ACT,REPORTER_NAME,REPORTER_ADDL_CO_INFO,REPORTER_ADDRESS1,REPORTER_ADDRESS2,REPORTER_CITY,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,...,DOSAGE_UNIT,TRANSACTION_ID,Product_Name,Ingredient_Name,Measure,MME_Conversion_Factor,Combined_Labeler_Name,Revised_Company_Name,Reporter_family,dos_str
0,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,500.0,3156,HYDROCODONE BIT 5MG/ACETAMINOPHEN 50,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,"Actavis Pharma, Inc.","Allergan, Inc.",Bellco Drug Corp,5.0
1,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,500.0,617,HYDROCODONE BIT 5MG/ACETAMINOPHEN 50,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,"Actavis Pharma, Inc.","Allergan, Inc.",Bellco Drug Corp,5.0
2,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,400.0,671,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,Par Pharmaceutical,"Endo Pharmaceuticals, Inc.",Bellco Drug Corp,10.0
3,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,100.0,1300,"HYDROCODONE.BIT./ACET.,10MG & 325MG/",HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,Par Pharmaceutical,"Endo Pharmaceuticals, Inc.",Bellco Drug Corp,10.0
4,PB0034861,DISTRIBUTOR,BELLCO DRUG CORP,,5500 NEW HORIZONS BLVD,,NORTH AMITYVILLE,NY,11701,SUFFOLK,...,100.0,4521,HYDROCODONE BIT. 10MG/ACETAMINOPHEN,HYDROCODONE BITARTRATE HEMIPENTAHYDRATE,TAB,1.0,"Actavis Pharma, Inc.","Allergan, Inc.",Bellco Drug Corp,10.0


#### Checking for most common values, for nulls, etc:

In [5]:
# Column summary with the non-null count for every column.
# NOTE(review): `null_counts` was deprecated in pandas 1.2 in favour of
# `show_counts` and removed in pandas 2.0 - rename it when upgrading pandas.
wp_data.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12108468 entries, 0 to 12108467
Data columns (total 42 columns):
REPORTER_DEA_NO          12108468 non-null object
REPORTER_BUS_ACT         12108468 non-null object
REPORTER_NAME            12108468 non-null object
REPORTER_ADDL_CO_INFO    820991 non-null object
REPORTER_ADDRESS1        12108468 non-null object
REPORTER_ADDRESS2        1922155 non-null object
REPORTER_CITY            12108468 non-null object
REPORTER_STATE           12108468 non-null object
REPORTER_ZIP             12108468 non-null int64
REPORTER_COUNTY          12108468 non-null object
BUYER_DEA_NO             12108468 non-null object
BUYER_BUS_ACT            12108468 non-null object
BUYER_NAME               12108468 non-null object
BUYER_ADDL_CO_INFO       4664223 non-null object
BUYER_ADDRESS1           12108468 non-null object
BUYER_ADDRESS2           1685301 non-null object
BUYER_CITY               12108468 non-null object
BUYER_STATE              12108468 non-nul

In [None]:
wp_data["BUYER_NAME"].value_counts().head()

In [None]:
wp_data["ACTION_INDICATOR"].value_counts()

In [7]:
# Let's try to parse through the time stamp on these transactions
wp_data["TRANSACTION_DATE"].head(10)

0     1112006
1     1042006
2    11022006
3     1042007
4     2142007
5     3282007
6     4242007
7     3122007
8     8222006
9     7172006
Name: TRANSACTION_DATE, dtype: int64

In [None]:
wp_data["TRANSACTION_DATE"].sort_values().head()

3308482     1012006
953863      1012006
10072495    1012006
8721814     1012006
953809      1012006
Name: TRANSACTION_DATE, dtype: int64

In [6]:
# The dates were read as integers, so months with one digit lost their
# leading zero (e.g. 1112006). Cast the column to strings, then left-pad with
# zeros so that every date has 8 digits.
wp_data["TRANSACTION_DATE"] = (
    wp_data["TRANSACTION_DATE"].astype('str').str.zfill(8)
)

In [7]:
# Much better
wp_data["TRANSACTION_DATE"].head()

0    01112006
1    01042006
2    11022006
3    01042007
4    02142007
Name: TRANSACTION_DATE, dtype: object

In [8]:
# Parse the 8-digit month-day-year strings into datetime values
parsed_dates = pd.to_datetime(wp_data["TRANSACTION_DATE"], format='%m%d%Y')
wp_data["TRANSACTION_DATE"] = parsed_dates

In [None]:
# Success
wp_data["TRANSACTION_DATE"].head(10)

In [None]:
# WP said they found 5,432,109,643 pills supplied to TX between 2006 and 2012
# Can see they arrived at that number using the Dosage Unit column
wp_data["DOSAGE_UNIT"].sum()

#### Creating smaller subsets of the Washington Post dataset:

In [None]:
# Columns left out of the smaller working copy of the dataset
dropped_columns = [
    "REPORTER_DEA_NO", "REPORTER_BUS_ACT", "REPORTER_ADDL_CO_INFO",
    "REPORTER_ADDRESS1", "REPORTER_ADDRESS2", "REPORTER_CITY",
    "REPORTER_COUNTY", "BUYER_DEA_NO", "BUYER_ADDL_CO_INFO", "DRUG_CODE",
    "NDC_NO", "UNIT", "ACTION_INDICATOR",
]
subset = wp_data.drop(columns=dropped_columns)

## Opioid Overdose Death Data

Source: [Texas Department of State Health Services](http://healthdata.dshs.texas.gov/Opioids/Deaths)

In [9]:
# One DSHS deaths-by-county file per year, 2003 through 2015, each loaded into
# its own dataframe
(dshs_2003, dshs_2004, dshs_2005, dshs_2006, dshs_2007, dshs_2008, dshs_2009,
 dshs_2010, dshs_2011, dshs_2012, dshs_2013, dshs_2014, dshs_2015) = (
    pd.read_csv(f"data/TXDSHS/{year}TXDeaths_by_County_data.csv")
    for year in range(2003, 2016)
)

In [10]:
# Note that '---' isn't a null: it indicates the data was suppressed for
# privacy purposes, and there were between 1 and 9 deaths
dshs_2006.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,---,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,---,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [11]:
dshs_2006.shape

(254, 6)

In [12]:
# First, want to prepare these dataframes for a multi-index for modeling
# Creating a list of our yearly dataframes - only 2006 through 2012 are
# included here
dshs_dfs = [dshs_2006, dshs_2007, dshs_2008, dshs_2009, dshs_2010, 
            dshs_2011, dshs_2012]
# Just stacking them row-wise, using concat. The index is not reset, so each
# year's frame keeps its own row labels in the stacked result.
dshs_all = pd.concat(dshs_dfs)

In [13]:
dshs_all.shape

(1778, 6)

In [14]:
dshs_all.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,---,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,---,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [15]:
# Remember that '---' is a placeholder. The source gives the total number of
# deaths per year, so an average can stand in for '---' in each year.
# First, count the placeholders per year - there are 626 in all.
placeholder_years = dshs_all.loc[dshs_all["Number of Deaths"] == "---",
                                 "Year (copy)"]
year_placeholders = {
    year: int((placeholder_years == year).sum())
    for year in range(2006, 2013)
}

In [16]:
year_placeholders

{2006: 88, 2007: 88, 2008: 102, 2009: 85, 2010: 89, 2011: 95, 2012: 79}

In [17]:
# Now, let's find the number of deaths we have in this database: count every
# suppressed '---' entry as 0 for the moment.
# The result is assigned back to the column on purpose. Calling
# replace(..., inplace=True) on a column selection only modifies a temporary
# object under pandas Copy-on-Write and would leave dshs_all untouched.
dshs_all["Number of Deaths"] = dshs_all["Number of Deaths"].replace(
    to_replace="---", value=0)

In [18]:
# Need to set the Number of Deaths column as an integer
dshs_all["Number of Deaths"] = dshs_all["Number of Deaths"].astype("int64")

In [19]:
# Dictionary with each year as the key and the sum of that year's reported
# county deaths as the value
total_deaths = {
    year: dshs_all.loc[dshs_all["Year (copy)"] == year,
                       "Number of Deaths"].sum()
    for year in range(2006, 2013)
}

In [20]:
total_deaths

{2006: 752, 2007: 850, 2008: 760, 2009: 847, 2010: 781, 2011: 853, 2012: 804}

In [21]:
# These actual counts of the total number of opioid overdose deaths in Texas
# come from the DSHS website, listed here in year order for 2006 through 2012
actual_deaths = dict(zip(
    range(2006, 2013),
    (972, 1044, 1008, 1038, 1027, 1070, 1016),
))

In [22]:
# Annual average for the placeholder counties: the deaths missing from our
# yearly sum, spread evenly over that year's placeholder entries
average_placeholder = {
    year: (actual_deaths[year] - total_deaths[year]) / year_placeholders[year]
    for year in range(2006, 2013)
}

In [23]:
average_placeholder

{2006: 2.5,
 2007: 2.2045454545454546,
 2008: 2.4313725490196076,
 2009: 2.2470588235294118,
 2010: 2.764044943820225,
 2011: 2.2842105263157895,
 2012: 2.6835443037974684}

In [24]:
# The yearly averages above fall between roughly 2.2 and 2.8, so 2.5 is a
# reasonable single estimate to use for every placeholder in our analysis -
# so let's replace them.
# Previously we replaced those --- with zeros in dshs_all, so we need a fresh
# dataframe, stacked from the same yearly frames, to work with
dshs_for_csv = pd.concat(dshs_dfs)
dshs_for_csv.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,---,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,---,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [25]:
# Swap every suppressed '---' entry for the 2.5 estimate.
# The result is assigned back to the column on purpose. Calling
# replace(..., inplace=True) on a column selection only modifies a temporary
# object under pandas Copy-on-Write and would leave dshs_for_csv untouched.
# The cast to float makes the column numeric regardless of pandas version.
dshs_for_csv["Number of Deaths"] = (
    dshs_for_csv["Number of Deaths"]
    .replace(to_replace="---", value="2.5")
    .astype("float64")
)

In [26]:
dshs_for_csv.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [27]:
# Writing to a csv
dshs_for_csv.to_csv(r"data/TXDSHS/TXDeaths_by_County_Data_NullsRemoved.csv",
                    index=False)

In [28]:
# Sanity check
dshs_test = pd.read_csv(
    "data/TXDSHS/TXDeaths_by_County_Data_NullsRemoved.csv")
dshs_test.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


#### Population Data

Source: [Texas State Library and Archives Commission](https://www.tsl.texas.gov/ref/abouttx/population.html), which links to the Census data to download.

Note that for every year I use the July county population estimates, even for years in which a census was conducted. This is for consistency: July estimates are the only figures available in non-census years, so I'd rather use them throughout than switch to the April 2010 census count and then back to the July 2011 estimates (for example).

In [29]:
# Loading in the data for the 2000-2010 population estimates.
# The column names are supplied by hand, and the opening rows and the footer
# are skipped, because the Excel sheet carries extra non-data rows.
pop_2000_2010_columns = [
    "COUNTY", "APR_2000", "JUL_2000",
    *[f"JUL_{year}" for year in range(2001, 2010)],
    "APR_2010", "JUL_2010",
]
pop_2000_2010 = pd.read_excel(
    "data/2000-2010_Population_Estimates_TX.xls",
    names=pop_2000_2010_columns,
    skiprows=[0, 1, 2, 3],
    skipfooter=8,
)

In [30]:
pop_2000_2010.head()

Unnamed: 0,COUNTY,APR_2000,JUL_2000,JUL_2001,JUL_2002,JUL_2003,JUL_2004,JUL_2005,JUL_2006,JUL_2007,JUL_2008,JUL_2009,APR_2010,JUL_2010
0,.Anderson County,55114,55062,54263,54740,56068,56245,56873,57386,57870,57963,58410,58458,58452
1,.Andrews County,13002,12949,12856,13022,12976,13006,13016,13195,13513,14099,14601,14786,14833
2,.Angelina County,80123,80270,80273,80803,81510,82070,82553,83810,84518,84961,86029,86771,86953
3,.Aransas County,22457,22452,22287,22616,22843,23067,23561,23395,23172,23225,23291,23158,23151
4,.Archer County,8904,8966,8849,8942,9013,9078,9068,9063,9026,9104,9023,9054,9060


In [31]:
pop_2000_2010.tail(5)

Unnamed: 0,COUNTY,APR_2000,JUL_2000,JUL_2001,JUL_2002,JUL_2003,JUL_2004,JUL_2005,JUL_2006,JUL_2007,JUL_2008,JUL_2009,APR_2010,JUL_2010
249,.Wood County,36729,36811,37288,37633,38915,39600,39917,41099,41414,41722,41870,41964,42019
250,.Yoakum County,7325,7274,7299,7212,7235,7362,7404,7404,7588,7765,7908,7879,7865
251,.Young County,17872,17846,17659,17665,17892,17921,17801,18227,18135,18104,18466,18550,18559
252,.Zapata County,12081,12088,12266,12514,12584,12727,13043,13069,13388,13640,13876,14018,14070
253,.Zavala County,11645,11636,11596,11616,11513,11512,11565,11642,11657,11725,11544,11677,11724


In [32]:
pop_2000_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 14 columns):
COUNTY      254 non-null object
APR_2000    254 non-null int64
JUL_2000    254 non-null int64
JUL_2001    254 non-null int64
JUL_2002    254 non-null int64
JUL_2003    254 non-null int64
JUL_2004    254 non-null int64
JUL_2005    254 non-null int64
JUL_2006    254 non-null int64
JUL_2007    254 non-null int64
JUL_2008    254 non-null int64
JUL_2009    254 non-null int64
APR_2010    254 non-null int64
JUL_2010    254 non-null int64
dtypes: int64(13), object(1)
memory usage: 27.9+ KB


In [33]:
# Every county name starts with a period.
# We want the format "ANDERSON", not ".Anderson County": strip the dots,
# keep only the text before " County", then uppercase what is left.
pop_2000_2010["COUNTY"] = (
    pop_2000_2010["COUNTY"]
    .str.strip('.')
    .str.split(' County').str[0]
    .str.upper()
)

In [34]:
# Dropping data from before my dataset, because I won't need it
# Also dropping 2010, since I'll use the updated and hopefully more accurate
# 2010 estimates from the more recent database
pop_2006_2009 = pop_2000_2010[[
    "COUNTY", "JUL_2006", "JUL_2007", "JUL_2008", "JUL_2009"]]

In [35]:
# Much better
pop_2006_2009.head()

Unnamed: 0,COUNTY,JUL_2006,JUL_2007,JUL_2008,JUL_2009
0,ANDERSON,57386,57870,57963,58410
1,ANDREWS,13195,13513,14099,14601
2,ANGELINA,83810,84518,84961,86029
3,ARANSAS,23395,23172,23225,23291
4,ARCHER,9063,9026,9104,9023


In [36]:
# Now loading the data for 2010-2018 population estimates, again with
# hand-picked column names
pop_2010_2018_columns = [
    "EXT_ID", "ID", "COUNTY", "APR_2010_CEN", "APR_2010_BASE",
    *[f"JUL_{year}" for year in range(2010, 2019)],
]
pop_2010_2018 = pd.read_csv(
    "data/2010-2018_Population_Estimates_TX.csv",
    header=1,
    names=pop_2010_2018_columns,
)

In [37]:
pop_2010_2018.head()

Unnamed: 0,EXT_ID,ID,COUNTY,APR_2010_CEN,APR_2010_BASE,JUL_2010,JUL_2011,JUL_2012,JUL_2013,JUL_2014,JUL_2015,JUL_2016,JUL_2017,JUL_2018
0,0500000US48001,48001,"Anderson County, Texas",58458,58459,58497,58394,58065,57977,57849,57646,57550,58212,58057
1,0500000US48003,48003,"Andrews County, Texas",14786,14786,14849,15388,16113,16788,17445,18083,17805,17631,18128
2,0500000US48005,48005,"Angelina County, Texas",86771,86771,86905,87295,87520,87333,87599,87874,87759,87711,87092
3,0500000US48007,48007,"Aransas County, Texas",23158,23158,23182,23214,23457,23890,24570,24815,25191,25447,23792
4,0500000US48009,48009,"Archer County, Texas",9054,9055,9112,8834,8809,8795,8837,8758,8780,8786,8786


In [38]:
# Again, reduce each county to just its uppercase name,
# e.g. "Anderson County, Texas" -> "ANDERSON": keep the text before
# " County", then uppercase it
pop_2010_2018["COUNTY"] = (
    pop_2010_2018["COUNTY"]
    .str.split(' County').str[0]
    .str.upper()
)

pop_2010_2018.head()

Unnamed: 0,EXT_ID,ID,COUNTY,APR_2010_CEN,APR_2010_BASE,JUL_2010,JUL_2011,JUL_2012,JUL_2013,JUL_2014,JUL_2015,JUL_2016,JUL_2017,JUL_2018
0,0500000US48001,48001,ANDERSON,58458,58459,58497,58394,58065,57977,57849,57646,57550,58212,58057
1,0500000US48003,48003,ANDREWS,14786,14786,14849,15388,16113,16788,17445,18083,17805,17631,18128
2,0500000US48005,48005,ANGELINA,86771,86771,86905,87295,87520,87333,87599,87874,87759,87711,87092
3,0500000US48007,48007,ARANSAS,23158,23158,23182,23214,23457,23890,24570,24815,25191,25447,23792
4,0500000US48009,48009,ARCHER,9054,9055,9112,8834,8809,8795,8837,8758,8780,8786,8786


In [39]:
pop_2010_2012 = pop_2010_2018[["COUNTY", "JUL_2010", "JUL_2011", "JUL_2012"]]

In [40]:
pop_2010_2012.head()

Unnamed: 0,COUNTY,JUL_2010,JUL_2011,JUL_2012
0,ANDERSON,58497,58394,58065
1,ANDREWS,14849,15388,16113
2,ANGELINA,86905,87295,87520
3,ARANSAS,23182,23214,23457
4,ARCHER,9112,8834,8809


In [41]:
# And now, a dataset of all the relevant population data!
# The two tables are joined on county name. Every remaining column is a July
# estimate, so each one is renamed to its plain integer year for ease of use.
july_to_year = {f"JUL_{year}": year for year in range(2006, 2013)}
pop_data = (
    pop_2006_2009
    .merge(pop_2010_2012, on="COUNTY")
    .rename(columns=july_to_year)
)
pop_data.head()

Unnamed: 0,COUNTY,2006,2007,2008,2009,2010,2011,2012
0,ANDERSON,57386,57870,57963,58410,58497,58394,58065
1,ANDREWS,13195,13513,14099,14601,14849,15388,16113
2,ANGELINA,83810,84518,84961,86029,86905,87295,87520
3,ARANSAS,23395,23172,23225,23291,23182,23214,23457
4,ARCHER,9063,9026,9104,9023,9112,8834,8809


#### Combining to arrive at an opioid deaths per capita figure, then deaths per 100k of population

In [42]:
dshs_for_csv.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [43]:
# Some quick cleaning, done as one chain that leaves dshs_for_csv untouched:
# drop the columns we don't want, rename the rest, uppercase the county
# names, and make sure the death counts are floats
dshs_clean = (
    dshs_for_csv
    .drop(columns=["Type of Death1", "Latitude (generated)",
                   "Longitude (generated)"])
    .rename(columns={"County Name": "COUNTY",
                     "Number of Deaths": "NUM_DEATHS",
                     "Year (copy)": "YEAR"})
    .assign(
        COUNTY=lambda frame: frame["COUNTY"].str.upper(),
        NUM_DEATHS=lambda frame: frame["NUM_DEATHS"].astype("float"),
    )
)

dshs_clean.head()

Unnamed: 0,COUNTY,NUM_DEATHS,YEAR
0,ZAVALA,0.0,2006
1,ZAPATA,0.0,2006
2,YOUNG,2.5,2006
3,YOAKUM,0.0,2006
4,WOOD,2.5,2006


In [142]:
# Pivot so that each county is a row and each year is a column
dshs_pivot = dshs_clean.pivot_table(
    index="COUNTY",
    columns="YEAR",
    values="NUM_DEATHS",
)

In [143]:
# Removing a weird index name, leftover from the pivot
dshs_pivot.rename_axis(None, axis=1, inplace=True)

In [144]:
# Hooray, now it looks just like our population data table
dshs_pivot.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,2.5,2.5,2.5,2.5,2.5,2.5,2.5
ANDREWS,0.0,0.0,0.0,2.5,0.0,0.0,0.0
ANGELINA,2.5,2.5,2.5,0.0,2.5,2.5,2.5
ARANSAS,2.5,2.5,2.5,2.5,2.5,2.5,2.5
ARCHER,0.0,0.0,2.5,0.0,0.0,2.5,0.0


In [47]:
# Index the population table by county so it lines up with the deaths pivot.
# The guard makes the cell safe to re-run: once COUNTY is already the index
# it is no longer a column, and set_index would raise a KeyError.
if "COUNTY" in pop_data.columns:
    pop_data = pop_data.set_index("COUNTY")
pop_data.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,57386,57870,57963,58410,58497,58394,58065
ANDREWS,13195,13513,14099,14601,14849,15388,16113
ANGELINA,83810,84518,84961,86029,86905,87295,87520
ARANSAS,23395,23172,23225,23291,23182,23214,23457
ARCHER,9063,9026,9104,9023,9112,8834,8809


In [48]:
pop_data.loc[pop_data.index == "TERRELL"]

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TERRELL,921,886,884,930,1011,950,921


In [49]:
dshs_pivot.loc[dshs_pivot.index == "TERRELL"]

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TERRELL,0.0,2.5,0.0,0.0,0.0,0.0,0.0


In [50]:
deaths_percapita = dshs_pivot / pop_data

In [51]:
deaths_percapita.describe()

Unnamed: 0,2006,2007,2008,2009,2010,2011,2012
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,3.9e-05,5e-05,3.9e-05,3.3e-05,3.5e-05,3.5e-05,3e-05
std,8.4e-05,0.000197,7.3e-05,7.4e-05,7.9e-05,6.3e-05,9.4e-05
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.9e-05,4.9e-05,5.2e-05,4.3e-05,4e-05,5e-05,3.1e-05
max,0.000849,0.002822,0.000476,0.000752,0.000615,0.000414,0.00128


In [52]:
pop_100k = pop_data / 100000

In [53]:
deaths_per100k = dshs_pivot / pop_100k

In [54]:
deaths_per100k.describe()

Unnamed: 0,2006,2007,2008,2009,2010,2011,2012
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,3.884246,4.954376,3.877724,3.318701,3.506903,3.528426,2.972931
std,8.368138,19.666148,7.340291,7.375099,7.930177,6.333992,9.354662
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.916587,4.854982,5.164023,4.266988,4.046553,4.984242,3.130589
max,84.860828,282.167043,47.600914,75.165364,61.546036,41.377027,128.008193


In [55]:
deaths_per100k[2007].sort_values(ascending=False).head()

COUNTY
TERRELL     282.167043
REAL         75.574365
HUDSPETH     72.212594
SUTTON       57.950858
GOLIAD       35.004201
Name: 2007, dtype: float64

In [56]:
wp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12108468 entries, 0 to 12108467
Data columns (total 42 columns):
REPORTER_DEA_NO          object
REPORTER_BUS_ACT         object
REPORTER_NAME            object
REPORTER_ADDL_CO_INFO    object
REPORTER_ADDRESS1        object
REPORTER_ADDRESS2        object
REPORTER_CITY            object
REPORTER_STATE           object
REPORTER_ZIP             int64
REPORTER_COUNTY          object
BUYER_DEA_NO             object
BUYER_BUS_ACT            object
BUYER_NAME               object
BUYER_ADDL_CO_INFO       object
BUYER_ADDRESS1           object
BUYER_ADDRESS2           object
BUYER_CITY               object
BUYER_STATE              object
BUYER_ZIP                int64
BUYER_COUNTY             object
TRANSACTION_CODE         object
DRUG_CODE                int64
NDC_NO                   object
DRUG_NAME                object
QUANTITY                 float64
UNIT                     float64
ACTION_INDICATOR         object
ORDER_FORM_NO         

In [57]:
pills = wp_data[["BUYER_COUNTY", "TRANSACTION_DATE", "DOSAGE_UNIT"]].copy()

In [58]:
pills.head()

Unnamed: 0,BUYER_COUNTY,TRANSACTION_DATE,DOSAGE_UNIT
0,GREGG,2006-01-11,500.0
1,JEFFERSON,2006-01-04,500.0
2,JEFFERSON,2006-11-02,400.0
3,JEFFERSON,2007-01-04,100.0
4,JEFFERSON,2007-02-14,100.0


In [59]:
pills["YEAR"] = pills["TRANSACTION_DATE"].dt.year

In [60]:
# The year has been extracted, so drop the full date, and rename the buyer
# county column to COUNTY
pills = (
    pills
    .drop(columns="TRANSACTION_DATE")
    .rename(columns={"BUYER_COUNTY": "COUNTY"})
)

In [61]:
pills.head()

Unnamed: 0,COUNTY,DOSAGE_UNIT,YEAR
0,GREGG,500.0,2006
1,JEFFERSON,500.0,2006
2,JEFFERSON,400.0,2006
3,JEFFERSON,100.0,2007
4,JEFFERSON,100.0,2007


In [62]:
# Alas, one specific county has a space in this dataframe, while all the
# other datasets I have spell it without a space - let's fix that.
# The result is assigned back to the column on purpose. Calling
# replace(..., inplace=True) on a column selection only modifies a temporary
# object under pandas Copy-on-Write and would leave pills untouched.
pills["COUNTY"] = pills["COUNTY"].replace(to_replace="DE WITT",
                                          value="DEWITT")

In [63]:
# Pivoting the table to make the columns each year
pills_pivot = pd.pivot_table(pills, index="COUNTY", columns="YEAR",
                             values="DOSAGE_UNIT", aggfunc="sum")

In [64]:
pills_pivot = pills_pivot.reindex_like(pop_data)

In [65]:
# Nulls mean no pills were sent to that county, so fill them with zero, and
# remove the leftover axis name the pivot put on the columns
pills_pivot = pills_pivot.fillna(0).rename_axis(None, axis=1)

In [66]:
pills_pivot.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,2209130.0,2148570.0,2296470.0,2348990.0,2445130.0,2740100.0,2672540.0
ANDREWS,246600.0,274080.0,320200.0,331510.0,367330.0,415720.0,481510.0
ANGELINA,3070975.0,3472800.0,4016760.0,4286080.0,4319439.0,4789710.0,4693270.0
ARANSAS,734500.0,948950.0,989600.0,1014920.0,1110790.0,1206540.0,1294570.0
ARCHER,0.0,200.0,0.0,0.0,0.0,0.0,100.0


In [67]:
pills_percapita = pills_pivot / pop_data

In [68]:
pills_percapita.describe()

Unnamed: 0,2006,2007,2008,2009,2010,2011,2012
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,21.064651,24.087389,25.850747,27.449645,29.093404,32.659495,33.197048
std,14.82111,16.798077,17.151721,18.069433,18.793225,20.035172,20.283625
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.408081,13.224395,14.64114,16.07243,16.933764,19.514753,20.747307
50%,19.323922,21.784128,24.089366,26.064345,27.147936,32.777558,33.328704
75%,29.753164,33.5909,36.777378,38.822188,40.756245,45.414605,46.297715
max,103.736457,119.518327,88.494255,86.246615,88.626591,98.326704,96.952074


In [69]:
pills_percapita.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,38.495975,37.127527,39.619585,40.215545,41.799238,46.924342,46.026694
ANDREWS,18.688897,20.282691,22.710831,22.704609,24.737693,27.015857,29.883324
ANGELINA,36.642107,41.089472,47.277692,49.821339,49.702998,54.868091,53.625114
ARANSAS,31.395597,40.952443,42.609257,43.57563,47.916056,51.97467,55.189069
ARCHER,0.0,0.022158,0.0,0.0,0.0,0.0,0.011352


In [70]:
pills_percapita.shape

(254, 7)

## Visualizing

In [71]:
# Need county identification numbers, which are Federal Information Processing
# Standard codes - which, luckily, were a part of one of the population csvs.
# Pair each county name with the ID on the same row.
county_id = dict(zip(pop_2010_2018["COUNTY"].values,
                     pop_2010_2018["ID"].values))

In [73]:
# Creating a new column for the County IDs.
# Each code is looked up by county name. Assigning county_id.values() by
# position is only correct if the dict's order happens to match the row order
# of pills_percapita, and it would mislabel counties silently otherwise.
# The county names are read from the COUNTY column if the index has already
# been reset, and from the index if it has not.
county_names = (pills_percapita["COUNTY"]
                if "COUNTY" in pills_percapita.columns
                else pills_percapita.index.to_series())
county_ids = county_names.map(county_id)
# Fail loudly if a county has no code, rather than plotting missing values
unmatched = county_names[county_ids.isna()]
assert unmatched.empty, f"No county ID found for: {sorted(unmatched)}"
pills_percapita["COUNTY_ID"] = county_ids.astype("int64").values

In [74]:
pills_percapita.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012,COUNTY_ID
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ANDERSON,38.495975,37.127527,39.619585,40.215545,41.799238,46.924342,46.026694,48001
ANDREWS,18.688897,20.282691,22.710831,22.704609,24.737693,27.015857,29.883324,48003
ANGELINA,36.642107,41.089472,47.277692,49.821339,49.702998,54.868091,53.625114,48005
ARANSAS,31.395597,40.952443,42.609257,43.57563,47.916056,51.97467,55.189069,48007
ARCHER,0.0,0.022158,0.0,0.0,0.0,0.0,0.011352,48009


In [75]:
pills_percapita.describe()

Unnamed: 0,2006,2007,2008,2009,2010,2011,2012,COUNTY_ID
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,21.064651,24.087389,25.850747,27.449645,29.093404,32.659495,33.197048,48254.0
std,14.82111,16.798077,17.151721,18.069433,18.793225,20.035172,20.283625,146.93536
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48001.0
25%,11.408081,13.224395,14.64114,16.07243,16.933764,19.514753,20.747307,48127.5
50%,19.323922,21.784128,24.089366,26.064345,27.147936,32.777558,33.328704,48254.0
75%,29.753164,33.5909,36.777378,38.822188,40.756245,45.414605,46.297715,48380.5
max,103.736457,119.518327,88.494255,86.246615,88.626591,98.326704,96.952074,48507.0


In [76]:
# Move COUNTY out of the index and back into an ordinary column.
# The guard makes the cell safe to re-run: resetting a second time would add
# a stray "index" column.
if "COUNTY" not in pills_percapita.columns:
    pills_percapita = pills_percapita.reset_index()

In [77]:
pills_percapita.head()

Unnamed: 0,COUNTY,2006,2007,2008,2009,2010,2011,2012,COUNTY_ID
0,ANDERSON,38.495975,37.127527,39.619585,40.215545,41.799238,46.924342,46.026694,48001
1,ANDREWS,18.688897,20.282691,22.710831,22.704609,24.737693,27.015857,29.883324,48003
2,ANGELINA,36.642107,41.089472,47.277692,49.821339,49.702998,54.868091,53.625114,48005
3,ARANSAS,31.395597,40.952443,42.609257,43.57563,47.916056,51.97467,55.189069,48007
4,ARCHER,0.0,0.022158,0.0,0.0,0.0,0.0,0.011352,48009


In [None]:
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [None]:
# Choropleth of 2012 pills per capita for Texas counties, keyed by COUNTY_ID.
choropleth_2012_options = {
    "fips": pills_percapita["COUNTY_ID"],
    "values": pills_percapita[2012],
    "scope": ["TX"],
    "binning_endpoints": [1, 25, 50, 75],
    "county_outline": {'color': 'rgb(255,255,255)', 'width': 0.5},
    "show_hover": True,
    "legend_title": "Number of Pills Per Capita",
}
fig2012 = ff.create_choropleth(**choropleth_2012_options)
# Clear the layout template before rendering
fig2012.layout.template = None
fig2012.show()

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [156]:
pills_percapita.head()

Unnamed: 0,COUNTY,2006,2007,2008,2009,2010,2011,2012,COUNTY_ID
0,ANDERSON,38.495975,37.127527,39.619585,40.215545,41.799238,46.924342,46.026694,48001
1,ANDREWS,18.688897,20.282691,22.710831,22.704609,24.737693,27.015857,29.883324,48003
2,ANGELINA,36.642107,41.089472,47.277692,49.821339,49.702998,54.868091,53.625114,48005
3,ARANSAS,31.395597,40.952443,42.609257,43.57563,47.916056,51.97467,55.189069,48007
4,ARCHER,0.0,0.022158,0.0,0.0,0.0,0.0,0.011352,48009


In [None]:
# Year columns of the wide per-capita table, in column order.
# Derived from pills_percapita itself instead of pills_pc_melted, which is only
# created in a later cell and therefore does not exist yet on a top-to-bottom run.
years = [col for col in pills_percapita.columns
         if col not in ("COUNTY", "COUNTY_ID")]

In [None]:
@interact
def color_counties(year=years):
    """Render the Texas county choropleth of pills per capita for one year.

    Parameters
    ----------
    year
        Column label of ``pills_percapita`` to plot. The default is the
        ``years`` list, which ``interact`` uses to build the selection widget.

    The figure is displayed as a side effect; nothing is returned.
    """
    outline_style = {'color': 'rgb(255,255,255)', 'width': 0.5}
    choropleth_options = dict(
        fips=pills_percapita["COUNTY_ID"],
        values=pills_percapita[year],
        scope=["TX"],
        binning_endpoints=[1, 25, 50, 75],
        county_outline=outline_style,
        show_hover=True,
        legend_title="Number of Pills Per Capita",
    )
    county_map = ff.create_choropleth(**choropleth_options)
    # Clear the layout template before rendering
    county_map.layout.template = None
    county_map.show()

In [None]:
# IPython help lookup: shows the docstring of ff.create_choropleth.
# NOTE(review): IPython-only syntax left over from development - consider removing.
ff.create_choropleth?

In [None]:
pills_percapita.head()

In [None]:
year_slider = []

In [None]:
# Build one dict of choropleth settings per year (2006-2012).
# The list is (re)initialised here so that re-running this cell does not
# append a second copy of every year to an already populated list.
year_slider = []
for year in range(2006, 2013):
    # Per-year slice; currently only needed by the disabled hover-text code below
    df_year = pills_percapita[["COUNTY", year, "COUNTY_ID"]]
#     df_year["TEXT"] = "County: " + df_year["COUNTY"] + \
#         " Pills Per Capita: " + df_year[year].round(2).astype(str)
    data_year = dict(fips=pills_percapita["COUNTY_ID"],
                     values=pills_percapita[year],
                     scope=["TX"],
                     binning_endpoints=[1, 25, 50, 75],
                     county_outline={'color': 'rgb(255,255,255)',
                                     'width': 0.5},
                     show_hover=True,
                     legend_title="Number of Pills Per Capita")
    year_slider.append(data_year)

In [None]:
# One slider step per entry in year_slider: each step marks only its own
# entry as visible and is labelled with the matching year (starting at 2006).
steps = []

frame_count = len(year_slider)
for i in range(frame_count):
    visibility = [position == i for position in range(frame_count)]
    step = {
        'method': 'restyle',
        'args': ['visible', visibility],
        'label': 'Year {}'.format(i + 2006),
    }
    steps.append(step)

In [None]:
# Single slider definition: first step active, top padding of 1, using the steps built above
slider_spec = {"active": 0, "pad": {"t": 1}, "steps": steps}
sliders = [slider_spec]

In [None]:
# Figure specification as plain dicts: per-year entries as data, plus a
# layout carrying the geo scope and the year slider.
layout = {
    "geo": {"scope": 'usa'},
    "sliders": sliders,
}
fig = {"data": year_slider, "layout": layout}

In [None]:
import plotly
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [None]:
import plotly.express as px
# Load the gapminder sample dataset that ships with plotly.express.
# NOTE(review): `gapminder` is not used in any of the surrounding cells - confirm it is still needed.
gapminder = px.data.gapminder()

## Creating a Better Dataframe to Export

In [107]:
# Move COUNTY from the index into a regular column.
# Guarded so that re-running this cell is a no-op: a second in-place reset
# would add a stray "index" column that the melt below would treat as a year.
if "COUNTY" not in pills_pivot.columns:
    pills_pivot.reset_index(inplace=True)

In [110]:
pills_pivot.head()

Unnamed: 0,COUNTY,2006,2007,2008,2009,2010,2011,2012
0,ANDERSON,2209130.0,2148570.0,2296470.0,2348990.0,2445130.0,2740100.0,2672540.0
1,ANDREWS,246600.0,274080.0,320200.0,331510.0,367330.0,415720.0,481510.0
2,ANGELINA,3070975.0,3472800.0,4016760.0,4286080.0,4319439.0,4789710.0,4693270.0
3,ARANSAS,734500.0,948950.0,989600.0,1014920.0,1110790.0,1206540.0,1294570.0
4,ARCHER,0.0,200.0,0.0,0.0,0.0,0.0,100.0


In [113]:
# Reshape the wide county-by-year pill totals into long form:
# one row per (COUNTY, YEAR) with the value in TOTAL_PILLS.
pills_total_melted = pills_pivot.melt(
    id_vars=["COUNTY"],
    var_name="YEAR",
    value_name="TOTAL_PILLS",
)

In [114]:
pills_total_melted.head()

Unnamed: 0,COUNTY,YEAR,TOTAL_PILLS
0,ANDERSON,2006,2209130.0
1,ANDREWS,2006,246600.0
2,ANGELINA,2006,3070975.0
3,ARANSAS,2006,734500.0
4,ARCHER,2006,0.0


In [117]:
pop_data.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,57386,57870,57963,58410,58497,58394,58065
ANDREWS,13195,13513,14099,14601,14849,15388,16113
ANGELINA,83810,84518,84961,86029,86905,87295,87520
ARANSAS,23395,23172,23225,23291,23182,23214,23457
ARCHER,9063,9026,9104,9023,9112,8834,8809


In [118]:
# Move COUNTY from the index into a regular column, then reshape the wide
# county-by-year population table into long form (COUNTY, YEAR, TOTAL_POPULATION).
# The reset is guarded so that re-running this cell does not add a stray
# "index" column, which the melt would otherwise turn into bogus YEAR rows.
if "COUNTY" not in pop_data.columns:
    pop_data.reset_index(inplace=True)
pop_melted = pd.melt(pop_data, id_vars=["COUNTY"],
                     var_name="YEAR", value_name="TOTAL_POPULATION")

In [119]:
pop_melted.head()

Unnamed: 0,COUNTY,YEAR,TOTAL_POPULATION
0,ANDERSON,2006,57386
1,ANDREWS,2006,13195
2,ANGELINA,2006,83810
3,ARANSAS,2006,23395
4,ARCHER,2006,9063


In [122]:
pills_percapita.head()

Unnamed: 0,COUNTY,2006,2007,2008,2009,2010,2011,2012,COUNTY_ID
0,ANDERSON,38.495975,37.127527,39.619585,40.215545,41.799238,46.924342,46.026694,48001
1,ANDREWS,18.688897,20.282691,22.710831,22.704609,24.737693,27.015857,29.883324,48003
2,ANGELINA,36.642107,41.089472,47.277692,49.821339,49.702998,54.868091,53.625114,48005
3,ARANSAS,31.395597,40.952443,42.609257,43.57563,47.916056,51.97467,55.189069,48007
4,ARCHER,0.0,0.022158,0.0,0.0,0.0,0.0,0.011352,48009


In [123]:
# Reshape the wide per-capita table into long form, keeping COUNTY and
# COUNTY_ID as identifiers: one row per county and year, value in PILLS_PC.
pills_pc_melted = pills_percapita.melt(
    id_vars=["COUNTY", "COUNTY_ID"],
    var_name="YEAR",
    value_name="PILLS_PC",
)

In [124]:
pills_pc_melted.head()

Unnamed: 0,COUNTY,COUNTY_ID,YEAR,PILLS_PC
0,ANDERSON,48001,2006,38.495975
1,ANDREWS,48003,2006,18.688897
2,ANGELINA,48005,2006,36.642107
3,ARANSAS,48007,2006,31.395597
4,ARCHER,48009,2006,0.0


In [145]:
dshs_pivot.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,2.5,2.5,2.5,2.5,2.5,2.5,2.5
ANDREWS,0.0,0.0,0.0,2.5,0.0,0.0,0.0
ANGELINA,2.5,2.5,2.5,0.0,2.5,2.5,2.5
ARANSAS,2.5,2.5,2.5,2.5,2.5,2.5,2.5
ARCHER,0.0,0.0,2.5,0.0,0.0,2.5,0.0


In [146]:
# Move COUNTY from the index into a regular column, then reshape the wide
# county-by-year deaths table into long form (COUNTY, YEAR, TOTAL_DEATHS).
# The reset is guarded so that re-running this cell does not add a stray
# "index" column, which the melt would otherwise turn into bogus YEAR rows.
if "COUNTY" not in dshs_pivot.columns:
    dshs_pivot.reset_index(inplace=True)
deaths_melted = pd.melt(dshs_pivot, id_vars="COUNTY",
                        var_name="YEAR", value_name="TOTAL_DEATHS")

In [148]:
deaths_melted.head()

Unnamed: 0,COUNTY,YEAR,TOTAL_DEATHS
0,ANDERSON,2006,2.5
1,ANDREWS,2006,0.0
2,ANGELINA,2006,2.5
3,ARANSAS,2006,2.5
4,ARCHER,2006,0.0


In [88]:
# Move COUNTY from the index into a regular column.
# Guarded so that re-running this cell is a no-op: a second in-place reset
# would add a stray "index" column that the melt below would treat as a year.
if "COUNTY" not in deaths_percapita.columns:
    deaths_percapita.reset_index(inplace=True)

In [89]:
deaths_percapita.head()

Unnamed: 0,COUNTY,2006,2007,2008,2009,2010,2011,2012
0,ANDERSON,4.4e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05
1,ANDREWS,0.0,0.0,0.0,0.000171,0.0,0.0,0.0
2,ANGELINA,3e-05,3e-05,2.9e-05,0.0,2.9e-05,2.9e-05,2.9e-05
3,ARANSAS,0.000107,0.000108,0.000108,0.000107,0.000108,0.000108,0.000107
4,ARCHER,0.0,0.0,0.000275,0.0,0.0,0.000283,0.0


In [90]:
# Reshape the wide deaths-per-capita table into long form:
# one row per (COUNTY, YEAR) with the value in DEATHS_PC.
deaths_pc_melted = deaths_percapita.melt(
    id_vars=["COUNTY"],
    var_name="YEAR",
    value_name="DEATHS_PC",
)

In [92]:
deaths_pc_melted.head()

Unnamed: 0,COUNTY,YEAR,DEATHS_PC
0,ANDERSON,2006,4.4e-05
1,ANDREWS,2006,0.0
2,ANGELINA,2006,3e-05
3,ARANSAS,2006,0.000107
4,ARCHER,2006,0.0


In [127]:
deaths_per100k.head()

Unnamed: 0_level_0,2006,2007,2008,2009,2010,2011,2012
COUNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ANDERSON,4.356463,4.320028,4.313096,4.280089,4.273723,4.281262,4.30552
ANDREWS,0.0,0.0,0.0,17.122115,0.0,0.0,0.0
ANGELINA,2.982938,2.95795,2.942527,0.0,2.876704,2.863852,2.85649
ARANSAS,10.686044,10.788883,10.764263,10.73376,10.784229,10.769363,10.657799
ARCHER,0.0,0.0,27.460457,0.0,0.0,28.299751,0.0


In [128]:
# Move COUNTY from the index into a regular column, then reshape the wide
# deaths-per-100k table into long form (COUNTY, YEAR, DEATHS_PER_100K).
# The reset is guarded so that re-running this cell does not add a stray
# "index" column, which the melt would otherwise turn into bogus YEAR rows.
if "COUNTY" not in deaths_per100k.columns:
    deaths_per100k.reset_index(inplace=True)
deaths_p100k_melted = pd.melt(deaths_per100k, id_vars=["COUNTY"],
                              var_name="YEAR", value_name="DEATHS_PER_100K")

In [129]:
deaths_p100k_melted.head()

Unnamed: 0,COUNTY,YEAR,DEATHS_PER_100K
0,ANDERSON,2006,4.356463
1,ANDREWS,2006,0.0
2,ANGELINA,2006,2.982938
3,ARANSAS,2006,10.686044
4,ARCHER,2006,0.0


In [121]:
pop_melted.shape

(1778, 3)

In [126]:
pills_total_melted.shape

(1778, 3)

In [93]:
pills_pc_melted.shape

(1778, 4)

In [149]:
deaths_melted.shape

(1778, 3)

In [94]:
deaths_pc_melted.shape

(1778, 3)

In [130]:
deaths_p100k_melted.shape

(1778, 3)

In [150]:
# Seed the export table with an independent copy of the identifying columns
# taken from the long per-capita frame.
key_columns = ["COUNTY", "COUNTY_ID", "YEAR"]
county_data = pills_pc_melted.loc[:, key_columns].copy()
county_data.head()

Unnamed: 0,COUNTY,COUNTY_ID,YEAR
0,ANDERSON,48001,2006
1,ANDREWS,48003,2006
2,ANGELINA,48005,2006
3,ARANSAS,48007,2006
4,ARCHER,48009,2006


In [151]:
# Attach every measure to the export table.
# Column assignment aligns on the row index only, so before copying a column
# verify that its source frame lists the same (COUNTY, YEAR) pair in every row;
# otherwise values could be silently attached to the wrong county or year.
measure_sources = [
    # (column in county_data, source frame, column in source frame)
    ("TOTAL_POPULATION", pop_melted, "TOTAL_POPULATION"),
    ("TOTAL_PILLS", pills_total_melted, "TOTAL_PILLS"),
    ("PILLS_PER_CAPITA", pills_pc_melted, "PILLS_PC"),
    ("TOTAL_OVERDOSE_DEATHS", deaths_melted, "TOTAL_DEATHS"),
    ("DEATHS_PER_CAPITA", deaths_pc_melted, "DEATHS_PC"),
    ("DEATHS_PER_100K_PEOPLE", deaths_p100k_melted, "DEATHS_PER_100K"),
]
# YEAR is compared as text so that int and str year labels are treated alike
expected_keys = list(zip(county_data["COUNTY"],
                         county_data["YEAR"].astype(str)))
for target_col, source_frame, source_col in measure_sources:
    source_keys = list(zip(source_frame["COUNTY"],
                           source_frame["YEAR"].astype(str)))
    rows_line_up = (source_frame.index.equals(county_data.index)
                    and source_keys == expected_keys)
    if not rows_line_up:
        raise ValueError(
            "Rows of the source for {} do not line up with county_data "
            "on (COUNTY, YEAR)".format(target_col))
    county_data[target_col] = source_frame[source_col]

In [152]:
county_data.head()

Unnamed: 0,COUNTY,COUNTY_ID,YEAR,TOTAL_POPULATION,TOTAL_PILLS,PILLS_PER_CAPITA,TOTAL_OVERDOSE_DEATHS,DEATHS_PER_CAPITA,DEATHS_PER_100K_PEOPLE
0,ANDERSON,48001,2006,57386,2209130.0,38.495975,2.5,4.4e-05,4.356463
1,ANDREWS,48003,2006,13195,246600.0,18.688897,0.0,0.0,0.0
2,ANGELINA,48005,2006,83810,3070975.0,36.642107,2.5,3e-05,2.982938
3,ARANSAS,48007,2006,23395,734500.0,31.395597,2.5,0.000107,10.686044
4,ARCHER,48009,2006,9063,0.0,0.0,0.0,0.0,0.0


In [153]:
# Writing to a csv
# county_data.to_csv(r"data/TX_County_By_Year.csv", index=False)

In [154]:
# Sanity check: read the exported CSV back in so it can be inspected below.
# NOTE(review): the to_csv call in the previous cell is commented out, so this
# relies on data/TX_County_By_Year.csv already existing on disk.
county_test = pd.read_csv("data/TX_County_By_Year.csv")

In [155]:
county_test.head()

Unnamed: 0,COUNTY,COUNTY_ID,YEAR,TOTAL_POPULATION,TOTAL_PILLS,PILLS_PER_CAPITA,TOTAL_OVERDOSE_DEATHS,DEATHS_PER_CAPITA,DEATHS_PER_100K_PEOPLE
0,ANDERSON,48001,2006,57386,2209130.0,38.495975,2.5,4.4e-05,4.356463
1,ANDREWS,48003,2006,13195,246600.0,18.688897,0.0,0.0,0.0
2,ANGELINA,48005,2006,83810,3070975.0,36.642107,2.5,3e-05,2.982938
3,ARANSAS,48007,2006,23395,734500.0,31.395597,2.5,0.000107,10.686044
4,ARCHER,48009,2006,9063,0.0,0.0,0.0,0.0,0.0
