In [1]:
# Imports
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-darkgrid')

In [2]:
# First, the Washington Post data
# Specifying some dtypes as 'objects' for faster input
dtypes = {
    "BUYER_COUNTY": "object",
    "DOSAGE_UNIT": "float64",
    "TRANSACTION_DATE" : "object"
}
# Reading in columns for modeling
wp_data = pd.read_csv("data/arcos-tx-statewide-itemized.tsv",
                   sep='\t',
                   usecols=["BUYER_COUNTY", "DOSAGE_UNIT", "TRANSACTION_DATE"],
                   dtype=dtypes)

In [3]:
# Transforming the WP data's date into datetime
# Need to add leading zeros for months with 1 digit, so each date has 8 digits
wp_data["TRANSACTION_DATE"] = wp_data["TRANSACTION_DATE"].str.zfill(8)
# And then actually transforming the data type
wp_data["TRANSACTION_DATE"] = pd.to_datetime(wp_data["TRANSACTION_DATE"],
                                            format='%m%d%Y')

In [4]:
# Let's also make a column for the years of each transaction
wp_data["YEAR"] = wp_data["TRANSACTION_DATE"].dt.year

In [5]:
# Now we don't need the transaction date, and it's going to get confusing 
# with the aggregation, so let's drop that column
wp_data.drop(columns="TRANSACTION_DATE", inplace=True)

In [6]:
# Renaming the columns to be a bit simpler
wp_data.rename(columns={"BUYER_COUNTY": "COUNTY"},
              inplace=True)

In [7]:
wp_data.head(10)

Unnamed: 0,COUNTY,DOSAGE_UNIT,YEAR
0,GREGG,500.0,2006
1,JEFFERSON,500.0,2006
2,JEFFERSON,400.0,2006
3,JEFFERSON,100.0,2007
4,JEFFERSON,100.0,2007
5,JEFFERSON,300.0,2007
6,EL PASO,500.0,2007
7,HARRIS,3000.0,2007
8,HARRIS,1500.0,2006
9,HARRIS,500.0,2006


In [8]:
wp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12108468 entries, 0 to 12108467
Data columns (total 3 columns):
COUNTY         object
DOSAGE_UNIT    float64
YEAR           int64
dtypes: float64(1), int64(1), object(1)
memory usage: 277.1+ MB


In [9]:
# For reasons I cannot fathom, extracting the year doesn't keep it as a 
# datetime object, which I need for the index
wp_data["YEAR"] = pd.to_datetime(wp_data["YEAR"], format='%Y')

In [10]:
# Probably not going to use this, but woo pivot tables
wp_pivot = pd.pivot_table(wp_data, index="COUNTY",
                          columns="YEAR",
                          values="DOSAGE_UNIT", aggfunc=[np.sum])

In [11]:
wp_pivot.head()

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,sum
YEAR,2006-01-01,2007-01-01,2008-01-01,2009-01-01,2010-01-01,2011-01-01,2012-01-01
COUNTY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ANDERSON,2209130.0,2148570.0,2296470.0,2348990.0,2445130.0,2740100.0,2672540.0
ANDREWS,246600.0,274080.0,320200.0,331510.0,367330.0,415720.0,481510.0
ANGELINA,3070975.0,3472800.0,4016760.0,4286080.0,4319439.0,4789710.0,4693270.0
ARANSAS,734500.0,948950.0,989600.0,1014920.0,1110790.0,1206540.0,1294570.0
ARCHER,,200.0,,,,,100.0


In [12]:
# Creating an aggregated multi-index dataframe
wp_agg = wp_data.groupby(["YEAR", "COUNTY"]).agg(
    {"DOSAGE_UNIT": "sum"})

In [13]:
wp_agg.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,DOSAGE_UNIT
YEAR,COUNTY,Unnamed: 2_level_1
2006-01-01,ANDERSON,2209130.0
2006-01-01,ANDREWS,246600.0
2006-01-01,ANGELINA,3070975.0
2006-01-01,ARANSAS,734500.0
2006-01-01,ARMSTRONG,14500.0
2006-01-01,ATASCOSA,1139180.0
2006-01-01,AUSTIN,503500.0
2006-01-01,BAILEY,111080.0
2006-01-01,BANDERA,250070.0
2006-01-01,BASTROP,1402040.0


In [32]:
# Now, the TX DSHS data
dshs_data = pd.read_csv("data/TXDSHS/MultiIndex_TXDeaths_by_County_Data.csv")

In [33]:
dshs_data.head()

Unnamed: 0,County Name,Number of Deaths,Type of Death1,Year (copy),Latitude (generated),Longitude (generated)
0,Zavala,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,28.866,-99.761
1,Zapata,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,26.971,-99.203
2,Young,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,33.175,-98.687
3,Yoakum,0.0,Accidental Poisoning Deaths where Opioids Were...,2006,33.173,-102.829
4,Wood,2.5,Accidental Poisoning Deaths where Opioids Were...,2006,32.783,-95.407


In [34]:
dshs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778 entries, 0 to 1777
Data columns (total 6 columns):
County Name              1778 non-null object
Number of Deaths         1778 non-null float64
Type of Death1           1778 non-null object
Year (copy)              1778 non-null int64
Latitude (generated)     1778 non-null float64
Longitude (generated)    1778 non-null float64
dtypes: float64(3), int64(1), object(2)
memory usage: 83.4+ KB


In [35]:
# Need to do a bit of clean up here
# Dropping columns we don't need
dshs_data.drop(columns=["Type of Death1", "Latitude (generated)",
                        "Longitude (generated)"], inplace=True)
# Making sure all of the county names in that column are uppercase
dshs_data["County Name"] = dshs_data["County Name"].str.upper()
# Renaming the columns to be the same as the WP data
dshs_data.rename(columns={"County Name": "COUNTY",
                          "Year (copy)": "YEAR"},
                 inplace=True)

In [36]:
dshs_data["YEAR"] = pd.to_datetime(dshs_data["YEAR"], format='%Y')

In [62]:
# Creating an aggregated multi-index dataframe, but now for DSHS data
dshs_agg = dshs_data.groupby(["YEAR", "COUNTY"]).agg(
    {"Number of Deaths": "sum"})
dshs_test = dshs_data.groupby(["YEAR"]).agg(
    {"COUNTY": "all","Number of Deaths": "sum"})

In [48]:
dshs_agg.shape

(1778, 1)

In [49]:
# And now to merge!
merged_data = dshs_agg.merge(wp_agg, how='left', 
                             left_index=True, right_index=True)

In [50]:
merged_data.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Deaths,DOSAGE_UNIT
YEAR,COUNTY,Unnamed: 2_level_1,Unnamed: 3_level_1
2006-01-01,ANDERSON,2.5,2209130.0
2006-01-01,ANDREWS,0.0,246600.0
2006-01-01,ANGELINA,2.5,3070975.0
2006-01-01,ARANSAS,2.5,734500.0
2006-01-01,ARCHER,0.0,
2006-01-01,ARMSTRONG,0.0,14500.0
2006-01-01,ATASCOSA,0.0,1139180.0
2006-01-01,AUSTIN,0.0,503500.0
2006-01-01,BAILEY,0.0,111080.0
2006-01-01,BANDERA,2.5,250070.0


In [51]:
# Now, to fill nulls in the dosage unit (where no pills were shipped)
merged_data["DOSAGE_UNIT"].fillna(value=0, inplace=True)

In [52]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1778 entries, (2006-01-01 00:00:00, ANDERSON) to (2012-01-01 00:00:00, ZAVALA)
Data columns (total 2 columns):
Number of Deaths    1778 non-null float64
DOSAGE_UNIT         1778 non-null float64
dtypes: float64(2)
memory usage: 115.1+ KB


### Investigating the Relationship

In [53]:
fig1 = px.scatter(merged_data, x="DOSAGE_UNIT", y="Number of Deaths")

fig1.show()

In [55]:
for year in merged_data.index:
    year_df = merged_data.loc[year]
    fig = px.scatter(year_df, x="DOSAGE_UNIT", y="Number of Deaths",
                     title = "{}".format(year))
    fig.show()

AttributeError: 'Series' object has no attribute 'columns'