## Merge dataframes

First, load in dataframes from pickles.

Target dataframe: od_mort_df
Regressors: census_df
Master list of counties: county_df; state_df (dataframe) and states (list) also get loaded

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
#import diagnostic_plots
import pickle
import time

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [2]:
# Restore the state table, the county table and the list of states; all three come out of a single pickle.
with open('statedf_countydf_states.pkl', 'rb') as pkl_in:
    state_df, county_df, states = pickle.load(pkl_in)

In [3]:
# Load the overdose mortality table, which serves as the target dataframe.
with open('od_mort_df.pkl', 'rb') as pkl_in:
    od_mort_df = pickle.load(pkl_in)

In [4]:
# Peek at the first rows of the raw overdose mortality table.
od_mort_df.head()

Unnamed: 0,county_name,overdose_deaths,population,crude_rate,age_adj_rate,INCITS
0,Autauga County AL (01001),Suppressed,55416,Suppressed,Suppressed,1001
1,Baldwin County AL (01003),34,208563,16.3,15.7,1003
2,Barbour County AL (01005),Suppressed,25965,Suppressed,Suppressed,1005
3,Bibb County AL (01007),Suppressed,22643,Suppressed,Suppressed,1007
4,Blount County AL (01009),15,57704,Unreliable,Unreliable,1009


In [5]:
# Keep only the counties whose death count was actually reported: rows flagged 'Suppressed' or
# 'Missing' are dropped. The explicit .copy() makes clean_od_mort_df an independent frame, so the
# column assignments in the following cells modify it directly instead of triggering
# SettingWithCopyWarning on a slice of od_mort_df.
clean_od_mort_df = od_mort_df[~od_mort_df['overdose_deaths'].isin(['Suppressed', 'Missing'])].copy()

In [6]:
# Rows and columns left after removing the 'Suppressed' and 'Missing' counties.
clean_od_mort_df.shape

(1000, 6)

In [7]:
# The columns are still strings because they originally contained words such as 'Suppressed'.
# With those rows filtered out, convert the death and population counts to numbers
# (any value that still fails to parse becomes NaN).
for count_col in ('overdose_deaths', 'population'):
    clean_od_mort_df[count_col] = pd.to_numeric(clean_od_mort_df[count_col], errors='coerce')

In [8]:
# Preview the cleaned table after the numeric conversion.
clean_od_mort_df.head()

Unnamed: 0,county_name,overdose_deaths,population,crude_rate,age_adj_rate,INCITS
1,Baldwin County AL (01003),34,208563,16.3,15.7,1003
4,Blount County AL (01009),15,57704,Unreliable,Unreliable,1009
7,Calhoun County AL (01015),16,114611,Unreliable,Unreliable,1015
21,Cullman County AL (01043),20,82471,24.3,26.4,1043
24,DeKalb County AL (01049),19,70900,Unreliable,Unreliable,1049


In [9]:
# Where the published crude rate is flagged 'Unreliable', substitute our own estimate:
# deaths per 100,000 residents. The assignment aligns on the index, so only flagged rows change.
unreliable_crude = clean_od_mort_df['crude_rate'] == 'Unreliable'
deaths_per_100k = clean_od_mort_df['overdose_deaths'] / clean_od_mort_df['population'] * 100000
clean_od_mort_df.loc[unreliable_crude, 'crude_rate'] = deaths_per_100k

In [10]:
# A stricter subset: additionally discard every county whose age-adjusted rate is flagged 'Unreliable'.
reliable_age_adj = clean_od_mort_df['age_adj_rate'].ne('Unreliable')
super_clean_od_mort_df = clean_od_mort_df[reliable_age_adj]

In [11]:
# Size of the subset that excludes 'Unreliable' age-adjusted rates.
super_clean_od_mort_df.shape

(611, 6)

In [None]:
# Save the cleaned mortality table to disk.
with open('clean_od_mort_df.pkl', 'wb') as pkl_out:
    pickle.dump(clean_od_mort_df, pkl_out)

Further analyses:
* Repeat analysis only on super_clean_od_mort_df, see if model changes.
* Impute suppressed values on od_mort_df. We have to pick a sensible way to do this: either set them all to 5, or make the number of deaths proportional to the population of the county.

## Merge census predictors onto the target to create dataframe

In [12]:
# Load the census table that supplies the regressors.
with open('census_master_df.pkl', 'rb') as pkl_in:
    census_df = pickle.load(pkl_in)

In [13]:
# Compare sizes before merging: census_df on the first line, clean_od_mort_df on the second.
print(census_df.shape, clean_od_mort_df.shape, sep='\n')

(3142, 7)
(1000, 6)


In [14]:
# Attach the census columns to each county row, matching on INCITS.
# This is an inner join (the merge default), so counties absent from census_df would be dropped.
result = clean_od_mort_df.merge(census_df, on='INCITS')

In [15]:
# Check the merged shape; compare the row count with clean_od_mort_df to spot dropped or duplicated counties.
result.shape

(1000, 12)

## Merge opioid prescribing map predictors onto dataframe

In [16]:
# Load the opioid prescribing table.
with open('opioid_rx_df.pkl', 'rb') as pkl_in:
    opioid_rx_df = pickle.load(pkl_in)

In [17]:
# Add the opioid prescribing columns, again matching on INCITS (inner join).
result = result.merge(opioid_rx_df, on='INCITS')

In [18]:
# Preview the frame with the prescribing columns attached.
result.head()

Unnamed: 0,county_name,overdose_deaths,population,crude_rate,age_adj_rate,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty,state,opioid_rx_rate_2016,opioid_rx_rate_2015,opioid_rx_rate_2014
0,Baldwin County AL (01003),34,208563,16.3,15.7,1003,42.3,86.4,90.0,51365,3.7,9.3,AL,123.8,132.1,143.5
1,Blount County AL (01009),15,57704,25.9947,Unreliable,1009,40.8,95.4,80.0,46212,3.0,12.2,AL,56.9,57.9,63.2
2,Calhoun County AL (01015),16,114611,13.9603,Unreliable,1015,39.1,74.9,82.3,41954,6.7,15.8,AL,161.0,165.4,180.0
3,Cullman County AL (01043),20,82471,24.3,26.4,1043,40.5,95.9,82.2,39297,3.2,12.0,AL,166.2,172.0,182.6
4,DeKalb County AL (01049),19,70900,26.7983,Unreliable,1049,39.3,87.2,72.6,38248,3.4,14.3,AL,114.4,117.6,122.7


In [19]:
# Check the shape again after the prescribing merge.
result.shape

(1000, 16)

## Merge PMP age per state onto dataframe

In [20]:
# Load the per-state PMP age table.
with open('pmp_age.pkl', 'rb') as pkl_in:
    pmp_age_df = pickle.load(pkl_in)

In [21]:
# Rename the 'index' column to 'name' so it can be matched against state_df's 'name' column.
pmp_age_df = pmp_age_df.rename(columns={'index': 'name'})
pmp_age_df.head()

Unnamed: 0,name,year,pmp_age_in_2016
0,Alabama,2006,10
1,Idaho,1967,49
2,Missouri,2017,0
3,Pennsylvania,1973,43
4,Alaska,2011,5


In [22]:
# result and pmp_age_df need a common 'state_code' column before they can be merged.
# Start with pmp_age_df: look up each state's code by its name in state_df.
pmp_age_df = pmp_age_df.merge(state_df[['name', 'state_code']], on='name')

In [23]:
# Take the first two characters of each county's INCITS code as its state code.
# NOTE(review): this only yields the state prefix if INCITS is a zero-padded string (e.g. '01003') - confirm.
incit = result['INCITS']
result['state_code'] = [county_code[:2] for county_code in incit]

In [24]:
# Check the newly added state_code column.
result.head()

Unnamed: 0,county_name,overdose_deaths,population,crude_rate,age_adj_rate,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty,state,opioid_rx_rate_2016,opioid_rx_rate_2015,opioid_rx_rate_2014,state_code
0,Baldwin County AL (01003),34,208563,16.3,15.7,1003,42.3,86.4,90.0,51365,3.7,9.3,AL,123.8,132.1,143.5,1
1,Blount County AL (01009),15,57704,25.9947,Unreliable,1009,40.8,95.4,80.0,46212,3.0,12.2,AL,56.9,57.9,63.2,1
2,Calhoun County AL (01015),16,114611,13.9603,Unreliable,1015,39.1,74.9,82.3,41954,6.7,15.8,AL,161.0,165.4,180.0,1
3,Cullman County AL (01043),20,82471,24.3,26.4,1043,40.5,95.9,82.2,39297,3.2,12.0,AL,166.2,172.0,182.6,1
4,DeKalb County AL (01049),19,70900,26.7983,Unreliable,1049,39.3,87.2,72.6,38248,3.4,14.3,AL,114.4,117.6,122.7,1


In [25]:
# Bring in each state's PMP age via state_code; only the two needed columns are taken from pmp_age_df.
# Inner join: counties whose state_code has no match in pmp_age_df would be dropped.
pmp_by_state = pmp_age_df[['pmp_age_in_2016', 'state_code']]
result = result.merge(pmp_by_state, on='state_code')
result.head()

Unnamed: 0,county_name,overdose_deaths,population,crude_rate,age_adj_rate,INCITS,median_age,percent_white,hs_percent,median_hh_income,percent_unemployed,percent_poverty,state,opioid_rx_rate_2016,opioid_rx_rate_2015,opioid_rx_rate_2014,state_code,pmp_age_in_2016
0,Baldwin County AL (01003),34,208563,16.3,15.7,1003,42.3,86.4,90.0,51365,3.7,9.3,AL,123.8,132.1,143.5,1,10
1,Blount County AL (01009),15,57704,25.9947,Unreliable,1009,40.8,95.4,80.0,46212,3.0,12.2,AL,56.9,57.9,63.2,1,10
2,Calhoun County AL (01015),16,114611,13.9603,Unreliable,1015,39.1,74.9,82.3,41954,6.7,15.8,AL,161.0,165.4,180.0,1,10
3,Cullman County AL (01043),20,82471,24.3,26.4,1043,40.5,95.9,82.2,39297,3.2,12.0,AL,166.2,172.0,182.6,1,10
4,DeKalb County AL (01049),19,70900,26.7983,Unreliable,1049,39.3,87.2,72.6,38248,3.4,14.3,AL,114.4,117.6,122.7,1,10


## Merge division dummy variables onto dataframe.
Data from this csv: https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv

In [26]:
# Load the census-division dummy variables.
with open('us_regions_dummies.pkl', 'rb') as pkl_in:
    us_regions_dummies_df = pickle.load(pkl_in)

In [27]:
# Preview the division dummy columns.
us_regions_dummies_df.head()

Unnamed: 0,State Code,Division_East North Central,Division_East South Central,Division_Middle Atlantic,Division_Mountain,Division_New England,Division_Pacific,Division_South Atlantic,Division_West North Central,Division_West South Central
0,AK,0,0,0,0,0,1,0,0,0
1,AL,0,1,0,0,0,0,0,0,0
2,AR,0,0,0,0,0,0,0,0,1
3,AZ,0,0,0,1,0,0,0,0,0
4,CA,0,0,0,0,0,1,0,0,0


In [28]:
# Attach the division dummies by matching result's 'state' column to the dummies' 'State Code' column.
result = result.merge(us_regions_dummies_df, left_on='state', right_on='State Code')
result.head()

Unnamed: 0,county_name,overdose_deaths,population,crude_rate,age_adj_rate,INCITS,median_age,percent_white,hs_percent,median_hh_income,...,State Code,Division_East North Central,Division_East South Central,Division_Middle Atlantic,Division_Mountain,Division_New England,Division_Pacific,Division_South Atlantic,Division_West North Central,Division_West South Central
0,Baldwin County AL (01003),34,208563,16.3,15.7,1003,42.3,86.4,90.0,51365,...,AL,0,1,0,0,0,0,0,0,0
1,Blount County AL (01009),15,57704,25.9947,Unreliable,1009,40.8,95.4,80.0,46212,...,AL,0,1,0,0,0,0,0,0,0
2,Calhoun County AL (01015),16,114611,13.9603,Unreliable,1015,39.1,74.9,82.3,41954,...,AL,0,1,0,0,0,0,0,0,0
3,Cullman County AL (01043),20,82471,24.3,26.4,1043,40.5,95.9,82.2,39297,...,AL,0,1,0,0,0,0,0,0,0
4,DeKalb County AL (01049),19,70900,26.7983,Unreliable,1049,39.3,87.2,72.6,38248,...,AL,0,1,0,0,0,0,0,0,0


In [29]:
# List every column now present in result.
result.columns

Index(['county_name', 'overdose_deaths', 'population', 'crude_rate',
       'age_adj_rate', 'INCITS', 'median_age', 'percent_white', 'hs_percent',
       'median_hh_income', 'percent_unemployed', 'percent_poverty', 'state',
       'opioid_rx_rate_2016', 'opioid_rx_rate_2015', 'opioid_rx_rate_2014',
       'state_code', 'pmp_age_in_2016', 'State Code',
       'Division_East North Central', 'Division_East South Central',
       'Division_Middle Atlantic', 'Division_Mountain', 'Division_New England',
       'Division_Pacific', 'Division_South Atlantic',
       'Division_West North Central', 'Division_West South Central'],
      dtype='object')

In [31]:
# Build a frame holding only the relevant columns. With all the dummy columns added there are far more
# columns to keep than to discard, so we list the irrelevant ones and drop them.
drop_columns = [
    'county_name',
    'state_code',
    'overdose_deaths',
    'population',
    'age_adj_rate',
    'State Code',
    'state',
]
master_df = result.drop(drop_columns, axis=1)

In [31]:
# `relevant_columns` was never defined anywhere in this notebook, so the original line raised a
# NameError on a fresh kernel. Define it as every column of result that is not listed in
# drop_columns, then select those columns as before.
relevant_columns = [col for col in result.columns if col not in drop_columns]
master_df = result[relevant_columns]

In [32]:
# Use the INCITS code as the row index.
master_df = master_df.set_index('INCITS')

In [33]:
# Now that we've selected all the relevant columns, let's make all of the values numeric.
master_df = master_df[master_df.columns].apply(pd.to_numeric)

In [37]:
# Save the final merged, all-numeric frame to disk.
with open('master_df.pkl', 'wb') as pkl_out:
    pickle.dump(master_df, pkl_out)