In [1]:
#Import Dependencies
import pandas as pd
from scipy.stats import spearmanr
from scipy.stats import pearsonr

In [2]:
# Load the tab-separated Wonder 2016 mortality export and keep only its first
# 3105 rows; everything after that is discarded before analysis.
# NOTE(review): the earlier note on this cell spoke of 3106 rows of pertinent data
# while the code keeps 3105 - confirm which count is intended.
mortality_df = pd.read_csv('2016Mortality.txt', sep="\t").head(3105)

In [3]:
# Reduce the mortality frame to the columns of interest (crude rate and notes are
# dropped) and derive deaths per 1000 residents, rounded to one decimal.
# The new frame is built with .assign() on the column subset instead of writing
# columns onto the subset afterwards, which can raise SettingWithCopyWarning.
mortality_df = (
    mortality_df[['County', 'County Code', 'Deaths', 'Population']]
    .assign(**{
        # deaths / population, scaled to a per-1000 rate
        'Death Rate per 1000': lambda df: (df['Deaths'] / df['Population'] * 1000).round(1),
        # integer county codes so they can be matched against the SVI FIPS key
        'County Code': lambda df: df['County Code'].astype(int),
    })
)
mortality_df.head()

Unnamed: 0,County,County Code,Deaths,Population,Death Rate per 1000
0,"Autauga County, AL",1001,520.0,55416.0,9.4
1,"Baldwin County, AL",1003,1974.0,208563.0,9.5
2,"Barbour County, AL",1005,256.0,25965.0,9.9
3,"Bibb County, AL",1007,239.0,22643.0,10.6
4,"Blount County, AL",1009,697.0,57704.0,12.1


In [4]:
# Read the SVI 2016 county-level CSV into a DataFrame and preview the first rows.
svi_df = pd.read_csv(filepath_or_buffer='SVI2016_US_COUNTY.csv')
svi_df.head(n=5)

Unnamed: 0,FID,ST,STATE,ST_ABBR,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,...,F_CROWD,F_NOVEH,F_GROUPQ,F_THEME4,F_TOTAL,E_UNINSUR,M_UNINSUR,EP_UNINSUR,MP_UNINSUR,E_DAYPOP
0,0,1,ALABAMA,AL,Autauga,1001,"Autauga County, Alabama",594.44612,55049,0,...,0,0,0,0,0,4852,649,8.9,1.2,40854
1,1,1,ALABAMA,AL,Blount,1009,"Blount County, Alabama",644.806508,57704,0,...,0,0,0,0,0,6388,740,11.2,1.3,42597
2,2,1,ALABAMA,AL,Chambers,1017,"Chambers County, Alabama",596.531112,34018,0,...,0,0,0,0,0,3979,544,11.8,1.6,27940
3,3,1,ALABAMA,AL,Coffee,1031,"Coffee County, Alabama",678.985652,50991,0,...,0,0,0,0,0,5253,464,10.7,0.9,47236
4,4,1,ALABAMA,AL,Colbert,1033,"Colbert County, Alabama",592.619664,54377,0,...,0,0,0,0,0,4932,458,9.1,0.8,56227


In [5]:
# Inner-join mortality with SVI by county: the mortality 'County Code' is matched
# against the SVI 'FIPS' column, so only counties present in both tables remain
# for the statistical analysis.
merged_df = mortality_df.merge(
    svi_df,
    left_on='County Code',
    right_on='FIPS',
    how='inner',
)
merged_df.head()

Unnamed: 0,County,County Code,Deaths,Population,Death Rate per 1000,FID,ST,STATE,ST_ABBR,COUNTY,...,F_CROWD,F_NOVEH,F_GROUPQ,F_THEME4,F_TOTAL,E_UNINSUR,M_UNINSUR,EP_UNINSUR,MP_UNINSUR,E_DAYPOP
0,"Autauga County, AL",1001,520.0,55416.0,9.4,0,1,ALABAMA,AL,Autauga,...,0,0,0,0,0,4852,649,8.9,1.2,40854
1,"Baldwin County, AL",1003,1974.0,208563.0,9.5,1341,1,ALABAMA,AL,Baldwin,...,0,0,0,1,1,23255,1817,11.8,0.9,197683
2,"Barbour County, AL",1005,256.0,25965.0,9.9,3074,1,ALABAMA,AL,Barbour,...,0,0,1,2,8,3079,385,13.0,1.6,27321
3,"Bibb County, AL",1007,239.0,22643.0,10.6,2113,1,ALABAMA,AL,Bibb,...,0,0,1,2,2,1859,400,9.0,1.9,18756
4,"Blount County, AL",1009,697.0,57704.0,12.1,1,1,ALABAMA,AL,Blount,...,0,0,0,0,0,6388,740,11.2,1.3,42597


In [6]:
# Spearman rank correlation (no normality assumption) between the death rate and
# every other numeric column of the merged mortality/SVI frame.
# Text columns (county/state labels) are skipped because ranking labels against a
# rate is not meaningful, and the target itself is skipped because its correlation
# with itself is trivially 1.
target = 'Death Rate per 1000'
columns = merged_df.select_dtypes(include='number').columns.drop(target).tolist()

# Maps column name -> spearmanr result (coefficient, p-value).
correlations = {
    column: spearmanr(merged_df[target], merged_df[column])
    for column in columns
}



In [7]:
# Tabulate the Spearman results, one row per correlated column, and order the
# rows from the most negative to the most positive coefficient.
result = pd.DataFrame.from_dict(correlations, orient='index')
result.columns = ['SCC', 'p-value']  # SCC = Spearman correlation coefficient
sorted_df = result.sort_values(by=['SCC'])
sorted_df

Unnamed: 0,SCC,p-value
E_LIMENG,-0.535116,1.808915e-229
M_LIMENG,-0.501235,4.236370e-197
E_MINRTY,-0.461334,2.222176e-163
E_CROWD,-0.455775,5.005209e-159
E_AGE17,-0.453982,1.218931e-157
M_MUNIT,-0.449123,6.340737e-154
M_CROWD,-0.444777,1.188585e-150
EPL_LIMENG,-0.442789,3.604155e-149
EP_LIMENG,-0.442527,5.639028e-149
E_MUNIT,-0.439673,7.231035e-147


In [9]:
# Pearson correlation of the death rate against a positional slice of the merged
# frame's columns (positions 13 up to, not including, 129); the slice is meant to
# hold numeric columns only. Results are tabulated and sorted by coefficient.
columns_p = list(merged_df.columns[13:129])
correlations_p = {
    name: pearsonr(merged_df['Death Rate per 1000'], merged_df[name])
    for name in columns_p
}

result_p = pd.DataFrame.from_dict(correlations_p, orient='index')
result_p.columns = ['PCC', 'p-value']  # PCC = Pearson correlation coefficient
sorted_p = result_p.sort_values('PCC')
sorted_p

Unnamed: 0,PCC,p-value
EPL_LIMENG,-0.434319,5.742127e-143
EP_MUNIT,-0.432810,7.007387e-142
EPL_MUNIT,-0.413875,1.046615e-128
SPL_THEME3,-0.411071,7.941810e-127
RPL_THEME3,-0.404938,8.900245e-123
M_MUNIT,-0.401528,1.460614e-120
M_SNGPNT,-0.399526,2.835453e-119
M_UNINSUR,-0.393110,3.329490e-115
M_CROWD,-0.389822,3.741118e-113
M_UNEMP,-0.381762,3.176213e-108


In [10]:
# Read the life expectancy / Medicare CSV into a DataFrame and preview the first rows.
life_df = pd.read_csv(filepath_or_buffer='lifemedicare.csv')
life_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Location,FIPS,Life Expectancy,County name,"Total per enrollee (age, sex, race adjusted)","Total per enrollee (age, sex, race, price adjusted)"
0,0,"Autauga County, Alabama",1001,75.67,AL-Autauga County,8848,10413
1,1,"Baldwin County, Alabama",1003,78.08,AL-Baldwin County,8427,9883
2,2,"Barbour County, Alabama",1005,75.42,AL-Barbour County,8979,10704
3,3,"Bibb County, Alabama",1007,73.97,AL-Bibb County,9550,10860
4,4,"Blount County, Alabama",1009,76.16,AL-Blount County,9860,11179


In [14]:
# Inner-join life expectancy with SVI on the FIPS column both tables share, so
# only counties present in both remain for the statistical analysis.
merged_df2 = life_df.merge(svi_df, on='FIPS', how='inner')
merged_df2.head()

Unnamed: 0.1,Unnamed: 0,Location,FIPS,Life Expectancy,County name,"Total per enrollee (age, sex, race adjusted)","Total per enrollee (age, sex, race, price adjusted)",FID,ST,STATE,...,F_CROWD,F_NOVEH,F_GROUPQ,F_THEME4,F_TOTAL,E_UNINSUR,M_UNINSUR,EP_UNINSUR,MP_UNINSUR,E_DAYPOP
0,0,"Autauga County, Alabama",1001,75.67,AL-Autauga County,8848,10413,0,1,ALABAMA,...,0,0,0,0,0,4852,649,8.9,1.2,40854
1,1,"Baldwin County, Alabama",1003,78.08,AL-Baldwin County,8427,9883,1341,1,ALABAMA,...,0,0,0,1,1,23255,1817,11.8,0.9,197683
2,2,"Barbour County, Alabama",1005,75.42,AL-Barbour County,8979,10704,3074,1,ALABAMA,...,0,0,1,2,8,3079,385,13.0,1.6,27321
3,3,"Bibb County, Alabama",1007,73.97,AL-Bibb County,9550,10860,2113,1,ALABAMA,...,0,0,1,2,2,1859,400,9.0,1.9,18756
4,4,"Blount County, Alabama",1009,76.16,AL-Blount County,9860,11179,1,1,ALABAMA,...,0,0,0,0,0,6388,740,11.2,1.3,42597


In [15]:
# Spearman rank correlation between life expectancy and a positional slice of the
# merged life-expectancy/SVI frame (positions 12 up to, not including, 130), then
# tabulate the results and sort them by coefficient.
# NOTE(review): judging by the merged preview, position 12 looks like the text
# LOCATION column rather than a numeric SVI field - confirm the slice start.
columns_life = merged_df2.columns[12:130].tolist()

# Maps column name -> spearmanr result (coefficient, p-value).
correlations_life = {
    column: spearmanr(merged_df2['Life Expectancy'], merged_df2[column])
    for column in columns_life
}

result_life = pd.DataFrame.from_dict(correlations_life, orient='index')
result_life.columns = ['SCC', 'p-value']  # SCC = Spearman correlation coefficient
sorted_life_df = result_life.sort_values(by=['SCC'])
sorted_life_df



Unnamed: 0,SCC,p-value
RPL_THEME1,-0.773194,0.000000e+00
SPL_THEME1,-0.773187,0.000000e+00
EPL_PCI,-0.702873,0.000000e+00
SPL_THEMES,-0.696074,0.000000e+00
RPL_THEMES,-0.696073,0.000000e+00
EPL_NOHSDP,-0.695788,0.000000e+00
EP_NOHSDP,-0.695788,0.000000e+00
EPL_POV,-0.693060,0.000000e+00
EP_POV,-0.693060,0.000000e+00
RPL_THEME2,-0.634747,0.000000e+00


In [122]:
# Pearson correlation between life expectancy and a positional slice of the merged
# life-expectancy/SVI frame (positions 13 up to, not including, 130), then tabulate
# the results and sort them by coefficient.
# NOTE(review): this cell used to read a frame named merged_df3 that no cell in
# this notebook creates, so it failed on a fresh kernel. It now reads merged_df2,
# the life-expectancy/SVI merge - confirm that is the intended input.
columns_life_p = merged_df2.columns[13:130].tolist()

# Maps column name -> pearsonr result (coefficient, p-value).
correlations_life_p = {
    column: pearsonr(merged_df2['Life Expectancy'], merged_df2[column])
    for column in columns_life_p
}

result_life_p = pd.DataFrame.from_dict(correlations_life_p, orient='index')
result_life_p.columns = ['PCC', 'p-value']  # PCC = Pearson correlation coefficient
sorted_life_p = result_life_p.sort_values(by=['PCC'])
sorted_life_p

Unnamed: 0,PCC,p-value
SPL_THEME1,-0.764425,0.000000e+00
RPL_THEME1,-0.762146,0.000000e+00
EPL_PCI,-0.701032,0.000000e+00
EPL_POV,-0.689713,0.000000e+00
EP_POV,-0.685514,0.000000e+00
RPL_THEMES,-0.683695,0.000000e+00
EPL_NOHSDP,-0.680065,0.000000e+00
SPL_THEMES,-0.674783,0.000000e+00
RPL_THEME2,-0.620507,0.000000e+00
SPL_THEME2,-0.614892,0.000000e+00


In [16]:
# Draw a reproducible random sample of 500 counties (fixed random_state) from the
# life-expectancy/SVI merge and export it to CSV for the visualization test.
random_counties_life = merged_df2.sample(n=500, random_state=1)
random_counties_life.to_csv('randomcounties_life.csv')

# Preview goes last so the notebook actually renders it; a bare expression in the
# middle of a cell is not displayed.
random_counties_life.head()