In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Collection

## Collecting Asthma Information

CSV files with information on asthma, diabetes, cancer, copd, heart disease, kidney disease incident rates/ prevalence rates were downloaded from the CDC website. These csv files are each approximately 40 MB with responses to various questions, rates by demographic as well as overall population totals for more than a decade. Each dataset must be cleaned and unecessary information dropped to reduce the files to a manageable size. 

In [2]:
# Large csv files were uploaded into an untracked folder
asthma = pd.read_csv('Ignore/Asthma.csv')

In [3]:
asthma.shape

(80342, 33)

In [4]:
asthma.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


In [5]:
asthma.Question.value_counts()

Asthma mortality rate                                                                      13497
Current asthma prevalence among adults aged >= 18 years                                     9570
Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma        9570
Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma        9570
Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma     9570
Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma     9570
Hospitalizations for asthma                                                                 7812
Emergency department visit rate for asthma                                                  7608
Asthma prevalence among women aged 18-44 years                                              3575
Name: Question, dtype: int64

In [6]:
# Reducing data to only include reqponses to question of interest
asthma = asthma[asthma['Question']== "Current asthma prevalence among adults aged >= 18 years"]

In [7]:
asthma.shape

(9570, 33)

In [8]:
asthma.describe()

Unnamed: 0,YearStart,YearEnd,Response,DataValue,DataValueAlt,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory2,Stratification2,StratificationCategory3,Stratification3,ResponseID,LocationID,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
count,9570.0,9570.0,0.0,7662.0,7662.0,7662.0,7662.0,0.0,0.0,0.0,0.0,0.0,9570.0,0.0,0.0,0.0,0.0
mean,2016.0,2016.0,,10.195184,10.195184,8.039585,13.059867,,,,,,31.542529,,,,
std,3.162443,3.162443,,3.745644,3.745644,2.676902,5.999667,,,,,,18.26542,,,,
min,2011.0,2011.0,,1.9,1.9,1.1,3.3,,,,,,1.0,,,,
25%,2013.0,2013.0,,7.7,7.7,6.1,9.3,,,,,,17.0,,,,
50%,2016.0,2016.0,,9.6,9.6,8.0,11.4,,,,,,31.0,,,,
75%,2019.0,2019.0,,11.9,11.9,9.8,14.8,,,,,,45.0,,,,
max,2021.0,2021.0,,44.1,44.1,26.9,66.6,,,,,,78.0,,,,


In [9]:
# We only need data for pre-existing conditions for 1 year
asthma = asthma[asthma['YearStart'] == 2019]

In [10]:
asthma.shape

(870, 33)

In [11]:
asthma.YearStart.value_counts()

2019    870
Name: YearStart, dtype: int64

In [12]:
# Reducing dataframe to only include the 7 columns we need for analysis and data cleaning
asthma = asthma[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1', 'DataValueType']]

In [13]:
asthma.head()

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,DataValue,Stratification1,DataValueType
28919,2019,GU,Guam,Current asthma prevalence among adults aged >=...,7.5,Female,Age-adjusted Prevalence
28927,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,10.0,"White, non-Hispanic",Crude Prevalence
28932,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,12.6,Female,Age-adjusted Prevalence
28977,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,7.0,Male,Age-adjusted Prevalence
28983,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,7.0,"Other, non-Hispanic",Crude Prevalence


In [14]:
asthma.shape

(870, 7)

In [15]:
asthma['LocationDesc'].nunique()

55

In [16]:
asthma.drop_duplicates(inplace=True)

In [17]:
asthma.shape

(870, 7)

In [18]:
# Reducing dataset to only include population totals/ dropping rates by race and gender demographics
asthma = asthma[asthma['Stratification1'] == 'Overall']

## Creating a function

In [19]:
#cancer = pd.read_csv('Ignore/Cancer.csv')
#cancer.shape

In [20]:
#cancer['Question'].value_counts()

In [21]:
#cancer['YearStart'].value_counts()

In [22]:
#Reducing Cancer to just total cancer
#cancer.Question.value_counts()

#cancer = cancer[cancer['Question']== 'Invasive cancer (all sites combined), incidence']

#cancer.shape

In [23]:
#cancer.YearStart.value_counts()

*For the question on interest, total cancer rates, we do not have data for 2019 or 2020. The most recent year is 2016 which is too long ago to be used as an accurate estimate of pre-existing cancer rates*

## Reading in other csvs 

In [24]:
heart = pd.read_csv('Ignore/Cardiovascular.csv')

In [25]:
# Splitting cardiac dataset into deaths from cardio diseases and diagnosed prevalence of hypertension
cardiac_mortality = heart[heart['Question'] == 'Mortality from total cardiovascular diseases']
print(cardiac_mortality.shape)

high_blood_pressure = heart[heart['Question']== 'Awareness of high blood pressure among adults aged >= 18 years']
print(high_blood_pressure.shape)

(13497, 33)
(5220, 33)


In [26]:
copd = pd.read_csv('Ignore/COPD.csv')
copd.shape

(152874, 33)

In [27]:
copd.Question.value_counts()

copd = copd[copd['Question']== 'Prevalence of chronic obstructive pulmonary disease among adults >= 18']

copd.shape

(9570, 33)

In [28]:
diabetes = pd.read_csv('Ignore/Diabetes.csv')
diabetes.shape

(156808, 33)

In [29]:
diabetes['Question'].value_counts()

diabetes = diabetes[diabetes['Question']== 'Prevalence of diagnosed diabetes among adults aged >= 18 years']

diabetes.shape

(9570, 33)

In [30]:
immun = pd.read_csv('Ignore/Immunization.csv')

In [31]:
kidney = pd.read_csv('Ignore/Kidney.csv')

In [32]:
kidney['Question'].value_counts()

kidney = kidney[kidney['Question']== 'Prevalence of chronic kidney disease among adults aged >= 18 years']

kidney.shape

(9570, 33)

## Defining cleaning function:
Following the steps used on the asthma csv as each dataset is structured the same

In [33]:
def cleaning(df):
    print(df.shape)
    df = df[df['YearStart'] == 2019]
    print(df.shape)
    df = df[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1', 'DataValueType']]
    print(df.shape)
    df = df[df['Stratification1'] == 'Overall']
    return df  
               

In [34]:
#cancer = cancer[cancer['YearStart'] >= 2016]
#cancer = cancer[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1']]                 

In [35]:
cardiac_mortality = cleaning(cardiac_mortality)

(13497, 33)
(1227, 33)
(1227, 7)


In [36]:
high_blood_pressure = cleaning(high_blood_pressure)

(5220, 33)
(870, 33)
(870, 7)


In [37]:
copd = cleaning(copd)

(9570, 33)
(870, 33)
(870, 7)


In [38]:
diabetes = cleaning(diabetes)

(9570, 33)
(870, 33)
(870, 7)


In [39]:
immun = cleaning(immun)

(9570, 33)
(870, 33)
(870, 7)


In [40]:
kidney = cleaning(kidney)

(9570, 33)
(870, 33)
(870, 7)


In [41]:
# I now have 7 dataframes. Some of these need to be filtered by question
# asthma, cancer, heart, copd, diabetes, immun, kidney
# I have the overall rates as well as rates by gender & race

In [42]:
#Examining the size of each dataset to ensure cleaning function worked

print(asthma.shape)
#print(cancer.shape)
print(cardiac_mortality.shape)
print(high_blood_pressure.shape)
print(copd.shape)
print(diabetes.shape)
print(immun.shape)
print(kidney.shape)

(110, 7)
(156, 7)
(110, 7)
(110, 7)
(110, 7)
(110, 7)
(110, 7)


In [43]:
# Cardiac_mortality is larger: ensuring there are no duplicate values
cardiac_mortality.drop_duplicates(inplace=True)

In [44]:
cardiac_mortality.shape

(156, 7)

In [45]:
asthma.head()

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,DataValue,Stratification1,DataValueType
29319,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,9.7,Overall,Age-adjusted Prevalence
29673,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,9.9,Overall,Crude Prevalence
30178,2019,AL,Alabama,Current asthma prevalence among adults aged >=...,9.4,Overall,Age-adjusted Prevalence
30473,2019,AL,Alabama,Current asthma prevalence among adults aged >=...,9.5,Overall,Crude Prevalence
31698,2019,AR,Arkansas,Current asthma prevalence among adults aged >=...,9.3,Overall,Age-adjusted Prevalence


In [46]:
asthma['Stratification1'].value_counts()

Overall    110
Name: Stratification1, dtype: int64

In [47]:
asthma.DataValueType.value_counts()

Age-adjusted Prevalence    55
Crude Prevalence           55
Name: DataValueType, dtype: int64

In [48]:
copd.DataValueType.value_counts()

Age-adjusted Prevalence    55
Crude Prevalence           55
Name: DataValueType, dtype: int64

In [49]:
cardiac_mortality.DataValueType.value_counts()

Number               52
Crude Rate           52
Age-adjusted Rate    52
Name: DataValueType, dtype: int64

In [50]:
high_blood_pressure.head()

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,DataValue,Stratification1,DataValueType
97076,2019,AK,Alaska,Awareness of high blood pressure among adults ...,32.8,Overall,Age-adjusted Prevalence
98160,2019,AK,Alaska,Awareness of high blood pressure among adults ...,32.8,Overall,Crude Prevalence
98274,2019,AL,Alabama,Awareness of high blood pressure among adults ...,42.5,Overall,Crude Prevalence
98432,2019,AL,Alabama,Awareness of high blood pressure among adults ...,39.4,Overall,Age-adjusted Prevalence
99507,2019,AR,Arkansas,Awareness of high blood pressure among adults ...,38.2,Overall,Age-adjusted Prevalence


In [51]:
cardiac_mortality.head()

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,DataValue,Stratification1,DataValueType
223,2019,RI,Rhode Island,Mortality from total cardiovascular diseases,3043.0,Overall,Number
1148,2019,MN,Minnesota,Mortality from total cardiovascular diseases,212.3,Overall,Crude Rate
1400,2019,MD,Maryland,Mortality from total cardiovascular diseases,16036.0,Overall,Number
1656,2019,AK,Alaska,Mortality from total cardiovascular diseases,156.2,Overall,Crude Rate
1682,2019,DE,Delaware,Mortality from total cardiovascular diseases,214.3,Overall,Age-adjusted Rate


In [52]:
def adj(df):
    adj_df = df[df['DataValueType'] == 'Age-adjusted Prevalence'].copy()
    adj_df.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'], inplace=True)
    return adj_df

def crude(df):
    crude_df = df[df['DataValueType'] == 'Prevalence'].copy()
    crude_df.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'], inplace=True)
    return crude_df

# Cardiac mortality is 'Age-adjusted Rate' and 'Crude Rate' not 'Prevalence'- fix this in final code

In [53]:
#adj_asthma = adj(asthma)
#crude_asthma = crude(asthma)

#adj_asthma.to_csv('Data/asthma_adj.csv')
#crude_asthma.to_csv('Data/asthma_crude.csv')

In [54]:
#Don't keep re-running this cell
def process_dataframes(dataframe_list):
    results = {} 
    for name, df in dataframe_list.items():
        adj_result = adj(df)
        crude_result = crude(df)
        results[name] = {'adj': adj_result, 'crude': crude_result}
    return results

# Create a list of DataFrames 
dataframe_list = {
    'copd': copd,
    'asthma': asthma,
    'high_blood_pressure': high_blood_pressure,
    'diabetes': diabetes,
    'immun': immun,
    'kidney': kidney
}
# Process the list of DataFrames
results = process_dataframes(dataframe_list)

# Save the results to CSV
for name, result in results.items():
    result['adj'].to_csv(f'Data/Raw/{name}_adj.csv', index = False)
    result['crude'].to_csv(f'Data/Raw/{name}_crude.csv', index = False)

In [55]:
cardiac_mortality = cardiac_mortality[cardiac_mortality['DataValueType'] == 'Age-adjusted Rate']
cardiac_mortality.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'], inplace=True)
cardiac_mortality.to_csv('Data/Raw/cardiac_mortality_adj.csv', index = False)

# Data Cleaning

### Data Cleaning Goals:

1. Identify any issues with each dataframe
2. If any missing values- fill with an appropriate placeholder/ check documentation
3. Rename variables appropriately
    - Location
    - variable of interest
4. Drop uneccesary columns
    - year
    - variable descriptions
    - demographics (initially)
    - abbreviations
5. Drop unecessary rows
    - Keep 50 states
    - keep total us
    - Maybe keep PR or DC if present in all datasets
5. Calculate any necessary values-> convert total numbers to rates per population
6. Check datatypes
7. Date/ Time variables?
8. Reformat dataframes if necessary
9. Concatenate/ merge as necessary

#### For 2 insurance datasets: 
- add year as prefix/ suffix to each column 
- drop year column

In [56]:
insur_2019 = pd.read_csv('Data/Raw/2019_insurance.csv')

In [57]:
insur_2019.head()

Unnamed: 0,Location,Employer,Non-Group,Medicaid,Medicare,Military,Uninsured,Year
0,United States,0.496,0.059,0.198,0.142,0.014,0.092,2019
1,Alabama,0.472,0.055,0.195,0.16,0.021,0.097,2019
2,Alaska,0.484,0.035,0.213,0.1,0.053,0.115,2019
3,Arizona,0.451,0.052,0.21,0.161,0.015,0.111,2019
4,Arkansas,0.42,0.054,0.262,0.159,0.014,0.091,2019


In [58]:
insur_2021 = pd.read_csv('Data/Raw/2021_insurance.csv')

In [59]:
insur = pd.merge(insur_2019, insur_2021, how ='inner', on ='Location', suffixes=('_2019', '_2021'))

In [60]:
insur.head()

Unnamed: 0,Location,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Year_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Year_2021
0,United States,0.496,0.059,0.198,0.142,0.014,0.092,2019,0.485,0.061,0.211,0.143,0.013,0.086,2021
1,Alabama,0.472,0.055,0.195,0.16,0.021,0.097,2019,0.466,0.06,0.192,0.162,0.021,0.1,2021
2,Alaska,0.484,0.035,0.213,0.1,0.053,0.115,2019,0.433,0.041,0.257,0.107,0.053,0.108,2021
3,Arizona,0.451,0.052,0.21,0.161,0.015,0.111,2019,0.45,0.054,0.213,0.162,0.015,0.106,2021
4,Arkansas,0.42,0.054,0.262,0.159,0.014,0.091,2019,0.411,0.056,0.27,0.156,0.015,0.092,2021


In [61]:
insur.drop(columns =['Year_2019', 'Year_2021'], inplace=True)

In [62]:
insur.head()

Unnamed: 0,Location,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021
0,United States,0.496,0.059,0.198,0.142,0.014,0.092,0.485,0.061,0.211,0.143,0.013,0.086
1,Alabama,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1
2,Alaska,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108
3,Arizona,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106
4,Arkansas,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092


In [63]:
insur.to_csv('Data/cleaned_insur.csv', index = False)

#### For pre-existing conditions & immunizations:
1. choose crude or adj
2. drop old index
3. rename data value to question
4. drop year and question columns
5. For total number of incident data -> after concatening with population data, convert to rate

*Crude are the raw numbers. Adj are these numbers adjusted/ standardized based on population's age distribution. We can retain the crude numbers however the age adjusted numbers will probably be better for cross comparison*

In [64]:
asthma = pd.read_csv('Data/Raw/asthma_adj.csv')

In [65]:
asthma.head()

Unnamed: 0,YearStart,LocationDesc,Question,DataValue
0,2019,Alaska,Current asthma prevalence among adults aged >=...,9.7
1,2019,Alabama,Current asthma prevalence among adults aged >=...,9.4
2,2019,Arkansas,Current asthma prevalence among adults aged >=...,9.3
3,2019,Arizona,Current asthma prevalence among adults aged >=...,9.8
4,2019,California,Current asthma prevalence among adults aged >=...,7.8


In [66]:
asthma.rename(columns={
    'DataValue':'asthma_prevalence',
    'LocationDesc':'Location'}, inplace=True)

In [67]:
asthma.drop(columns=['YearStart', 'Question'], inplace=True)
asthma.head()

Unnamed: 0,Location,asthma_prevalence
0,Alaska,9.7
1,Alabama,9.4
2,Arkansas,9.3
3,Arizona,9.8
4,California,7.8


In [68]:
high_bp = pd.read_csv('Data/Raw/high_blood_pressure_adj.csv')
high_bp.head()

Unnamed: 0,YearStart,LocationDesc,Question,DataValue
0,2019,Alaska,Awareness of high blood pressure among adults ...,32.8
1,2019,Alabama,Awareness of high blood pressure among adults ...,39.4
2,2019,Arkansas,Awareness of high blood pressure among adults ...,38.2
3,2019,Arizona,Awareness of high blood pressure among adults ...,29.9
4,2019,California,Awareness of high blood pressure among adults ...,26.6


In [69]:
high_bp.rename(columns={
    'DataValue':'high_bp_prevalence',
    'LocationDesc':'Location'}, inplace=True)
high_bp.drop(columns=['YearStart', 'Question'], inplace=True)
high_bp.head()

Unnamed: 0,Location,high_bp_prevalence
0,Alaska,32.8
1,Alabama,39.4
2,Arkansas,38.2
3,Arizona,29.9
4,California,26.6


In [70]:
cardiac_mortality = pd.read_csv('Data/Raw/cardiac_mortality_adj.csv')

In [71]:
cardiac_mortality.rename(columns={
    'DataValue':'cardiac_mortality_rate',
    'LocationDesc':'Location'}, inplace=True)
cardiac_mortality.drop(columns=['YearStart', 'Question'], inplace=True)
cardiac_mortality.head()

Unnamed: 0,Location,cardiac_mortality_rate
0,Delaware,214.3
1,Georgia,234.3
2,Alaska,178.5
3,Kentucky,253.8
4,Arkansas,284.3


In [72]:
diabetes = pd.read_csv('Data/Raw/diabetes_adj.csv')
diabetes.head()

Unnamed: 0,YearStart,LocationDesc,Question,DataValue
0,2019,Alaska,Prevalence of diagnosed diabetes among adults ...,7.1
1,2019,Alabama,Prevalence of diagnosed diabetes among adults ...,12.2
2,2019,Arkansas,Prevalence of diagnosed diabetes among adults ...,12.2
3,2019,Arizona,Prevalence of diagnosed diabetes among adults ...,9.8
4,2019,California,Prevalence of diagnosed diabetes among adults ...,9.4


In [73]:
diabetes.rename(columns={
    'DataValue':'diabetes_prevalence',
    'LocationDesc':'Location'}, inplace=True)

In [74]:
diabetes.drop(columns=['YearStart', 'Question'], inplace=True)
diabetes.head()

Unnamed: 0,Location,diabetes_prevalence
0,Alaska,7.1
1,Alabama,12.2
2,Arkansas,12.2
3,Arizona,9.8
4,California,9.4


In [75]:
kidney = pd.read_csv('Data/Raw/kidney_adj.csv')
kidney.head()

Unnamed: 0,YearStart,LocationDesc,Question,DataValue
0,2019,Alaska,Prevalence of chronic kidney disease among adu...,1.8
1,2019,Alabama,Prevalence of chronic kidney disease among adu...,3.1
2,2019,Arkansas,Prevalence of chronic kidney disease among adu...,3.7
3,2019,California,Prevalence of chronic kidney disease among adu...,2.8
4,2019,Arizona,Prevalence of chronic kidney disease among adu...,3.6


In [76]:
kidney.rename(columns={
    'DataValue':'kidney_disease_prevalence',
    'LocationDesc':'Location'}, inplace=True)

In [77]:
kidney.drop(columns=['YearStart', 'Question'], inplace=True)
kidney.head()

Unnamed: 0,Location,kidney_disease_prevalence
0,Alaska,1.8
1,Alabama,3.1
2,Arkansas,3.7
3,California,2.8
4,Arizona,3.6


In [78]:
copd = pd.read_csv('Data/Raw/copd_adj.csv')
copd.rename(columns={
    'DataValue':'copd_prevalence',
    'LocationDesc':'Location'}, inplace=True)
copd.drop(columns=['YearStart', 'Question'], inplace=True)
copd.head()

Unnamed: 0,Location,copd_prevalence
0,Alaska,4.6
1,Alabama,9.3
2,Arkansas,9.7
3,California,4.2
4,Arizona,6.0


In [79]:
immun = pd.read_csv('Data/Raw/immun_adj.csv')
immun.rename(columns={
    'DataValue':'flu_vaccination_rate_2019',
    'LocationDesc':'Location'}, inplace=True)
immun.drop(columns=['YearStart', 'Question'], inplace=True)
immun.head()

Unnamed: 0,Location,flu_vaccination_rate_2019
0,Alaska,37.0
1,Alabama,39.5
2,Arkansas,40.1
3,Arizona,37.1
4,California,40.7


In [80]:
pre_con = pd.merge(immun, asthma, on='Location', how='inner')

dataframes_to_merge = [cardiac_mortality, high_bp, copd, kidney, diabetes]

for df in dataframes_to_merge:
    pre_con = pd.merge(pre_con, df, on='Location', how='inner')

In [81]:
pre_con.head()

Unnamed: 0,Location,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence
0,Alaska,37.0,9.7,178.5,32.8,4.6,1.8,7.1
1,Alabama,39.5,9.4,289.3,39.4,9.3,3.1,12.2
2,Arkansas,40.1,9.3,284.3,38.2,9.7,3.7,12.2
3,Arizona,37.1,9.8,181.8,29.9,6.0,3.6,9.8
4,California,40.7,7.8,192.5,26.6,4.2,2.8,9.4


In [82]:
pre_con.to_csv('Data/cleaned_pre_condtions.csv', index = True)

#### Income per capita: (Suli)
1. Rename year columns
2. Rename Geoname to Location
3. Drop geofips column


#### Life Expectancy: (Suli)
1. Convert abbreviated state names to unabbreviated name (Rename column name to match and then merge with Mask Mandates on abbreviations and then drop abbreviations)
2. Rename State to Location
3. Rename rate to average life expectancy
4. Drop url and year

#### Mask Mandate: (Suli)
1. After using abbreviation to match with states for life expectancy drop abbrev
2. Rename state_name to Location
3. Rename mandatory to mask_mandate

In [83]:
mask = pd.read_csv('Data/Mask Mandate.csv')
mask.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data/Mask Mandate.csv'

In [None]:
mask.rename(columns = {'STATE_NAME': 'Location'},inplace=True)
mask.drop(columns='State_Abrv', inplace=True)
mask.head()

In [None]:
mask.to_csv('Data/cleaned_mask.csv', index = False)

#### Population Data:
1. Drop index column
2. Rename State to Location

In [None]:
pop_size = pd.read_csv('Data/Population_data_2010_&_2020.csv')

In [None]:
pop_size['Location'] = pop_size['State']

In [None]:
pop_size.head()

In [None]:
pop_size.drop(columns = ['Unnamed: 0', 'State'], inplace=True)

In [None]:
pop_size.head()

In [None]:
pop_size.to_csv('Data/cleaned_pop_size.csv', index = False)

#### Population Density:
1. Drop index column
2. Rename State to Location

In [None]:
pop_dense = pd.read_csv('Data/Population_Density_data.csv')

In [None]:
pop_dense.head()

In [None]:
pop_dense.rename(columns={'State': 'Location'}, inplace=True)
pop_dense.drop(columns='Unnamed: 0', inplace=True)

In [None]:
pop_dense.head()

In [None]:
pop_dense.to_csv('Data/cleaned_pop_dense.csv', index = False)

#### Total Employment:
1. Rename columns with values of row 1
2. Rename year columns to total_employment_year
3. Rename GeoName to Location
4. Drop GeoFips

#### Total Physicians:
- 

# Merging Cleaned Datasets

# County-Level Data

#### County conditions

In [None]:
county_cond = pd.read_csv('Data/2019 County Health Rankings Data - cleaned.csv')

In [None]:
county_cond.shape

In [None]:
county_cond.head()

In [None]:
county_cond.dtypes

#### Covid Vaccination Rates by county 

In [84]:
covid_vax = pd.read_csv('Ignore/COVID-19_Vaccinations_by_county.csv')

In [85]:
covid_vax.shape

(871062, 15)

In [86]:
covid_vax.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct,Series_Complete_Pop_Pct,Series_Complete_65PlusPop_Pct,Booster_Doses_Vax_Pct,Booster_Doses_50Plus_Vax_Pct,Booster_Doses_65Plus_Vax_Pct,Metro_status,Census2019,Census2019_65PlusPop
0,12/28/2022,2013,52,Aleutians East Borough,AK,86.9,55.0,74.9,49.9,41.4,59.0,80.6,Non-metro,3337.0,351.0
1,12/28/2022,2016,52,Aleutians West Census Area,AK,77.8,69.9,64.2,59.9,38.6,59.8,76.1,Non-metro,5634.0,419.0
2,12/28/2022,2020,52,Anchorage Municipality,AK,79.5,95.0,70.8,95.0,49.8,67.9,77.7,Metro,288000.0,33757.0
3,12/28/2022,2050,52,Bethel Census Area,AK,74.0,89.3,68.7,86.0,51.8,74.9,82.9,Non-metro,18386.0,1448.0
4,12/28/2022,2060,52,Bristol Bay Borough,AK,95.0,95.0,95.0,91.2,41.0,67.6,88.7,Non-metro,836.0,136.0


In [None]:
county_fips = covid_vax[[FIPS

In [None]:
# What dates do I want to keep?
9/18/2021
covid_vax[covid_vax['Date'] == '11/18/2021'].shape

In [None]:
covid_vax[covid_vax['Date'] == '9/18/2021'].shape

In [None]:
covid_vax[covid_vax['Date'] == '12/18/2021'].shape

In [None]:
covid_vax[covid_vax['Date'] == '1/31/2022'].shape

In [None]:
covid_vax[covid_vax['Date'] == '4/30/2022'].shape

In [None]:
covid_vax[covid_vax['Date'] == '10/18/2021'].shape

#### Covid Cases

In [87]:
cases = pd.read_csv('Ignore/covid_confirmed_usafacts.csv')

In [None]:
cases.head()

In [None]:
cases.shape

In [None]:
cases.describe()

In [88]:
cases['County'] = cases['County Name'].str.replace(r'\bCounty\b', '', regex=True).str.strip()

In [89]:
cases.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2023-07-15,2023-07-16,2023-07-17,2023-07-18,2023-07-19,2023-07-20,2023-07-21,2023-07-22,2023-07-23,County
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Statewide Unallocated
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,19913,19913,19913,19913,19913,19913,19913,19913,19913,Autauga
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,70521,70521,70521,70521,70521,70521,70521,70521,70521,Baldwin
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,7582,7582,7582,7582,7582,7582,7582,7582,7582,Barbour
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,8149,8149,8149,8149,8149,8149,8149,8149,8149,Bibb


In [90]:
county_fips = cases[['countyFIPS', 'County Name', 'State']]

In [91]:
county_fips.head()

Unnamed: 0,countyFIPS,County Name,State
0,0,Statewide Unallocated,AL
1,1001,Autauga County,AL
2,1003,Baldwin County,AL
3,1005,Barbour County,AL
4,1007,Bibb County,AL


In [92]:
county_fips.shape

(3193, 3)

In [93]:
county_fips.to_csv('Data/county_fips.csv')

In [None]:
cases_county = cases[['countyFIPS', 
              'County', 
              'State', 
              'StateFIPS', 
              '2020-01-22', 
              '2020-05-01', 
              '2020-01-22',
              '2020-09-01',
              '2020-12-31',
              '2021-04-30',
              '2021-08-31',
              '2021-07-02',
              '2021-12-31',
              '2022-04-01',
              '2022-07-02',
              '2022-12-31']]

In [None]:
cases_county.shape

In [None]:
cases_county.head()

In [None]:
cases_county[cases_county['2022-12-31'] == 0].shape

In [None]:
cases_county[cases_county['County'] == 'Statewide Unallocated']

***We need to determine how we want to handle the Statewide Unallocated- increase the number of cases per county based on the county's portion of the population?***

- to handle it this way we would need to append the county population size to this df/ merge with a different df
- then calculate pop_prop = county_pop/ state_pop
- then increase covid cases for each county within a state 

In [None]:
for state, abrv in state_list:
    state_abrv 
    pop['population'] == sum(county['population'] if county['State'] == state_abrv)

In [None]:
# getting sum of each states excess deaths 
#def excess_deaths(deaths, start_year=2017, end_year=2022):
    #state_totals = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
   # for index, row in deaths.iterrows():
       # state = row['State']
      #  year = row['Year']
       # excess_estimate = row['Excess Estimate']
        
       # if start_year <= year <= end_year:
     #       state_totals[state][year - start_year] += excess_estimate

  #  result = pd.DataFrame(state_totals).T.reset_index()
   # result.columns = ['State'] + [f'Exc_deaths_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
  #  return result


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


In [3]:
deaths = pd.read_csv('Ignore/covid_deaths_usafacts.csv')

In [4]:
deaths.shape

(3193, 1269)

In [5]:
deaths.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2023-07-14,2023-07-15,2023-07-16,2023-07-17,2023-07-18,2023-07-19,2023-07-20,2023-07-21,2023-07-22,2023-07-23
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,235,235,235,235,235,235,235,235,235,235
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,731,731,731,731,731,731,731,731,731,731
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,104,104,104,104,104,104,104,104,104,104
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,111,111,111,111,111,111,111,111,111,111


In [6]:
for i in range(5, len(deaths.columns)):
    deaths[deaths.columns[i]] = deaths[deaths.columns[i]] + deaths[deaths.columns[i - 1]]

In [7]:
deaths.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2023-07-14,2023-07-15,2023-07-16,2023-07-17,2023-07-18,2023-07-19,2023-07-20,2023-07-21,2023-07-22,2023-07-23
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,167118,167353,167588,167823,168058,168293,168528,168763,168998,169233
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,528939,529670,530401,531132,531863,532594,533325,534056,534787,535518
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,78773,78877,78981,79085,79189,79293,79397,79501,79605,79709
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,85889,86000,86111,86222,86333,86444,86555,86666,86777,86888


In [ ]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn import metrics

In [32]:
merged_data = pd.read_csv('Data/Merged_state_data.csv')

In [35]:
merged_data.dropna(inplace=True)

In [36]:
merged_data.isna().sum()

Location                      0
Employment_2020               0
Employment_2021               0
Employment_2022               0
Inc_Per_Cap_2020              0
Inc_Per_Cap_2021              0
Inc_Per_CAp_2022              0
Life_Exp_2020                 0
Life_Exp_2019                 0
Life_Exp_2018                 0
Employer_2019                 0
Non-Group_2019                0
Medicaid_2019                 0
Medicare_2019                 0
Military_2019                 0
Uninsured_2019                0
Employer_2021                 0
Non-Group_2021                0
Medicaid_2021                 0
Medicare_2021                 0
Military_2021                 0
Uninsured_2021                0
Population Density per mi²    0
2010 Population               0
2020 Population               0
Unnamed: 0                    0
flu_vaccination_rate_2019     0
asthma_prevalence             0
cardiac_mortality_rate        0
high_bp_prevalence            0
copd_prevalence               0
kidney_d

In [34]:
merged_data.dtypes

Location                       object
Employment_2020                 int64
Employment_2021                 int64
Employment_2022                 int64
Inc_Per_Cap_2020                int64
Inc_Per_Cap_2021                int64
Inc_Per_CAp_2022                int64
Life_Exp_2020                 float64
Life_Exp_2019                 float64
Life_Exp_2018                 float64
Employer_2019                 float64
Non-Group_2019                float64
Medicaid_2019                 float64
Medicare_2019                 float64
Military_2019                 float64
Uninsured_2019                float64
Employer_2021                 float64
Non-Group_2021                float64
Medicaid_2021                 float64
Medicare_2021                 float64
Military_2021                 float64
Uninsured_2021                float64
Population Density per mi²    float64
2010 Population                 int64
2020 Population                 int64
Unnamed: 0                      int64
flu_vaccinat

In [27]:
merged_data['Active MO'] = pd.to_numeric(merged_data['Active MO'], errors='coerce').astype('Int64')

In [37]:
X = merged_data.drop(columns=['Location','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                             'Exc_count_2017', 'Exc_count_2018', 'Exc_count_2019', 
                             'Exc_count_2020', 'Exc_count_2021', 'Exc_count_2022', 'Active MO'])

y = merged_data['Exc_deaths_2021']

#'Exc_deaths_2018','Exc_deaths_2019','Exc_deaths_2020','Exc_deaths_2021','Exc_deaths_2022'

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [31]:
X_train.isna().sum()

Employment_2020                0
Employment_2021                0
Employment_2022                0
Inc_Per_Cap_2020               0
Inc_Per_Cap_2021               0
Inc_Per_CAp_2022               0
Life_Exp_2020                  0
Life_Exp_2019                  0
Life_Exp_2018                  0
Employer_2019                  0
Non-Group_2019                 0
Medicaid_2019                  0
Medicare_2019                  0
Military_2019                  0
Uninsured_2019                 0
Employer_2021                  0
Non-Group_2021                 0
Medicaid_2021                  0
Medicare_2021                  0
Military_2021                  0
Uninsured_2021                 0
Population Density per mi²     0
2010 Population                0
2020 Population                0
Unnamed: 0                     0
flu_vaccination_rate_2019      0
asthma_prevalence              0
cardiac_mortality_rate         0
high_bp_prevalence             0
copd_prevalence                0
kidney_dis

In [38]:
lr1 = LinearRegression()
lr1.fit(X_train, y_train)

In [39]:
lr1.score(X_train, y_train)

0.9999999999999957

In [40]:
lr1.score(X_test, y_test)

0.8684601692945261

In [41]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [42]:
lr2 = LinearRegression()
lr2.fit(X_train_sc, y_train)

In [43]:
lr2.score(X_train_sc, y_train), lr2.score(X_test_sc, y_test)

(1.0, 0.8993171705628464)

In [44]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn.score(X_train, y_train), knn.score(X_test, y_test)

(0.7014420451232843, 0.6354478147460688)

In [45]:
knn2 = KNeighborsRegressor()
knn2.fit(X_train_sc, y_train)
knn2.score(X_train_sc, y_train), knn2.score(X_test_sc, y_test)

(0.7686629512245433, 0.6135423398706363)

In [46]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree.score(X_train, y_train), tree.score(X_test, y_test)

(1.0, 0.8361170975065783)

In [47]:
tree2 = DecisionTreeRegressor()
tree2.fit(X_train_sc, y_train)
tree2.score(X_train_sc, y_train), tree2.score(X_test_sc, y_test)

(1.0, 0.747698492769266)

In [48]:
bag1 = BaggingRegressor()
bag1.fit(X_train , y_train)
bag1.score(X_train, y_train), bag1.score(X_test, y_test)

(0.862912331164615, 0.7551529437431134)

In [49]:
bag2 = BaggingRegressor()
bag2.fit(X_train_sc, y_train)
bag2.score(X_train_sc, y_train), bag2.score(X_test_sc, y_test)

(0.8817864141412134, 0.6969194613312981)

# Fixing NJ missing data:

-can inpute substitute numbers from 2018 or 2020 from raw data

asthma prevalence 2018 = 8.4
kidney prevalence 2019 = 52.1
flu_vaccination_rate_2019
high_bp_prevalence
copd prevalence
diabetes prevalence