In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Collection

## Collecting Asthma Information

CSV files with information on asthma, diabetes, cancer, COPD, heart disease, and kidney disease incidence/prevalence rates were downloaded from the CDC website. Each CSV file is approximately 40 MB and contains responses to various questions, rates by demographic group, and overall population totals for more than a decade. Each dataset must be cleaned and unnecessary information dropped to reduce the files to a manageable size.

In [4]:
# Large csv files were uploaded into an untracked folder
asthma = pd.read_csv('Ignore/Asthma.csv')

In [5]:
asthma.shape

(80342, 33)

In [6]:
asthma.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


In [7]:
asthma.Question.value_counts()

Asthma mortality rate                                                                      13497
Current asthma prevalence among adults aged >= 18 years                                     9570
Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma        9570
Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma        9570
Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma     9570
Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma     9570
Hospitalizations for asthma                                                                 7812
Emergency department visit rate for asthma                                                  7608
Asthma prevalence among women aged 18-44 years                                              3575
Name: Question, dtype: int64

In [8]:
# Reducing data to only include responses to the question of interest:
# current asthma prevalence among adults aged >= 18 years.
asthma = asthma[asthma['Question']== "Current asthma prevalence among adults aged >= 18 years"]

In [9]:
asthma.shape

(9570, 33)

In [10]:
asthma.describe()

Unnamed: 0,YearStart,YearEnd,Response,DataValue,DataValueAlt,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory2,Stratification2,StratificationCategory3,Stratification3,ResponseID,LocationID,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
count,9570.0,9570.0,0.0,7662.0,7662.0,7662.0,7662.0,0.0,0.0,0.0,0.0,0.0,9570.0,0.0,0.0,0.0,0.0
mean,2016.0,2016.0,,10.195184,10.195184,8.039585,13.059867,,,,,,31.542529,,,,
std,3.162443,3.162443,,3.745644,3.745644,2.676902,5.999667,,,,,,18.26542,,,,
min,2011.0,2011.0,,1.9,1.9,1.1,3.3,,,,,,1.0,,,,
25%,2013.0,2013.0,,7.7,7.7,6.1,9.3,,,,,,17.0,,,,
50%,2016.0,2016.0,,9.6,9.6,8.0,11.4,,,,,,31.0,,,,
75%,2019.0,2019.0,,11.9,11.9,9.8,14.8,,,,,,45.0,,,,
max,2021.0,2021.0,,44.1,44.1,26.9,66.6,,,,,,78.0,,,,


In [11]:
# We only need data for pre-existing conditions for 1 year
asthma = asthma[asthma['YearStart'] == 2019]

In [12]:
asthma.shape

(870, 33)

In [13]:
asthma.YearStart.value_counts()

2019    870
Name: YearStart, dtype: int64

In [14]:
# Reducing dataframe to only include the 7 columns we need for analysis and data cleaning
asthma = asthma[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1', 'DataValueType']]

In [15]:
asthma.head()

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,DataValue,Stratification1,DataValueType
28919,2019,GU,Guam,Current asthma prevalence among adults aged >=...,7.5,Female,Age-adjusted Prevalence
28927,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,10.0,"White, non-Hispanic",Crude Prevalence
28932,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,12.6,Female,Age-adjusted Prevalence
28977,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,7.0,Male,Age-adjusted Prevalence
28983,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,7.0,"Other, non-Hispanic",Crude Prevalence


In [16]:
asthma.shape

(870, 7)

In [17]:
asthma['LocationDesc'].nunique()

55

In [18]:
# Reassign instead of mutating in place: `asthma` was produced by slicing another
# frame, and in-place edits on such a frame can raise SettingWithCopyWarning.
asthma = asthma.drop_duplicates()

In [19]:
asthma.shape

(870, 7)

In [20]:
# Reducing dataset to only include population totals/ dropping rates by race and gender demographics
asthma = asthma[asthma['Stratification1'] == 'Overall']

## Creating a function

In [21]:
#cancer = pd.read_csv('Ignore/Cancer.csv')
#cancer.shape

In [22]:
#cancer['Question'].value_counts()

In [23]:
#cancer['YearStart'].value_counts()

In [24]:
#Reducing Cancer to just total cancer
#cancer.Question.value_counts()

#cancer = cancer[cancer['Question']== 'Invasive cancer (all sites combined), incidence']

#cancer.shape

In [25]:
#cancer.YearStart.value_counts()

*For the question of interest, total cancer rates, we do not have data for 2019 or 2020. The most recent year is 2016, which is too long ago to be used as an accurate estimate of pre-existing cancer rates.*

## Reading in other csvs 

In [26]:
heart = pd.read_csv('Ignore/Cardiovascular.csv')

In [27]:
# Splitting cardiac dataset into two frames by question:
#   - mortality from total cardiovascular diseases
#   - awareness of high blood pressure among adults aged >= 18 years
cardiac_mortality = heart[heart['Question'] == 'Mortality from total cardiovascular diseases']
print(cardiac_mortality.shape)

high_blood_pressure = heart[heart['Question']== 'Awareness of high blood pressure among adults aged >= 18 years']
print(high_blood_pressure.shape)

(13497, 33)
(5220, 33)


In [28]:
copd = pd.read_csv('Ignore/COPD.csv')
copd.shape

(152874, 33)

In [29]:
copd.Question.value_counts()

copd = copd[copd['Question']== 'Prevalence of chronic obstructive pulmonary disease among adults >= 18']

copd.shape

(9570, 33)

In [30]:
diabetes = pd.read_csv('Ignore/Diabetes.csv')
diabetes.shape

(156808, 33)

In [31]:
diabetes['Question'].value_counts()

diabetes = diabetes[diabetes['Question']== 'Prevalence of diagnosed diabetes among adults aged >= 18 years']

diabetes.shape

(9570, 33)

In [32]:
# NOTE(review): unlike the other datasets, no Question filter is visible for `immun`
# (execution count 33 is absent from this notebook), yet cleaning(immun) later reports
# an input shape of (9570, 33), the same size as the other question-filtered frames.
# Presumably a filter cell was deleted; confirm, and restore it so that
# Restart & Run All reproduces the recorded output.
immun = pd.read_csv('Ignore/Immunization.csv')

In [34]:
kidney = pd.read_csv('Ignore/Kidney.csv')

In [35]:
kidney['Question'].value_counts()

kidney = kidney[kidney['Question']== 'Prevalence of chronic kidney disease among adults aged >= 18 years']

kidney.shape

(9570, 33)

## Defining cleaning function:
Following the steps used on the asthma csv as each dataset is structured the same

In [36]:
def cleaning(df, year=2019):
    """Reduce an indicator DataFrame to one year's 'Overall' rows.

    Follows the same steps applied manually to the asthma data: keep the
    rows for ``year``, keep the seven analysis columns, then keep only the
    rows whose ``Stratification1`` is ``'Overall'``. The shape is printed
    before filtering, after the year filter, and after column selection so
    each step can be checked in the notebook output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'YearStart', 'LocationAbbr',
        'LocationDesc', 'Question', 'DataValue', 'Stratification1' and
        'DataValueType'.
    year : int, default 2019
        Value of 'YearStart' to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows (original index labels are
        preserved). The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    keep_columns = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Question',
                    'DataValue', 'Stratification1', 'DataValueType']

    print(df.shape)
    df = df[df['YearStart'] == year]
    print(df.shape)
    df = df[keep_columns]
    print(df.shape)

    # Return a copy so that later in-place edits by callers (for example
    # drop_duplicates(inplace=True)) act on an independent frame rather than
    # on a slice of the source data.
    return df[df['Stratification1'] == 'Overall'].copy()
               

In [None]:
#cancer = cancer[cancer['YearStart'] >= 2016]
#cancer = cancer[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1']]                 

In [37]:
cardiac_mortality = cleaning(cardiac_mortality)

(13497, 33)
(1227, 33)
(1227, 7)


In [38]:
high_blood_pressure = cleaning(high_blood_pressure)

(5220, 33)
(870, 33)
(870, 7)


In [39]:
copd = cleaning(copd)

(9570, 33)
(870, 33)
(870, 7)


In [40]:
diabetes = cleaning(diabetes)

(9570, 33)
(870, 33)
(870, 7)


In [41]:
immun = cleaning(immun)

(9570, 33)
(870, 33)
(870, 7)


In [42]:
kidney = cleaning(kidney)

(9570, 33)
(870, 33)
(870, 7)


In [None]:
# I now have 7 dataframes. Some of these need to be filtered by question
# asthma, cancer, heart, copd, diabetes, immun, kidney
# I have the overall rates as well as rates by gender & race

In [43]:
#Examining the size of each dataset to ensure cleaning function worked

print(asthma.shape)
#print(cancer.shape)
print(cardiac_mortality.shape)
print(high_blood_pressure.shape)
print(copd.shape)
print(diabetes.shape)
print(immun.shape)
print(kidney.shape)

(110, 7)
(156, 7)
(110, 7)
(110, 7)
(110, 7)
(110, 7)
(110, 7)


In [44]:
# Cardiac_mortality is larger: ensuring there are no duplicate values
# Reassign rather than mutate in place, so the cell does not modify a frame
# that was derived by filtering another DataFrame.
cardiac_mortality = cardiac_mortality.drop_duplicates()

In [45]:
cardiac_mortality.shape

(156, 7)

In [46]:
asthma.head()

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,DataValue,Stratification1,DataValueType
29319,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,9.7,Overall,Age-adjusted Prevalence
29673,2019,AK,Alaska,Current asthma prevalence among adults aged >=...,9.9,Overall,Crude Prevalence
30178,2019,AL,Alabama,Current asthma prevalence among adults aged >=...,9.4,Overall,Age-adjusted Prevalence
30473,2019,AL,Alabama,Current asthma prevalence among adults aged >=...,9.5,Overall,Crude Prevalence
31698,2019,AR,Arkansas,Current asthma prevalence among adults aged >=...,9.3,Overall,Age-adjusted Prevalence


In [47]:
asthma['Stratification1'].value_counts()

Overall    110
Name: Stratification1, dtype: int64

In [49]:
asthma.DataValueType.value_counts()

Age-adjusted Prevalence    55
Crude Prevalence           55
Name: DataValueType, dtype: int64

In [50]:
copd.DataValueType.value_counts()

Age-adjusted Prevalence    55
Crude Prevalence           55
Name: DataValueType, dtype: int64

In [51]:
cardiac_mortality.DataValueType.value_counts()

Number               52
Crude Rate           52
Age-adjusted Rate    52
Name: DataValueType, dtype: int64

In [52]:
def adj(df):
    """Return the age-adjusted rows of ``df`` without the helper columns.

    Rows are kept when 'DataValueType' is 'Age-adjusted Prevalence' or
    'Age-adjusted Rate'. The second label is the one used by the cardiac
    mortality data; matching only the 'Prevalence' label would silently
    return an empty frame for that dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'DataValueType', 'LocationAbbr' and
        'Stratification1' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows and with 'LocationAbbr',
        'DataValueType' and 'Stratification1' removed. ``df`` is not modified.
    """
    adjusted_labels = ('Age-adjusted Prevalence', 'Age-adjusted Rate')
    is_adjusted = df['DataValueType'].isin(adjusted_labels)
    # Boolean indexing followed by drop() yields a new frame, so no in-place
    # mutation of the caller's data occurs.
    return df[is_adjusted].drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'])

def crude(df):
    """Return the crude (non age-adjusted) rows of ``df`` without the helper columns.

    Rows are kept when 'DataValueType' is 'Crude Prevalence' or 'Crude Rate'.
    The second label is the one used by the cardiac mortality data; matching
    only the 'Prevalence' label would silently return an empty frame for
    that dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'DataValueType', 'LocationAbbr' and
        'Stratification1' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows and with 'LocationAbbr',
        'DataValueType' and 'Stratification1' removed. ``df`` is not modified.
    """
    crude_labels = ('Crude Prevalence', 'Crude Rate')
    is_crude = df['DataValueType'].isin(crude_labels)
    # Boolean indexing followed by drop() yields a new frame, so no in-place
    # mutation of the caller's data occurs.
    return df[is_crude].drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'])

In [612]:
#adj_asthma = adj(asthma)
#crude_asthma = crude(asthma)

#adj_asthma.to_csv('Data/asthma_adj.csv')
#crude_asthma.to_csv('Data/asthma_crude.csv')

In [613]:
#Don't keep re-running this cell
def process_dataframes(dataframe_list):
    """Split every named DataFrame into its age-adjusted and crude subsets.

    Parameters
    ----------
    dataframe_list : dict
        Mapping of dataset name to DataFrame (a dict, despite the name).

    Returns
    -------
    dict
        ``{name: {'adj': adj(df), 'crude': crude(df)}}`` with one entry per
        input item, in the input's iteration order.
    """
    return {
        label: {'adj': adj(frame), 'crude': crude(frame)}
        for label, frame in dataframe_list.items()
    }

# Map each dataset name to its cleaned DataFrame (a dict, despite the variable name)
dataframe_list = {
    'copd': copd,
    'asthma': asthma,
    'high_blood_pressure': high_blood_pressure,
    'cardiac_mortality': cardiac_mortality,
    'diabetes': diabetes,
    'immun': immun,
    'kidney': kidney
}
# Build {name: {'adj': ..., 'crude': ...}} for every DataFrame in the mapping
results = process_dataframes(dataframe_list)

# Save the results to CSV
#for name, result in results.items():
   # result['adj'].to_csv(f'Data/{name}_adj.csv')
   # result['crude'].to_csv(f'Data/{name}_crude.csv')

# Data Cleaning

### Data Cleaning Goals:

1. Identify any issues with each dataframe
2. If any missing values- fill with an appropriate placeholder/ check documentation
3. Rename variables appropriately
    - Location
    - variable of interest
4. Drop unnecessary columns
    - year
    - variable descriptions
    - demographics (initially)
    - abbreviations
5. Drop unnecessary rows
    - Keep 50 states
    - keep total us
    - Maybe keep PR or DC if present in all datasets
6. Calculate any necessary values -> convert total numbers to rates per population
7. Check datatypes
8. Date/ Time variables?
9. Reformat dataframes if necessary
10. Concatenate/ merge as necessary

#### For 2 insurance datasets: 
- add year as prefix/ suffix to each column 
- drop year column

In [55]:
insur_2019 = pd.read_csv('Data/2019_insurance.csv')

In [56]:
insur_2019.head()

Unnamed: 0,Location,Employer,Non-Group,Medicaid,Medicare,Military,Uninsured,Year
0,United States,0.496,0.059,0.198,0.142,0.014,0.092,2019
1,Alabama,0.472,0.055,0.195,0.16,0.021,0.097,2019
2,Alaska,0.484,0.035,0.213,0.1,0.053,0.115,2019
3,Arizona,0.451,0.052,0.21,0.161,0.015,0.111,2019
4,Arkansas,0.42,0.054,0.262,0.159,0.014,0.091,2019


In [57]:
insur_2021 = pd.read_csv('Data/2021_insurance.csv')

In [None]:
# Preview the 2021 insurance data, mirroring the check done for insur_2019.
# (The original cell referenced the undefined name `insur`, which raises
# NameError under Restart & Run All.)
insur_2021.head()

#### For pre-existing conditions:
1. choose crude or adj
2. drop old index
3. rename data value to question
4. drop year and question columns
5. For total number of incident data -> after concatenating with population data, convert to rate

#### Income per capita:
1. Rename year columns
2. Rename Geoname to Location
3. Drop geofips column


#### Life Expectancy:
1. Convert abbreviated state names to unabbreviated name (Rename column name to match and then merge with Mask Mandates on abbreviations and then drop abbreviations)
2. Rename State to Location
3. Rename rate to average life expectancy
4. Drop url and year

#### Mask Mandate:
1. After using abbreviation to match with states for life expectancy drop abbrev
2. Rename state_name to Location
3. Rename mandatory to mask_mandate

#### Population Data:
1. Drop index column
2. Rename State to Location

#### Population Density:
1. Drop index column
2. Rename State to Location

#### Total Employment:
1. Rename columns with values of row 1
2. Rename year columns to total_employment_year
3. Rename GeoName to Location
4. Drop GeoFips