# CDC Mortality Data 1968-2018

In [1]:
import pandas as pd 
import numpy as np 

The CDC Mortality Data is publicly available here:
https://www.cdc.gov/nchs/data_access/cmf.htm
The mortality data from 1968-1988 is housed in two zip files. In order to conduct my analysis, I had to download the zip files to my computer. Due to the CDC's privacy concerns, I am not comfortable publishing the text data to GitHub. The data is housed within text files that can be deciphered with the codes provided by the CDC in a PDF found on the webpage linked above. Mortality data from 1989-2018 can be found by using their WONDER API found here:
https://wonder.cdc.gov/mortSQL.html and 
https://wonder.cdc.gov/ucd-icd10.html

## Importing and Cleaning Data

In [2]:
# Build a record-level DataFrame of mortality data by US county for 1968-1978.
# Each line of the text file is treated as a fixed-width record, from which
# three character ranges are kept:
#   [0:5]  -> stored as 'FIPS'
#   [5:9]  -> stored as 'year'
#   [19:]  -> stored as 'deaths' (left padding removed below)
# All three columns are kept as strings here; numeric conversion happens later.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash-separated path only resolves on Windows.
file='Data/CDC_Wonder/mort6878/Mort6878.txt'
with open(file) as f:
    # Strip surrounding whitespace (including the newline) from every record
    # before slicing, so the offsets below apply to the trimmed line.
    content=[line.strip() for line in f]
FIPS=[record[0:5] for record in content]
year_list=[record[5:9] for record in content]
deaths=[record[19:] for record in content]
dfMort6878=pd.DataFrame({'FIPS':FIPS,'year':year_list,'deaths':deaths})
# The count field is left-padded inside the record; drop that padding.
dfMort6878['deaths']=dfMort6878['deaths'].str.lstrip()
dfMort6878.head(10)

Unnamed: 0,FIPS,year,deaths
0,1001,1968,1
1,1001,1968,2
2,1001,1968,1
3,1001,1968,2
4,1001,1968,1
5,1001,1968,1
6,1001,1968,1
7,1001,1968,1
8,1001,1968,1
9,1001,1968,1


In [3]:
# Sanity check: after lstrip, the first record's 'deaths' string should carry
# no padding, so its length is just the number of digits.
len(dfMort6878['deaths'].iloc[0])

1

In [4]:
# Report the row count, column dtypes and memory footprint of the 1968-1978 frame.
dfMort6878.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8774864 entries, 0 to 8774863
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   FIPS    object
 1   year    object
 2   deaths  object
dtypes: object(3)
memory usage: 200.8+ MB


In [5]:
# Build a record-level DataFrame of mortality data by US county for 1979-1988.
# Each line of the text file is treated as a fixed-width record, from which
# three character ranges are kept:
#   [0:5]  -> stored as 'FIPS'
#   [5:9]  -> stored as 'year'
#   [19:]  -> stored as 'deaths' (left padding removed below)
# All three columns are kept as strings here; numeric conversion happens later.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash-separated path only resolves on Windows.
file='Data/CDC_Wonder/mort7988/Mort7988.txt'
with open(file) as f:
    # Strip surrounding whitespace (including the newline) from every record
    # before slicing, so the offsets below apply to the trimmed line.
    content=[line.strip() for line in f]
FIPS=[record[0:5] for record in content]
year_list=[record[5:9] for record in content]
deaths=[record[19:] for record in content]
dfMort7988=pd.DataFrame({'FIPS':FIPS,'year':year_list,'deaths':deaths})
# The count field is left-padded inside the record; drop that padding.
dfMort7988['deaths']=dfMort7988['deaths'].str.lstrip()
dfMort7988.head(10)

Unnamed: 0,FIPS,year,deaths
0,1001,1979,1
1,1001,1979,1
2,1001,1979,1
3,1001,1979,1
4,1001,1979,2
5,1001,1979,1
6,1001,1979,1
7,1001,1979,1
8,1001,1979,1
9,1001,1979,1


In [6]:
# Report the row count, column dtypes and memory footprint of the 1979-1988 frame.
dfMort7988.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8776385 entries, 0 to 8776384
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   FIPS    object
 1   year    object
 2   deaths  object
dtypes: object(3)
memory usage: 200.9+ MB


In [7]:
# Aggregate the record-level 1968-1988 data to one row per (FIPS, year).
# The two period frames are stacked, the string 'year' and 'deaths' columns are
# cast to integers, and deaths are summed within each county-year.
stacked=pd.concat([dfMort6878,dfMort7988]).astype({'year':int,'deaths':int})
df6888=stacked.groupby(['FIPS','year']).sum()
# `df` is kept bound to the same aggregated frame as `df6888`.
df=df6888
df6888.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths
FIPS,year,Unnamed: 2_level_1
1001,1968,226
1001,1969,191
1001,1970,204
1001,1971,229
1001,1972,206
1001,1973,257
1001,1974,221
1001,1975,196
1001,1976,234
1001,1977,224


In [8]:
# Import each CDC Compressed Mortality data set from 1989-2016, one file per year.
# The raw text data files were initially cleaned up by hand.
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
df_list=[]
for yr in range(1989,2017):
    file='Data/CDC_Wonder/Compressed_Mortality/Compressed Mortality, '+str(yr)+' edit.txt'
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and splits on runs of whitespace the same way.
    yearly=pd.read_csv(file,sep=r'\s+')
    # Drop rows whose death count is a placeholder rather than a number.
    yearly=yearly.loc[~yearly['Deaths'].isin(['Missing','Suppressed'])]
    # Tag every row with its year. assign() returns a new frame with 'year'
    # appended as the last column, which avoids writing into a filtered slice
    # afterwards (the pattern that triggers SettingWithCopyWarning).
    df_list.append(yearly.assign(year=yr))

In [9]:
# Import the Multiple Cause of Death exports for 2017-2018 (one file per year)
# and add them to df_list alongside the earlier years.
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
df_list_short=[]
for yr in range(2017,2019):
    file='Data/CDC_Wonder/Multiple_Cause/Multiple Cause of Death, '+str(yr)+' edit.txt'
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and splits on runs of whitespace the same way.
    recent=pd.read_csv(file,sep=r'\s+')
    # Drop rows whose death count is a placeholder rather than a number.
    recent=recent.loc[~recent['Deaths'].isin(['Missing','Suppressed'])]
    # assign() returns a new frame with 'year' appended as the last column.
    df_list_short.append(recent.assign(year=yr))
# Re-running this cell must not add 2017-2018 a second time, because the later
# groupby().sum() would then double-count those deaths. Remove any frames for
# these years that a previous run already appended before adding the new ones.
df_list=[frame for frame in df_list if not frame['year'].isin([2017,2018]).any()]
df_list.extend(df_list_short)

In [10]:
# Combine the yearly 1989-2018 frames and aggregate to one row per (FIPS, year).
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; the original
# reset_index().drop(...) call discarded its result and therefore did nothing.
df=pd.concat(df_list,ignore_index=True)
df=df.drop(['County','Population','CrudeRate','Rel'], axis=1)
# Positional rename: relies on exactly three columns remaining after the drop,
# in the order county code, death count, year.
df.columns=['FIPS', 'deaths', 'year']
df['deaths']=df['deaths'].astype(int)
# Left-pad county codes to the 5-character form used by the 1968-1988 frames
# (presumably the leading zero was lost when the codes were parsed as numbers).
# Codes that are already 5 characters or longer are left unchanged.
df['FIPS']=df['FIPS'].astype(str).str.zfill(5)
df=df.groupby(['FIPS','year']).sum()
df8918=df
df8918.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths
FIPS,year,Unnamed: 2_level_1
1001,1989,259
1001,1990,304
1001,1991,283
1001,1992,310
1001,1993,309
1001,1994,354
1001,1995,298
1001,1996,345
1001,1997,328
1001,1998,372


In [11]:
# Stack the two aggregated periods (1968-1988 first, then 1989-2018) into a
# single frame indexed by (FIPS, year). Rows are appended, not re-sorted.
period_frames=(df6888,df8918)
df=pd.concat(period_frames)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths
FIPS,year,Unnamed: 2_level_1
1001,1968,226
1001,1969,191
1001,1970,204
1001,1971,229
1001,1972,206
1001,1973,257
1001,1974,221
1001,1975,196
1001,1976,234
1001,1977,224


In [12]:
# Report the index range, non-null count, dtype and memory footprint of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 158096 entries, ('01001', 1968) to ('56045', 2018)
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   deaths  158096 non-null  int32
dtypes: int32(1)
memory usage: 1.1+ MB


## Modeling