## This code takes intermediate CSVs transcribed from PDFs and puts them in a consistent format

In [1]:
import pandas as pd

# Read the per-year CSVs that were extracted from the PDFs with Tabula and
# then corrected by hand.
years = [2015, 2016, 2017, 2018]
csvs = {}
dfs = {}
for year in years:
    csvs[year] = f'Intermediate CSVs/tabula-cases-by-county-{year}.csv'
    dfs[year] = pd.read_csv(csvs[year])

# Transpose each table, promote its first row to the column header (with
# surrounding whitespace stripped), and then remove that header row from the data.
for year in years:
    flipped = dfs[year].T
    header_row = flipped.iloc[0]
    flipped.columns = header_row.map(str.strip)
    dfs[year] = flipped.drop(flipped.index[0])

In [2]:
# Sanity check: for every disease, the "Arizona" row should equal the sum of
# all remaining rows. Prints the per-disease difference for each year.
for year in years:
    table = dfs[year]
    statewide_row = table.loc['Arizona']
    summed_rest = table.drop(table.index[0]).sum()
    print(year, statewide_row - summed_rest)

2015 Disease
Measles                                                            0
Mumps                                                              0
Pertussis                                                          0
Pertussis confirmed cases                                          0
Rubella                                                            0
Congenital Rubella Syndrome                                        0
Haemophilus influenzae  type B (invasive, age < 5 years)           0
Varicella (chickenpox)                                             0
Aseptic Meningitis                                                 0
Meningococcal Disease                                              0
Viral Encephalitis                                                 0
Amebiasis                                                          0
Campylobacteriosis                                                 0
Cholera                                                            0
Cryptosporidiosis    

The statewide total for Mumps in 2017 is off by one, an error in the original data set. We'll ignore this error and drop statewide totals.

In [3]:
# Remove the statewide "Arizona" totals row so only the other rows remain.
# The row is dropped by label, and a missing label is tolerated, so this cell
# is idempotent: dropping `index[0]` by position would silently remove a
# different row every time the cell is re-run after the totals row is gone.
for year in years:
    dfs[year] = dfs[year].drop(index='Arizona', errors='ignore')

Now let's produce output CSVs for the diseases we are tracking across multiple states.

In [4]:
# Maps the short disease name used in output file names to the column label
# used for that disease in the yearly tables.
diseases = {
    'STEC': 'E. coli, Shiga toxin-producing',
    'Campylobacteriosis': 'Campylobacteriosis',
    'Salmonellosis': 'Salmonellosis (except S. Typhi and S. Paratyphi)',
}

# For each disease, build one table with a COUNT column per year followed by a
# RATE column per year. Every RATE cell is filled with the placeholder '-'.
output_dfs = {}
for disease, column_name in diseases.items():
    yearly_counts = [dfs[year][column_name] for year in years]
    yearly_placeholders = [counts.apply(lambda _: '-') for counts in yearly_counts]

    count_block = pd.concat(yearly_counts, axis=1)
    rate_block = pd.concat(yearly_placeholders, axis=1)
    combined = pd.concat([count_block, rate_block], axis=1)

    # All COUNT labels come first, then all RATE labels, matching the block order.
    combined.columns = [f'{year} {kind}' for kind in ('COUNT', 'RATE') for year in years]
    combined.index.name = 'COUNTY'
    output_dfs[disease] = combined

In [5]:
# Write one CSV per disease, named AZ_<disease>.csv, into the working directory.
for disease_name in diseases.keys():
    destination = f'AZ_{disease_name}.csv'
    output_dfs[disease_name].to_csv(destination)