In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show (up to) 1000 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 1000)

In [149]:
## Date for the model
# Used as a column label when selecting case counts (dfcv[cases_date]), so it
# must be written with underscores, matching the case-count column names after
# their '/' separators are replaced with '_'.
cases_date = '5_14_20'

#### Read coronavirus total cases data

In [150]:
# Load the raw case-count table.
dfcv = pd.read_csv('../files/usafacts_cases2.csv')

# Every column from position 4 onward is treated as a date column; its label
# gets '/' swapped for '_' (same format as `cases_date`).
dates = dfcv.columns[4:]
changed_dates = dict(zip(dates, dates.str.replace('/', '_', regex=False)))

# Rename the date columns, zero-pad the FIPS code to a 5-character string,
# and use it as the index.
dfcv = (
    dfcv
    .rename(columns=changed_dates)
    .assign(countyFIPS=lambda frame: frame['countyFIPS'].map('{:05d}'.format))
    .set_index('countyFIPS')
)

#### Prepare dataframe with cases and census data

In [151]:
# Census table; its first CSV column becomes the index.
df = pd.read_csv('../files/modeling_data/census_data.csv', index_col=0)

# Zero-pad the index to 5-character strings so it lines up with dfcv's index.
df.index = df.index.map('{:05d}'.format)

# Single-column frame holding the case counts for the modelling date.
dft = dfcv[cases_date].to_frame()

# Left-join the case counts onto the census rows (index-on-index).
df = df.join(dft)

# Convert raw counts into cases per 100,000 residents.
df[cases_date] = df[cases_date].mul(100_000).div(df['TOT_POP'])

# Name the target column 'CPC', then move the index out into a 'FIPS' column.
df = (
    df
    .rename(columns={cases_date: 'CPC'})
    .reset_index()
    .rename(columns={'index': 'FIPS'})
)

Drop Washington D.C.

In [152]:
# Keep only rows whose state name is not the District of Columbia
is_dc = df['STNAME'] == 'District of Columbia'
df = df[~is_dc]

# Number of distinct state names left
len(df['STNAME'].unique())

50

### Deal with missing values

In [155]:
# Names of the columns that contain at least one null value
[name for name, has_null in df.isnull().any().items() if has_null]

['Civilian_labor_force_2018',
 'Employed_2018',
 'Unemployed_2018',
 'Median_Household_Income_2018',
 'Rural_urban_continuum_code_2013',
 'Urban_influence_code_2013',
 'Metro_2013']

In [156]:
# Map each "County, State" label to the list of columns that are null for it.
# The state is read from the same row as the county name: county names repeat
# across states, so looking the state up by county name alone can attach the
# wrong state and merge distinct counties under a single key.
counties_with_missing = dict()
for col in df.columns:
    missing_mask = df[col].isnull()
    if not missing_mask.any():
        continue
    # County/state pairs for exactly the rows where this column is null
    rows_missing = df.loc[missing_mask, ['CTYNAME', 'STNAME']]
    for county, state in zip(rows_missing['CTYNAME'], rows_missing['STNAME']):
        county_state = f'{county}, {state}'
        counties_with_missing.setdefault(county_state, []).append(col)

In [157]:
# Display the "County, State" -> list-of-null-columns mapping
counties_with_missing

{'Kalawao County, Hawaii': ['Civilian_labor_force_2018',
  'Employed_2018',
  'Unemployed_2018',
  'Median_Household_Income_2018',
  'Rural_urban_continuum_code_2013',
  'Urban_influence_code_2013',
  'Metro_2013']}

In [158]:
# Number of distinct "County, State" keys with at least one null column
len(counties_with_missing)

1

There were originally only three counties with missing values. We manually looked up and inserted the land area values for the two counties with missing land areas, so the only county that still has missing values is Kalawao County, Hawaii, which we can drop.

In [None]:
# Remove the one county that still has null values
df = df[df['CTYNAME'] != 'Kalawao County']

### Modeling

In [167]:
# Features: every column except the target.
X = df.drop(columns='CPC')
# Target: cases per capita (the CPC column).
y = df['CPC']

In [168]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed so it is repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

### Categorical variables

In [173]:
# REGION is a categorical code, so it should be one-hot encoded; list its levels
df['REGION'].unique()

array([3, 4, 1, 2])

In [188]:
# Ordinal code columns: the values can stay as they are, but they should be
# stored as integers rather than floats. Print the sorted distinct levels of each.
for ordinal_col in ('Rural_urban_continuum_code_2013',
                    'Urban_influence_code_2013',
                    'Metro_2013'):
    print(df[ordinal_col].sort_values().unique())

[1. 2. 3. 4. 5. 6. 7. 8. 9.]
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
[0. 1.]


We can first try dropping the state name, and then try one-hot encoding it instead.