In [53]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pltlab
import seaborn as sns
import calendar
import geopandas as gpd
import censusdata
import warnings
import pipeline as pipe

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
warnings.filterwarnings('ignore')

%matplotlib inline

sns.set(rc={'figure.figsize':(11, 4)})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
data = pd.read_pickle('data/final_dataset.pk1')

In [55]:
pipe.find_outliers(data, 'median_income', 0, 100000000000000000000000)

Number of outliers found: 0
Outlier values found: []


Unnamed: 0,name,total_pop,median_income,state,county,county_name,state_name,prop_white,prop_black,prop_hisp,...,Testing_Rate,gov_party,election_diff,Apr-19,Mar-19,Feb-20,Mar-20,days_closed,yearly_change,monthly_change


In [56]:
data.head()

Unnamed: 0,name,total_pop,median_income,state,county,county_name,state_name,prop_white,prop_black,prop_hisp,...,Testing_Rate,gov_party,election_diff,Apr-19,Mar-19,Feb-20,Mar-20,days_closed,yearly_change,monthly_change
0,"Washington County, Mississippi",47086,30834.0,28,151,Washington,Mississippi,0.256913,0.721701,0.015482,...,2291.374085,1,-0.364695,6.4,7.5,7.4,7.0,17.0,-0.066667,-0.054054
1,"Perry County, Mississippi",12028,39007.0,28,111,Perry,Mississippi,0.787745,0.196874,0.015048,...,2291.374085,1,0.536635,5.7,6.2,6.8,6.3,17.0,0.016129,-0.073529
2,"Choctaw County, Mississippi",8321,37203.0,28,19,Choctaw,Mississippi,0.676722,0.311982,0.003966,...,2291.374085,1,0.386224,4.8,5.3,5.4,5.0,17.0,-0.056604,-0.074074
3,"Itawamba County, Mississippi",23480,40510.0,28,57,Itawamba,Mississippi,0.909114,0.071593,0.015332,...,2291.374085,1,0.755161,4.1,4.4,4.7,4.7,17.0,0.068182,0.0
4,"Carroll County, Mississippi",10129,43060.0,28,15,Carroll,Mississippi,0.643992,0.345839,0.002863,...,2291.374085,1,0.383321,5.5,5.8,6.2,6.0,17.0,0.034483,-0.032258


In [57]:
pipe.get_summary_stats(data)

          total_pop  median_income   prop_white   prop_black    prop_hisp  \
count  2.826000e+03    2825.000000  2826.000000  2826.000000  2826.000000   
mean   1.117590e+05   51714.241416     0.827329     0.098748     0.091962   
std    3.414925e+05   13821.224638     0.164131     0.149708     0.135249   
min    4.180000e+02   20188.000000     0.093534     0.000000     0.000000   
25%    1.412200e+04   42491.000000     0.757614     0.008110     0.021500   
50%    3.013550e+04   49936.000000     0.889050     0.028182     0.041568   
75%    7.725500e+04   57886.000000     0.947572     0.117748     0.095686   
max    1.009805e+07  136268.000000     0.997743     0.874123     0.990688   

       log_med_income  prop_no_internet      prop_ba  prop_services  \
count     2825.000000       2826.000000  2826.000000    2825.000000   
mean        10.821023          0.225514     0.217787       0.083571   
std          0.251809          0.086132     0.096556       0.034479   
min          9.912844 

In [58]:
data.dtypes

name                 object
total_pop             int64
median_income       float64
state                object
county               object
county_name          object
state_name           object
prop_white          float64
prop_black          float64
prop_hisp           float64
log_med_income      float64
prop_no_internet    float64
prop_ba             float64
prop_services       float64
pop_density         float64
FIPS                float64
covid_cases           int64
Testing_Rate        float64
gov_party             int64
election_diff       float64
Apr-19              float64
Mar-19              float64
Feb-20              float64
Mar-20              float64
days_closed         float64
yearly_change       float64
monthly_change      float64
dtype: object

In [51]:
# Drop attributes that are not used as features or targets
data = data.drop(['Apr-19','Mar-20','Feb-20','Mar-20','median_income',
                  'monthly_change','FIPS','name','state_name','county_name','county'], axis=1)

data = pipe.hot_encode(data, ['state'])

# # WITHOUT STATES
# data = data.drop(['Apr-19','Mar-20','Feb-20','Mar-20','median_income',
#                   'monthly_change','FIPS','name','state_name','county_name','county','state'], axis=1)

In [52]:
train, test = train_test_split(data, random_state=246012349, test_size=0.2)
print(len(train))
print(len(test))

2260
566


In [29]:
numeric_cols = train.select_dtypes(include=['float64','uint8']).columns #Normalizes state one-hots as well
# numeric_cols = train.select_dtypes(include=['float64']).columns
train, test = pipe.impute_missing(train, test, numeric_cols)

total_pop           int64
prop_white        float64
prop_black        float64
prop_hisp         float64
log_med_income    float64
                   ...   
state_51            uint8
state_53            uint8
state_54            uint8
state_55            uint8
state_56            uint8
Length: 67, dtype: object


In [30]:
train, test = pipe.normalize(train, test, numeric_cols)

In [42]:
train.describe()

count    2.260000e+03
mean    -6.008714e-16
std      1.000221e+00
min     -1.254206e-01
25%     -1.254206e-01
50%     -1.254206e-01
75%     -1.254206e-01
max      7.973169e+00
Name: state_46, dtype: float64

In [33]:
train_features = train.drop('yearly_change', axis=1)
test_features = test.drop('yearly_change', axis=1)

train_target = train['yearly_change']
test_target = test['yearly_change']

### Model Training

In [34]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Create a Pipeline.
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=1,include_bias=False)),
    ('ridge', Ridge(alpha=0.1)),
])


from sklearn.model_selection import GridSearchCV

alpha_range = np.arange(0,1,0.1)
params = {
          'ridge__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model = GridSearchCV(estimator=pipeline, 
                          param_grid=params, 
                          cv=5,
                          return_train_score=True,
                          scoring='neg_mean_squared_error',
                          iid=True)

grid_model_result = grid_model.fit(train_features, train_target)

In [35]:
grid_model_result.best_estimator_

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge', Ridge(alpha=0.9))])

In [36]:
predicted = grid_model.predict(test_features)
mse = mean_squared_error(test_target, predicted)
r2 = r2_score(test_target, predicted)
print(mse, r2)

0.20075049730929043 0.7851097487368053


In [37]:
n, p = train_features.shape
pipe.adj_r2(r2, n, p)

0.7786424634730703

In [38]:
grid_model_result.best_estimator_.steps[-1][1].coef_

array([ 1.49951482e-07, -4.81754614e-03, -2.31001762e-02,  1.41674743e-02,
       -5.02737055e-02,  4.95419675e-02, -2.54717028e-02,  5.84146322e-03,
       -2.20477025e-02,  2.73498842e-06, -2.94047757e-02,  2.89125703e-02,
       -4.39093758e-03, -7.50650926e-02, -3.52564525e-02, -1.03913424e-01,
        0.00000000e+00,  2.31439535e-04,  1.23795321e-01,  6.10249616e-02,
        2.80635381e-01, -2.34896619e-02,  2.97921073e-02,  2.47118739e-03,
        1.00094842e-01,  8.31279999e-02, -4.70583879e-02, -1.46146643e-01,
       -2.88075667e-01, -1.65210217e-01,  4.66529662e-02, -1.56691749e-01,
        1.04034023e-01,  2.46412211e-01, -4.05827670e-02, -5.96808108e-02,
       -1.70445775e-02, -1.82450932e-01, -8.95259161e-02, -9.30730224e-02,
        5.57909330e-02, -6.58645216e-02,  9.47578877e-02,  1.15781737e-01,
       -3.95791026e-02, -1.34545647e-02,  3.78479884e-02,  1.82354814e-02,
       -5.15609794e-02, -9.70836656e-02,  1.28683529e-01, -1.85738171e-01,
       -1.22378072e-01,  

In [44]:
features = np.array(train_features.columns).reshape(-1,1)
coefficients = grid_model_result.best_estimator_.steps[-1][1].coef_.reshape(-1,1)

importance = pd.DataFrame(np.concatenate((features, coefficients), axis=1))
importance['abs'] = [abs(x) for x in importance[1]]
importance.sort_values('abs', ascending=False).head(60)

(66, 1)
(66, 1)


Unnamed: 0,0,1,abs
28,state_17,-0.288076,0.288076
20,state_08,0.280635,0.280635
58,state_48,0.269358,0.269358
33,state_22,0.246412,0.246412
53,state_42,0.213024,0.213024
51,state_40,-0.185738,0.185738
37,state_26,-0.182451,0.182451
29,state_18,-0.16521,0.16521
31,state_20,-0.156692,0.156692
27,state_16,-0.146147,0.146147
