In [1]:
### Before we submit, review which modules we actually use !!!

%load_ext autoreload
%autoreload 2

import warnings
import calendar

import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.pylab as pltlab

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

import pipeline as pipe
pd.set_option('display.max_columns', None)

# warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(rc={'figure.figsize':(11, 4)})

# Data Import & Exploration

In [2]:
# Load the prepared county-level dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
# NOTE(review): the extension is '.pk1' (digit one), not '.pkl' -- confirm it matches the file on disk.
data = pd.read_pickle('data/final_dataset.pk1')

In [3]:
# Screen median_income with the project helper, which reports how many
# outliers it found and their values.
# NOTE(review): presumably 0 and 10,000,000 are the lower/upper bounds of the
# accepted range -- confirm against pipe.find_outliers.
pipe.find_outliers(data, 'median_income', 0, 10_000_000)

Number of outliers found: 0
Outlier values found: []


Unnamed: 0,name,total_pop,median_income,state,county,county_name,state_name,prop_white,prop_black,prop_hisp,log_med_income,prop_no_internet,prop_ba,prop_services,pop_density,FIPS,covid_cases,Testing_Rate,gov_party,election_diff,Apr-19,Mar-19,Feb-20,Mar-20,days_closed,yearly_change,monthly_change


In [15]:
# Preview the first rows to sanity-check the columns and their values.
data.head()

Unnamed: 0,name,total_pop,median_income,state,county,county_name,state_name,prop_white,prop_black,prop_hisp,log_med_income,prop_no_internet,prop_ba,prop_services,pop_density,FIPS,covid_cases,Testing_Rate,gov_party,election_diff,Apr-19,Mar-19,Feb-20,Mar-20,days_closed,yearly_change,monthly_change
0,"Washington County, Mississippi",47086,30834.0,28,151,Washington,Mississippi,0.256913,0.721701,0.015482,10.336373,0.336958,0.187937,0.100274,62.178441,28151.0,77,2291.374085,1,-0.364695,6.4,7.5,7.4,7.0,17.0,-0.066667,-0.054054
1,"Perry County, Mississippi",12028,39007.0,28,111,Perry,Mississippi,0.787745,0.196874,0.015048,10.571496,0.310103,0.109539,0.07885,18.434583,28111.0,27,2291.374085,1,0.536635,5.7,6.2,6.8,6.3,17.0,0.016129,-0.073529
2,"Choctaw County, Mississippi",8321,37203.0,28,19,Choctaw,Mississippi,0.676722,0.311982,0.003966,10.524145,0.368837,0.176582,0.014207,19.794546,28019.0,13,2291.374085,1,0.386224,4.8,5.3,5.4,5.0,17.0,-0.056604,-0.074074
3,"Itawamba County, Mississippi",23480,40510.0,28,57,Itawamba,Mississippi,0.909114,0.071593,0.015332,10.609304,0.226281,0.134468,0.057524,44.138916,28057.0,57,2291.374085,1,0.755161,4.1,4.4,4.7,4.7,17.0,0.068182,0.0
4,"Carroll County, Mississippi",10129,43060.0,28,15,Carroll,Mississippi,0.643992,0.345839,0.002863,10.67035,0.332969,0.145006,0.057415,15.774563,28015.0,38,2291.374085,1,0.383321,5.5,5.8,6.2,6.0,17.0,0.034483,-0.032258


In [4]:
# Summary statistics for the loaded data (count, mean, std, min, quartiles and max per column).
pipe.get_summary_stats(data)

          total_pop  median_income   prop_white   prop_black    prop_hisp  \
count  2.826000e+03    2825.000000  2826.000000  2826.000000  2826.000000   
mean   1.117590e+05   51714.241416     0.827329     0.098748     0.091962   
std    3.414925e+05   13821.224638     0.164131     0.149708     0.135249   
min    4.180000e+02   20188.000000     0.093534     0.000000     0.000000   
25%    1.412200e+04   42491.000000     0.757614     0.008110     0.021500   
50%    3.013550e+04   49936.000000     0.889050     0.028182     0.041568   
75%    7.725500e+04   57886.000000     0.947572     0.117748     0.095686   
max    1.009805e+07  136268.000000     0.997743     0.874123     0.990688   

       log_med_income  prop_no_internet      prop_ba  prop_services  \
count     2825.000000       2826.000000  2826.000000    2825.000000   
mean        10.821023          0.225514     0.217787       0.083571   
std          0.251809          0.086132     0.096556       0.034479   
min          9.912844 

After reviewing the summary statistics, we confirmed that median income is non-negative. We also investigated the potential outlier in population density and confirmed that it corresponds to New York City.

In [5]:
# Column dtypes: the float64 / int64 / object split matters later, because the
# modelling cells pick the columns to impute and normalize by dtype.
data.dtypes

name                 object
total_pop             int64
median_income       float64
state                object
county               object
county_name          object
state_name           object
prop_white          float64
prop_black          float64
prop_hisp           float64
log_med_income      float64
prop_no_internet    float64
prop_ba             float64
prop_services       float64
pop_density         float64
FIPS                float64
covid_cases           int64
Testing_Rate        float64
gov_party             int64
election_diff       float64
Apr-19              float64
Mar-19              float64
Feb-20              float64
Mar-20              float64
days_closed         float64
yearly_change       float64
monthly_change      float64
dtype: object

# Model Fitting & Evaluation

In [6]:
# Set up Pipeline: polynomial feature expansion followed by ridge regression.
# The alpha given to Ridge here is only a placeholder; the grid search below
# sets both ridge__alpha and poly__degree.
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ridge', Ridge(alpha=0.1))
])

# Candidate regularization strengths 0.0, 0.1, ..., 0.9 (np.arange excludes the stop value).
# NOTE(review): every fitted section in this notebook reports alpha=0.9, the
# top of this range -- consider extending the grid upwards.
alpha_range = np.arange(0, 1, 0.1)
params = {
    'ridge__alpha': alpha_range,
    'poly__degree': (1, 2),
}

# 5-fold cross-validated search scored by negative MSE (higher is better).
# The `iid` argument is deliberately not passed: scikit-learn deprecated it in
# 0.22 and removed it in 0.24, where passing it raises a TypeError.
grid_model = GridSearchCV(estimator=pipeline,
                          param_grid=params,
                          cv=5,
                          return_train_score=True,
                          scoring='neg_mean_squared_error')

## Yearly Data

### With States

In [7]:
# Select variables from full dataframe: drop the dated columns other than
# Mar-19, the untransformed median_income (log_med_income is kept), the
# monthly target and the identifier columns. Each label is listed once.
# NOTE(review): 'Mar-19' is not dropped and therefore remains a feature --
# confirm this is intended.
df_yr_st = data.drop(columns=['Apr-19', 'Feb-20', 'Mar-20', 'median_income',
                              'monthly_change', 'FIPS', 'name', 'state_name',
                              'county_name', 'county'])

# One-hot encode state
df_yr_st = pipe.hot_encode(df_yr_st, ['state'])

In [8]:
# Create training and testing sets.
# Each split is copied so the helpers below modify independent frames: the
# recorded run of this cell shows SettingWithCopyWarning raised from the
# impute/normalize steps, which suggests split_data returns slices of its input.
train, test = (split.copy() for split in pipe.split_data(df_yr_st))

# Impute/normalize continuous variables.
# float64 selects the continuous columns -- including the yearly_change target,
# which is still part of the frame at this point -- and uint8 is included so
# the state one-hots are normalized as well.
# NOTE(review): int64 columns (total_pop, covid_cases, gov_party) are not in
# numeric_cols; confirm that leaving them out of the scaling is intended.
numeric_cols = train.select_dtypes(include=['float64', 'uint8']).columns
train, test = pipe.impute_missing(train, test, numeric_cols)
train, test = pipe.normalize(train, test, numeric_cols)

# Separate features and targets
train_features = train.drop(columns='yearly_change')
test_features = test.drop(columns='yearly_change')

train_target = train['yearly_change']
test_target = test['yearly_change']

Training set contains 2260 observations
Testing set contains 566 observation

Imputing log_med_income missing values with median 10.819318178577827
Imputing prop_services missing values with median 0.0790869921304704


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[atts] = scaler.fit_transform(train[atts])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inst

In [None]:
# Fit model: run the grid search on the yearly training data with state one-hots.
grid_model_result = grid_model.fit(train_features, train_target)
# To inspect the scores of every grid point, build a frame from the search results:
# cv_results = pd.DataFrame(grid_model_result.cv_results_)
# cv_results



In [None]:
# Show the pipeline configuration selected by the grid search.
print(grid_model_result.best_estimator_, '\n')

# Evaluate model on the held-out test split.
# grid_model and grid_model_result are the same fitted search object
# (scikit-learn's fit returns the estimator itself).
pipe.evaluate_model(grid_model, test_features, test_target)

# Rank coefficients
# NOTE(review): the names passed are the original feature columns; presumably
# rank_coefs pairs them with the coefficients by position, which only lines up
# when the selected polynomial degree is 1 -- confirm.
pipe.rank_coefs(grid_model_result, train_features.columns)

### Without States

In [11]:
# Select variables from full dataframe: drop the dated columns other than
# Mar-19, the untransformed median_income (log_med_income is kept), the
# monthly target, the identifier columns and state. Each label is listed once.
# NOTE(review): 'Mar-19' is not dropped and therefore remains a feature --
# confirm this is intended.
df_yr = data.drop(columns=['Apr-19', 'Feb-20', 'Mar-20', 'median_income',
                           'monthly_change', 'FIPS', 'name', 'state_name',
                           'county_name', 'county', 'state'])

In [12]:
# Create training and testing sets.
# Each split is copied so the helpers below modify independent frames; the
# same helpers raised SettingWithCopyWarning in the first modelling section of
# this notebook, which suggests split_data returns slices of its input.
train, test = (split.copy() for split in pipe.split_data(df_yr))

# Impute/normalize continuous variables.
# float64 selects the continuous columns, including the yearly_change target,
# which is still part of the frame at this point.
# NOTE(review): int64 columns (total_pop, covid_cases, gov_party) are not in
# numeric_cols; confirm that leaving them out of the scaling is intended.
numeric_cols = train.select_dtypes(include=['float64']).columns
train, test = pipe.impute_missing(train, test, numeric_cols)
train, test = pipe.normalize(train, test, numeric_cols)

# Separate features and targets
train_features = train.drop(columns='yearly_change')
test_features = test.drop(columns='yearly_change')

train_target = train['yearly_change']
test_target = test['yearly_change']

Training set contains 2260 observations
Testing set contains 566 observation

Imputing log_med_income missing values with median 10.819318178577827
Imputing prop_services missing values with median 0.0790869921304704


In [13]:
# Fit Model: refit the shared grid search on the yearly data without state
# one-hots; this replaces any fit grid_model held before.
grid_model_result = grid_model.fit(train_features, train_target)
print(grid_model_result.best_estimator_, '\n')

# Evaluate Model on the held-out test split.
# grid_model and grid_model_result are the same fitted search object
# (scikit-learn's fit returns the estimator itself).
pipe.evaluate_model(grid_model, test_features, test_target)

# Rank Coefficients
# NOTE(review): the names passed are the original feature columns; presumably
# rank_coefs pairs them with the coefficients by position, which only lines up
# when the selected polynomial degree is 1 (as reported for this run) -- confirm.
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge', Ridge(alpha=0.9))]) 

Mean Squared Error: 0.7831932329846288
R-Squared: 0.16164297035635777
Adjusted R-Squared: 0.1387786877297129


Unnamed: 0,feature,coefficient,coefficient (absolute)
2,prop_black,0.353501,0.3535007
3,prop_hisp,0.307494,0.3074944
1,prop_white,0.271433,0.2714334
12,election_diff,0.18977,0.1897704
13,Mar-19,-0.148669,0.1486691
5,prop_no_internet,0.126295,0.1262953
7,prop_services,0.10444,0.1044404
6,prop_ba,0.0697799,0.06977987
4,log_med_income,-0.0637438,0.06374375
14,days_closed,-0.0530971,0.05309712


## Monthly Data

### With States

In [14]:
# Select variables from full dataframe: drop the dated columns other than
# Mar-19, the untransformed median_income (log_med_income is kept), the
# yearly target and the identifier columns. Each label is listed once.
# NOTE(review): in the previewed rows monthly_change equals
# (Mar-20 - Feb-20) / Feb-20, yet Feb-20 is dropped and 'Mar-19' is the
# baseline that stays in as a feature -- confirm this is intended.
df_mo_st = data.drop(columns=['Apr-19', 'Feb-20', 'Mar-20', 'median_income',
                              'yearly_change', 'FIPS', 'name', 'state_name',
                              'county_name', 'county'])

In [15]:
# One-hot encode state.
# NOTE(review): this rebinds df_mo_st, so re-running this cell on its own
# would pass already-encoded data to hot_encode; re-run the selection cell first.
df_mo_st = pipe.hot_encode(df_mo_st, ['state'])

# Create training and testing sets.
# Each split is copied so the helpers below modify independent frames; the
# same helpers raised SettingWithCopyWarning in the first modelling section of
# this notebook, which suggests split_data returns slices of its input.
train, test = (split.copy() for split in pipe.split_data(df_mo_st))

# Impute/normalize continuous variables.
# float64 selects the continuous columns -- including the monthly_change
# target, which is still part of the frame at this point -- and uint8 is
# included so the state one-hots are normalized as well.
# NOTE(review): int64 columns (total_pop, covid_cases, gov_party) are not in
# numeric_cols; confirm that leaving them out of the scaling is intended.
numeric_cols = train.select_dtypes(include=['float64', 'uint8']).columns
train, test = pipe.impute_missing(train, test, numeric_cols)
train, test = pipe.normalize(train, test, numeric_cols)

# Separate features and targets
train_features = train.drop(columns='monthly_change')
test_features = test.drop(columns='monthly_change')

train_target = train['monthly_change']
test_target = test['monthly_change']

Training set contains 2260 observations
Testing set contains 566 observation

Imputing log_med_income missing values with median 10.819318178577827
Imputing prop_services missing values with median 0.0790869921304704


In [16]:
# Fit model: refit the shared grid search on the monthly data with state
# one-hots; this replaces any fit grid_model held before.
grid_model_result = grid_model.fit(train_features, train_target)
print(grid_model_result.best_estimator_, '\n')

# Evaluate model on the held-out test split.
# grid_model and grid_model_result are the same fitted search object
# (scikit-learn's fit returns the estimator itself).
pipe.evaluate_model(grid_model, test_features, test_target)

# Rank coefficients
# NOTE(review): the names passed are the original feature columns; presumably
# rank_coefs pairs them with the coefficients by position, which only lines up
# when the selected polynomial degree is 1 (as reported for this run) -- confirm.
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge', Ridge(alpha=0.9))]) 

Mean Squared Error: 0.17901481774533548
R-Squared: 0.824816743207968
Adjusted R-Squared: 0.8016462122495029


Unnamed: 0,feature,coefficient,coefficient (absolute)
20,state_08,0.362565,3.625650e-01
33,state_22,0.247366,2.473659e-01
58,state_48,0.246354,2.463540e-01
64,state_55,-0.229637,2.296374e-01
42,state_31,0.211224,2.112235e-01
...,...,...,...
8,pop_density,-0.00152606,1.526061e-03
11,gov_party,-0.000735508,7.355081e-04
9,covid_cases,3.40473e-06,3.404729e-06
0,total_pop,1.65259e-07,1.652587e-07


### Without States

In [17]:
# Select variables from full dataframe: drop the dated columns other than
# Mar-19, the untransformed median_income (log_med_income is kept), the
# yearly target, the identifier columns and state. Each label is listed once.
# NOTE(review): in the previewed rows monthly_change equals
# (Mar-20 - Feb-20) / Feb-20, yet Feb-20 is dropped and 'Mar-19' is the
# baseline that stays in as a feature -- confirm this is intended.
df_mo = data.drop(columns=['Apr-19', 'Feb-20', 'Mar-20', 'median_income',
                           'yearly_change', 'FIPS', 'name', 'state_name',
                           'county_name', 'county', 'state'])

In [18]:
# Create training and testing sets.
# Each split is copied so the helpers below modify independent frames; the
# same helpers raised SettingWithCopyWarning in the first modelling section of
# this notebook, which suggests split_data returns slices of its input.
train, test = (split.copy() for split in pipe.split_data(df_mo))

# Impute/normalize continuous variables.
# Only float64 columns are selected (there are no state one-hots in this
# variant); that includes the monthly_change target, which is still part of
# the frame at this point.
# NOTE(review): int64 columns (total_pop, covid_cases, gov_party) are not in
# numeric_cols; confirm that leaving them out of the scaling is intended.
numeric_cols = train.select_dtypes(include=['float64']).columns
train, test = pipe.impute_missing(train, test, numeric_cols)
train, test = pipe.normalize(train, test, numeric_cols)

# Separate features and targets
train_features = train.drop(columns='monthly_change')
test_features = test.drop(columns='monthly_change')

train_target = train['monthly_change']
test_target = test['monthly_change']

Training set contains 2260 observations
Testing set contains 566 observation

Imputing log_med_income missing values with median 10.819318178577827
Imputing prop_services missing values with median 0.0790869921304704


In [19]:
# Fit Model: refit the shared grid search on the monthly data without state
# one-hots; this replaces any fit grid_model held before.
grid_model_result = grid_model.fit(train_features, train_target)
print(grid_model_result.best_estimator_, '\n')

# Evaluate model on the held-out test split.
# grid_model and grid_model_result are the same fitted search object
# (scikit-learn's fit returns the estimator itself).
pipe.evaluate_model(grid_model, test_features, test_target)

# Rank coefficients
# NOTE(review): the names passed are the original feature columns; presumably
# rank_coefs pairs them with the coefficients by position, which only lines up
# when the selected polynomial degree is 1 (as reported for this run) -- confirm.
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge', Ridge(alpha=0.9))]) 

Mean Squared Error: 0.7667409541331213
R-Squared: 0.24967005998379077
Adjusted R-Squared: 0.22920651616516685


Unnamed: 0,feature,coefficient,coefficient (absolute)
2,prop_black,0.406446,0.4064456
3,prop_hisp,0.332707,0.3327068
12,election_diff,0.267847,0.2678465
1,prop_white,0.218845,0.218845
11,gov_party,-0.203038,0.2030378
6,prop_ba,0.172722,0.1727218
13,Mar-19,-0.154737,0.1547369
7,prop_services,0.116795,0.1167948
4,log_med_income,-0.0960348,0.09603484
10,Testing_Rate,-0.0945844,0.09458437
