In [2]:
import pickle
import random

import folktables
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from folktables import ACSDataSource
from folktables.load_acs import state_list
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm

In [3]:
# Load the pre-processed ACS frame from the local pickle cache.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
acs_frame_path = "./data_frames/dense_acs_mm_notoh.pkl"
df_all = pd.read_pickle(acs_frame_path)

In [4]:
# Categorical predictors flagged as significant by the earlier analysis.
cat_cols_sig = [
    'OCCP', 'SCHL', 'ST', 'JWTRNS', 'DRAT', 'COW',
    'SEX', 'RELSHIPP', 'POBP', 'ENG', 'MAR', 'RAC1P',
]
# Numeric columns; PINCP is split off as the regression target further down.
numeric_cols = ['WKHP', 'AGEP', 'PINCP']
# Sanity check on the number of columns in each group.
print(len(cat_cols_sig), len(numeric_cols))

12 3


In [5]:
# Summarise column dtypes, non-null counts and memory footprint of the loaded frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1569188 entries, 0 to 1630166
Data columns (total 15 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   OCCP      1569188 non-null  int64  
 1   WKHP      1569188 non-null  float64
 2   AGEP      1569188 non-null  float64
 3   SCHL      1569188 non-null  int64  
 4   ST        1569188 non-null  int64  
 5   JWTRNS    1569188 non-null  int64  
 6   DRAT      1569188 non-null  int64  
 7   COW       1569188 non-null  int64  
 8   SEX       1569188 non-null  int64  
 9   RELSHIPP  1569188 non-null  int64  
 10  POBP      1569188 non-null  int64  
 11  ENG       1569188 non-null  int64  
 12  MAR       1569188 non-null  int64  
 13  RAC1P     1569188 non-null  int64  
 14  PINCP     1569188 non-null  float64
dtypes: float64(3), int64(12)
memory usage: 191.6 MB


In [13]:
# Preview the first rows. The numeric columns have already been "mm" (presumably min-max) scaled;
# the categorical columns are expected to still hold their original ACS codes at this point.
# NOTE(review): the recorded output carries execution count 13, i.e. it was produced after the
# ordinal-encoding cell (count 7), so the values shown may already be encoded — re-run to confirm.
df_all.head()

Unnamed: 0,OCCP,WKHP,AGEP,SCHL,ST,JWTRNS,DRAT,COW,SEX,RELSHIPP,POBP,ENG,MAR,RAC1P,PINCP
0,299.0,0.295918,0.025316,17.0,0.0,0.0,0.0,1.0,1.0,18.0,47.0,0.0,4.0,0.0,0.039472
1,292.0,0.397959,0.037975,18.0,0.0,1.0,0.0,0.0,0.0,18.0,10.0,0.0,4.0,1.0,0.030967
2,325.0,0.173469,0.025316,17.0,0.0,0.0,0.0,1.0,1.0,18.0,18.0,0.0,4.0,0.0,0.030467
3,268.0,0.05102,0.21519,18.0,0.0,12.0,0.0,1.0,1.0,18.0,14.0,0.0,2.0,0.0,0.053479
4,161.0,0.091837,0.025316,17.0,0.0,0.0,0.0,0.0,0.0,18.0,32.0,0.0,4.0,0.0,0.009455


In [6]:
from utils import ordinal_encoder

In [7]:
# Re-encode the categorical columns with the project helper from `utils`.
# NOTE(review): this rebinds `df_all`, so re-running the cell feeds already-encoded data back
# into `ordinal_encoder`; presumably harmless, but confirm against the helper's implementation.
df_all = ordinal_encoder(df_all, cat_cols_sig)

In [69]:
# Class balance of the (encoded) RAC1P column over the full data set.
# `Counter` is imported in this cell because the only other import sits in a later cell,
# which made this one fail with NameError on a fresh top-to-bottom run.
from collections import Counter
Counter(df_all['RAC1P'])

Counter({0.0: 1045807,
         8.0: 168468,
         1.0: 136227,
         5.0: 100852,
         7.0: 97582,
         2.0: 14455,
         6.0: 2755,
         4.0: 2316,
         3.0: 726})

In [14]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 train/test partition is identical on every run.
rand_seed = 21
df_train, df_test = train_test_split(
    df_all,
    test_size=0.2,
    random_state=rand_seed,
)

In [15]:
# Preview the training split; the index keeps the original row labels from df_all.
df_train.head()

Unnamed: 0,OCCP,WKHP,AGEP,SCHL,ST,JWTRNS,DRAT,COW,SEX,RELSHIPP,POBP,ENG,MAR,RAC1P,PINCP
1282566,34.0,0.071429,0.468354,19.0,39.0,11.0,0.0,0.0,1.0,5.0,32.0,1.0,4.0,1.0,0.059483
979236,269.0,0.397959,0.278481,3.0,31.0,1.0,0.0,0.0,1.0,1.0,55.0,0.0,0.0,0.0,0.139527
1495594,260.0,0.397959,0.316456,18.0,45.0,0.0,0.0,0.0,1.0,1.0,46.0,0.0,0.0,0.0,0.155536
1380416,268.0,0.397959,0.468354,18.0,42.0,1.0,0.0,0.0,0.0,5.0,43.0,0.0,2.0,0.0,0.029466
921721,323.0,0.397959,0.506329,15.0,29.0,1.0,0.0,2.0,1.0,1.0,30.0,0.0,0.0,0.0,0.159538


In [16]:
from collections import Counter
# Class balance of RAC1P inside the training split, for comparison with the full data set.
Counter(df_train['RAC1P'])

Counter({0.0: 836563,
         8.0: 134697,
         1.0: 108927,
         5.0: 80758,
         7.0: 78125,
         2.0: 11646,
         6.0: 2207,
         4.0: 1854,
         3.0: 573})

In [17]:
# Separate predictors from the regression target (PINCP) for the training split.
X_train = df_train.drop(columns=['PINCP'])
y_train = df_train['PINCP']

In [18]:
# Separate predictors from the regression target (PINCP) for the held-out split.
X_test = df_test.drop(columns=['PINCP'])
y_test = df_test['PINCP']

# Baseline Ridge and Linear

# Grid search for "best" decision tree

In [47]:
from sklearn.model_selection import KFold, GridSearchCV
# cv = KFold(n_splits=5)

In [121]:
# `grid` is left as an empty dict; other cells in this notebook still reference the name.
grid = {}

# Candidate values for the decision-tree hyper-parameter search.
max_depth = np.arange(1, 20)             # tree depths 1..19
splitter = ['best', 'random']
min_samples_leaf = [0.001, 0.01, 0.1]    # floats: scikit-learn reads these as fractions of the samples
min_samples_split = [0.001, 0.005]       # floats: likewise fractions of the samples

grid_dt = dict(
    max_depth=max_depth,
    splitter=splitter,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
)

In [122]:
# Display the assembled decision-tree search space.
grid_dt

{'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19]),
 'splitter': ['best', 'random'],
 'min_samples_leaf': [0.001, 0.01, 0.1],
 'min_samples_split': [0.001, 0.005]}

In [None]:
from sklearn import tree

# Exhaustive search over `grid_dt` with GridSearchCV's default cross-validation
# (the recorded log reports 5 folds x 228 candidates = 1140 fits).
# The grid includes splitter='random', so the estimator is seeded with the notebook's
# `rand_seed` to make the search reproducible across runs.
dtree_reg = tree.DecisionTreeRegressor(random_state=rand_seed)
grid_search = GridSearchCV(estimator=dtree_reg, param_grid=grid_dt, n_jobs=4, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 228 candidates, totalling 1140 fits


In [None]:
# Report the best mean cross-validated score and the parameter combination that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [76]:
from sklearn import tree
# Fresh, unfitted decision-tree regressor with default settings (rebinds `dtree_reg`).
dtree_reg = tree.DecisionTreeRegressor()

In [77]:
# Depth-only search for the decision tree.
# The original cell passed `grid` (an empty dict at this point) and `cv` (not yet defined when
# running top to bottom). The recorded result, {'max_depth': 13}, shows that only max_depth was
# searched, so the depth range from `grid_dt` is used explicitly.
# NOTE(review): 5 folds follows the commented-out `KFold(n_splits=5)` earlier in the notebook —
# confirm this matches the run that produced the recorded score.
grid_search = GridSearchCV(
    estimator=dtree_reg,
    param_grid={'max_depth': grid_dt['max_depth']},
    n_jobs=4,
    cv=KFold(n_splits=5),
)

In [78]:
# Run the search on the training split; rebinds `grid_result`.
grid_result = grid_search.fit(X_train, y_train)

In [79]:
# Report the best mean cross-validated score and the winning parameters of the depth search.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


Best: 0.493448 using {'max_depth': 13}


In [81]:
# Decision tree at the depth selected by the recorded search (max_depth = 13, hard-coded here).
dtree_reg_best = tree.DecisionTreeRegressor(max_depth = 13)

In [82]:
# Fit the selected tree on the full training split.
dtree_reg_best.fit(X_train, y_train)

In [84]:
# Training-set score (R^2, the default `score` metric of scikit-learn regressors).
dtree_reg_best.score(X_train, y_train)

0.5258775521586688

In [83]:
# Held-out score (R^2) for comparison with the training score.
dtree_reg_best.score(X_test, y_test)

0.5002549062826196

# Grid search for "best" random forest

In [95]:
# Candidate hyper-parameter values for the random-forest search.

# Number of trees in the forest: 10 evenly spaced values between 10 and 1000.
n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the legacy 'auto' alias: for regressors 'auto' meant the same
# thing, but it was deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree; None leaves the depth unrestricted.
max_depth = [int(x) for x in np.linspace(1, 20, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or on the whole data set (False).
bootstrap = [True, False]

In [101]:
# Override the wide ranges with a much smaller set of depths and forest sizes.
max_depth = [5, 10, 15]
n_estimators = [10, 100, 500]

In [102]:
# Assemble the random-forest search space from the candidate lists defined in earlier cells.
grid_rf = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [104]:
# Exhaustive random-forest search over `grid_rf` with 3-fold cross-validation.
# The forest is seeded with the notebook's `rand_seed` so repeated runs give the same ranking.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt; with the full grid
# this is a very long computation on the whole training split.
rf = RandomForestRegressor(random_state=rand_seed)
cv = KFold(n_splits=3)
grid_search = GridSearchCV(estimator=rf, param_grid=grid_rf, n_jobs=4, cv=cv)
grid_result = grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Report the best mean cross-validated score and parameters held by `grid_result`.
# NOTE(review): if the preceding search was interrupted, `grid_result` still refers to an
# earlier search, not to the random-forest one.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

{'n_estimators': [10, 100, 500],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [5, 10, 15],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [None]:
# NOTE(review): this cell builds a decision-tree search (estimator `dtree_reg`, parameter grid
# `grid`) in the middle of the random-forest section and has no execution count; it looks like
# a leftover copy of the earlier decision-tree cell — confirm whether it can be removed.
grid_search = GridSearchCV(estimator=dtree_reg, param_grid=grid, n_jobs=4, cv=cv)

In [None]:
# Show the winning parameter combination of the most recent fitted search.
# The original cell referenced `rf_random`, a name that is never defined in this notebook
# (NameError); `grid_result` is the variable that holds the fitted search object here.
grid_result.best_params_

In [107]:
# Small forest (10 trees) capped at depth 8; seeded with `rand_seed` so the scores are reproducible.
rf8depth = RandomForestRegressor(n_estimators=10, max_depth=8, random_state=rand_seed)
rf8depth.fit(X_train, y_train)

In [108]:
# (train R^2, test R^2) of the depth-8 forest fitted in the previous cell.
# The original cell scored `rf`, the estimator handed to the interrupted grid search,
# rather than `rf8depth`.
rf8depth.score(X_train, y_train), rf8depth.score(X_test, y_test)

(0.4686889963825148, 0.47022364820004336)

In [109]:
# Small forest (10 trees) capped at depth 5; seeded with `rand_seed` so the scores are reproducible.
rf5depth = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=rand_seed)
rf5depth.fit(X_train, y_train)

In [110]:
# (train R^2, test R^2) of the depth-5 forest.
rf5depth.score(X_train, y_train), rf5depth.score(X_test, y_test)

(0.40950996159653574, 0.4117925190246482)

In [111]:
# Small forest (10 trees) capped at depth 10; seeded with `rand_seed` so the scores are reproducible.
rf10depth = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=rand_seed)
rf10depth.fit(X_train, y_train)
# (train R^2, test R^2)
rf10depth.score(X_train, y_train), rf10depth.score(X_test, y_test)

(0.49882551056576474, 0.4965951700895338)

In [115]:
# Reduced random-forest search: only tree depth varies, with a fixed forest size of 10 trees.
# The remaining hyper-parameters (max_features, min_samples_split, min_samples_leaf, bootstrap)
# are left at the estimator defaults.
max_depth = [5, 8, 10]
n_estimators = [10]

grid_rf = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}
# Seeded with `rand_seed` so the 2-fold scores are reproducible across runs.
rf_temp = RandomForestRegressor(random_state=rand_seed)
grid_search = GridSearchCV(estimator=rf_temp, param_grid=grid_rf, cv=2)
grid_result = grid_search.fit(X_train, y_train)

In [116]:
# Report the best mean cross-validated score and parameters of the reduced random-forest search.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.491849 using {'max_depth': 10, 'n_estimators': 10}


In [117]:
# Larger forest (100 trees) at depth 5, to check whether more trees help at a fixed depth;
# seeded with `rand_seed` so the scores are reproducible.
rf_5dep_100est = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=rand_seed)
rf_5dep_100est.fit(X_train, y_train)

In [118]:
# Training-set R^2 of the 100-tree, depth-5 forest.
rf_5dep_100est.score(X_train, y_train)

0.4108138805594156

In [119]:
# Held-out R^2 of the 100-tree, depth-5 forest.
rf_5dep_100est.score(X_test, y_test)

0.41329075810028315

# Gradient boosted regressor

In [29]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees: 200 boosting stages of depth-8 trees with a learning rate of 0.1.
# The recorded run of this cell took roughly 14 minutes of wall time.
gbr_good = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
)
gbr_good.fit(X_train, y_train)

CPU times: user 13min 53s, sys: 67 ms, total: 13min 53s
Wall time: 13min 53s


In [30]:
# (train R^2, test R^2) of the gradient-boosted model.
gbr_good.score(X_train, y_train), gbr_good.score(X_test, y_test)

(0.5891244709897457, 0.5712725552712814)

# KNN regressor

In [19]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours baseline with k = 5, fitted on the training split.
neigh5 = KNeighborsRegressor(n_neighbors=5)
neigh5.fit(X_train, y_train)

In [20]:
# Training-set R^2 of the 5-neighbour model.
neigh5.score(X_train,y_train)

0.5670677879060622

In [21]:
# Held-out R^2 of the 5-neighbour model; compare with the training score to gauge overfitting.
neigh5.score(X_test,y_test)

0.34857123948720314

In [22]:
from sklearn.neighbors import KNeighborsRegressor

# Same k-nearest-neighbours baseline with a larger neighbourhood (k = 10).
neigh10 = KNeighborsRegressor(n_neighbors=10)
neigh10.fit(X_train, y_train)

In [23]:
# (train R^2, test R^2) of the 10-neighbour model.
neigh10.score(X_train,y_train), neigh10.score(X_test,y_test)

(0.49791175065745363, 0.38774123935632665)