# USCensus Model Fitting

This notebook fits the various regression models to the different cleaned training sets of the USCensus data. Note: running this notebook locally is likely infeasible. Instead, the work can be done by dividing the tasks across a cluster.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from scipy import stats
from sklearn.utils import parallel_backend
from sklearn.model_selection import GridSearchCV
import multiprocessing

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

import joblib

# Cross-validation settings used by every grid search in this notebook.
folds = 5
score_calc = 'neg_mean_squared_error'
n_jobs = multiprocessing.cpu_count()

# Hyper-parameter grid for RandomForestRegressor.
rfr_param_grid = {
    'min_samples_split': [60, 70, 80],
    'n_estimators': [350, 400, 450],
    'random_state': [5],
}

# Hyper-parameter grid for XGBRegressor.
xgb_param_grid = {
    'learning_rate': [0.04, 0.05, 0.06],
    'n_estimators': [700, 800, 900],
    'random_state': [5],
    'max_depth': [3, 4, 5],
}

# Hyper-parameter grid for GradientBoostingRegressor.
gbr_param_grid = {
    'min_samples_split': [8, 10, 12],
    'n_estimators': [650, 700, 750],
    'random_state': [5],
}

def fit_all_models(features, target):
    """Grid-search three regressors on one training set.

    A cross-validated grid search (``folds`` folds, scored with
    ``score_calc``, best configuration refit on the full data) is run in
    turn for a random forest, an XGBoost regressor and a gradient boosting
    regressor, each over its module-level parameter grid.

    Parameters
    ----------
    features : feature matrix passed unchanged to ``GridSearchCV.fit``.
    target : target values passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    tuple
        The fitted ``GridSearchCV`` objects in the order
        (random forest, XGBoost, gradient boosting).
    """

    def _build_search(estimator, param_grid, workers):
        # All three searches share every setting except the estimator,
        # its grid and the number of parallel workers.
        return GridSearchCV(
            estimator,
            param_grid,
            cv=folds,
            refit=True,
            verbose=0,
            scoring=score_calc,
            n_jobs=workers,
        )

    forest_search = _build_search(RandomForestRegressor(), rfr_param_grid, n_jobs)
    forest_search.fit(features, target)

    # The XGBoost search uses all workers (-1) and is fitted under the
    # joblib 'threading' backend rather than the default one.
    xgboost_search = _build_search(XGBRegressor(), xgb_param_grid, -1)
    with parallel_backend('threading'):
        xgboost_search.fit(features, target)

    boosting_search = _build_search(GradientBoostingRegressor(), gbr_param_grid, n_jobs)
    boosting_search.fit(features, target)

    return forest_search, xgboost_search, boosting_search

In [2]:
# Fit and persist all models for the first cleaned training-set file (i = 0).
for i in range(1):
    # NOTE(review): pd.read_pickle unpickles arbitrary objects; only load
    # files that come from a trusted source.
    all_training_data = pd.read_pickle('uscensus_cleaned_train_df_' + str(i) + '.pkl')

    # Fitted searches for this file, keyed by 'uscensus_<model>_<i>_<j>'.
    func_fitted_models = dict()

    for j in range(len(all_training_data)):
        # NOTE(review): .loc is label-based, so this assumes the frame has
        # the default 0..n-1 index; confirm against the cleaning notebook.
        train_data = all_training_data.loc[j].dataset

        train_data = train_data.drop('Education-num', axis = 1)

        # One-hot encode, then give the income dummies names without
        # '<' / '>' (presumably for XGBoost feature-name rules; confirm).
        processed_data = pd.get_dummies(train_data)
        processed_data = processed_data.rename(columns={"Income_<=50k": "Income_less_50k", "Income_>50k": "Income_greater_50k"})

        target = processed_data["Age"]
        features = processed_data.drop("Age", axis = 1)

        rfr, xgb, gbr = fit_all_models(features, target)

        suffix = str(i) + '_' + str(j)
        just_fitted = (
            ('uscensus_rfr_' + suffix, rfr),
            ('uscensus_xgb_' + suffix, xgb),
            ('uscensus_gbr_' + suffix, gbr),
        )

        # Persist each search as soon as it is fitted, so an interruption
        # or crash on a later dataset does not discard finished work.
        for model, fitted_model in just_fitted:
            func_fitted_models[model] = fitted_model
            joblib.dump(fitted_model, model+'_compressed.joblib', compress=3)

KeyboardInterrupt: 

In [None]:
# Fit and persist all models for each of the 20 cleaned training-set files.
for i in range(20):
    # NOTE(review): pd.read_pickle unpickles arbitrary objects; only load
    # files that come from a trusted source.
    all_training_data = pd.read_pickle('uscensus_cleaned_train_df_' + str(i) + '.pkl')

    # Fitted searches for this file, keyed by 'uscensus_<model>_<i>_<j>'.
    func_fitted_models = dict()

    for j in range(len(all_training_data)):
        # NOTE(review): .loc is label-based, so this assumes the frame has
        # the default 0..n-1 index; confirm against the cleaning notebook.
        train_data = all_training_data.loc[j].dataset

        train_data = train_data.drop('Education-num', axis = 1)

        # One-hot encode, then give the income dummies names without
        # '<' / '>' (presumably for XGBoost feature-name rules; confirm).
        processed_data = pd.get_dummies(train_data)
        processed_data = processed_data.rename(columns={"Income_<=50k": "Income_less_50k", "Income_>50k": "Income_greater_50k"})

        target = processed_data["Age"]
        features = processed_data.drop("Age", axis = 1)

        rfr, xgb, gbr = fit_all_models(features, target)

        suffix = str(i) + '_' + str(j)
        just_fitted = (
            ('uscensus_rfr_' + suffix, rfr),
            ('uscensus_xgb_' + suffix, xgb),
            ('uscensus_gbr_' + suffix, gbr),
        )

        # Persist each search as soon as it is fitted, so an interruption
        # or crash on a later dataset does not discard finished work.
        for model, fitted_model in just_fitted:
            func_fitted_models[model] = fitted_model
            joblib.dump(fitted_model, model+'_compressed.joblib', compress=3)