In [1]:
import multiprocessing as mp
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

# https://johaupt.github.io/python/parallel%20processing/cross-validation/multiprocessing_cross_validation.html

In [2]:
def standardize(non_binary_predictors, X_train, X_test=None):
    """Standardize selected columns using a scaler fitted on the training data.

    A ``StandardScaler`` is fitted on ``X_train[non_binary_predictors]`` only
    and then applied to the same columns of ``X_train`` and, when given,
    ``X_test``. Columns not listed in ``non_binary_predictors`` are returned
    unchanged.

    The inputs are not modified: scaled values are written into copies, so
    frames that are slices of a larger DataFrame can be passed safely.

    Parameters
    ----------
    non_binary_predictors : list of str
        Names of the columns to scale.
    X_train : pandas.DataFrame
        Training features; the scaler is fitted on this frame.
    X_test : pandas.DataFrame, optional
        Held-out features, scaled with the statistics learned from ``X_train``.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The scaled copy of ``X_train`` when ``X_test`` is None, otherwise the
        tuple ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = StandardScaler().fit(X_train[non_binary_predictors])

    # Copy before assigning so the caller's frame (possibly a view of a larger
    # DataFrame) is never written to.
    X_train = X_train.copy()
    X_train[non_binary_predictors] = scaler.transform(X_train[non_binary_predictors])
    if X_test is None:
        return X_train

    X_test = X_test.copy()
    X_test[non_binary_predictors] = scaler.transform(X_test[non_binary_predictors])
    return X_train, X_test

def split_and_standardize(df, x_cols, y_col, non_binary_preds, interact_cols=None):
    """Build features, split into train/test, and standardize.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing the predictor and response columns.
    x_cols : list of str
        Predictor column names.
    y_col : str
        Response column name.
    non_binary_preds : list of str
        Columns passed to ``standardize`` for scaling.
    interact_cols : dict, optional
        Maps a new column name to a sequence whose first two entries are the
        names of existing columns; the new column is their element-wise
        product. Interaction columns are added before the split.

    Returns
    -------
    tuple
        ``(X_train_standardized, y_train, X_test_standardized, y_test)``.
        The split uses ``test_size=0.25`` and ``random_state=109``.
    """
    # Explicit copy: interaction columns are added below, and writing into a
    # bare ``df[x_cols]`` selection is chained assignment on a slice of ``df``.
    X = df[x_cols].copy()
    y = df[y_col]

    if interact_cols is not None:
        for new_col, factors in interact_cols.items():
            X[new_col] = X[factors[0]] * X[factors[1]]

    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=109)

    # standardize (scaler is fitted on the training part only)
    X_train_standardized, X_test_standardized = standardize(non_binary_preds, X_train, X_test)

    return X_train_standardized, y_train, X_test_standardized, y_test

In [3]:
# Read the train and test CSV files (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('zip_train_v2.csv', 'zip_test_v2.csv')
)

In [4]:
# Predictor columns, in the order they are fed to the models.
# The grouping below follows the literal column-name patterns only.
x_col = [
    'sex', 'age', 'poverty', 'popdensity', 'medianhousevalue',
    'medhouseholdincome', 'pct_owner_occ', 'education',
    'smoke_rate', 'mean_bmi', 'rmax', 'pr',
    # race_*
    'race_0', 'race_1', 'race_2', 'race_3', 'race_4', 'race_5', 'race_6',
    'ICU_DAY', 'CCI_DAY', 'LOS',
    # *_pdx2dx_25
    'Parkinson_pdx2dx_25', 'Alzheimer_pdx2dx_25', 'Dementia_pdx2dx_25',
    'CHF_pdx2dx_25', 'AMI_pdx2dx_25', 'COPD_pdx2dx_25', 'DM_pdx2dx_25',
    'Stroke_pdx2dx_25', 'CVD_pdx2dx_25', 'Ischemic_stroke_pdx2dx_25',
    'Hemo_Stroke_pdx2dx_25',
    # neo_*
    'neo_140_149', 'neo_150_159', 'neo_160_165', 'neo_170_176',
    'neo_179_189', 'neo_190_199', 'neo_200_209', 'neo_210_229',
    'neo_230_234', 'neo_235_238', 'neo_239',
    # pm25_*_4y_avg
    'pm25_summer_4y_avg', 'pm25_winter_4y_avg',
    'pm25_fall_4y_avg', 'pm25_spring_4y_avg',
    # ozone_*_4y_avg
    'ozone_summer_4y_avg', 'ozone_winter_4y_avg',
    'ozone_fall_4y_avg', 'ozone_spring_4y_avg',
    # no2_*_4y_avg
    'no2_summer_4y_avg', 'no2_winter_4y_avg',
    'no2_fall_4y_avg', 'no2_spring_4y_avg',
    # {summer,winter}_{tmmx,rmax}_4y_avg
    'summer_tmmx_4y_avg', 'summer_rmax_4y_avg',
    'winter_tmmx_4y_avg', 'winter_rmax_4y_avg',
]

# Response column.
y_col = 'death'
# non_binary_preds = x_col

# split into train and test and standardize
# X_train_standardized, y_train, X_test_standardized, y_test = split_and_standardize(df_train, x_col, y_col, non_binary_preds)


In [5]:
# model_library = {}
# for n in [30,40,50,60]:
#     for d in range(10,14):
#         model_library['rf_'+str(n)+'_'+str(d)] = RandomForestRegressor(n_estimators = n, max_depth = d, max_features='sqrt')


In [28]:
def benchmark_models(X, y, split, n_estimators_grid=(60,), max_depth_grid=(15, 16, 17, 18),
                     random_state=None):
    """Fit a grid of random-forest regressors on one CV fold and score them.

    Parameters
    ----------
    X : pandas.DataFrame
        Full feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Full response vector, aligned positionally with ``X``.
    split : sequence
        ``(train_positions, test_positions)`` for this fold, e.g. one element
        of ``list(StratifiedKFold(...).split(X, y))``.
    n_estimators_grid : iterable of int, optional
        Values of ``n_estimators`` to try. Defaults to ``(60,)``.
    max_depth_grid : iterable of int, optional
        Values of ``max_depth`` to try. Defaults to ``(15, 16, 17, 18)``.
    random_state : int or None, optional
        Forwarded to every ``RandomForestRegressor``. The default ``None``
        keeps the forests unseeded; pass an int for reproducible scores.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scores, test_scores)``. Each is a one-row frame (index
        ``'rf_train_scores'`` / ``'rf_test_scores'``) with one column per
        model, named ``rf_<n_estimators>_<max_depth>``. Values come from the
        estimator's ``score`` method.
    """
    train_pos, test_pos = list(split[0]), list(split[1])
    X_train, y_train = X.iloc[train_pos, :], y.iloc[train_pos]
    X_test, y_test = X.iloc[test_pos, :], y.iloc[test_pos]

    # model library: one forest per (n_estimators, max_depth) combination
    model_library = {
        'rf_' + str(n) + '_' + str(d): RandomForestRegressor(
            n_estimators=n, max_depth=d, max_features='sqrt', random_state=random_state
        )
        for n in n_estimators_grid
        for d in max_depth_grid
    }

    results_train = {}
    results_test = {}
    for model_name, model in model_library.items():
        model.fit(X_train, y_train)
        results_train[model_name] = model.score(X_train, y_train)
        results_test[model_name] = model.score(X_test, y_test)

    return (
        pd.DataFrame(results_train, index=['rf_train_scores']),
        pd.DataFrame(results_test, index=['rf_test_scores']),
    )

In [29]:
# Five shuffled, stratified folds with a fixed seed; the (train, test) index
# pairs are materialized so they can be reused and indexed.
splitter = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)
folds = [fold for fold in splitter.split(df_train[x_col], df_train[y_col])]



In [30]:
# benchmark_models(df_train[x_col], df_train[y_col], split=folds[0])

In [36]:
# Worker pool with five processes.
pool = mp.Pool(processes=5)
# mp.cpu_count()

In [37]:
# Per-fold outputs of benchmark_models, appended as workers finish
# (completion order, not necessarily fold order).
results = []
# Exceptions raised inside workers; without an error callback these would be
# dropped silently and `results` would simply end up short.
failures = []


def rf_result(x):
    """Success callback: store one fold's return value."""
    results.append(x)


def rf_error(exc):
    """Error callback: record and report an exception raised in a worker."""
    failures.append(exc)
    print(f'benchmark_models failed in a worker: {exc!r}')


# Select the feature matrix / response once instead of once per fold.
X_full = df_train[x_col]
y_full = df_train[y_col]

for fold in folds:
    pool.apply_async(
        benchmark_models,
        args=(X_full, y_full, fold),
        callback=rf_result,
        error_callback=rf_error,
    )

In [38]:
# Stop accepting new tasks, then block until every submitted task has finished.
# close() must come before join().
pool.close()
pool.join()

In [40]:
# Display the raw list filled by the rf_result callback.
results

In [41]:
# Every fold must have produced a result; otherwise the averages below would be
# computed over an incomplete set of folds (or fail with an opaque IndexError).
if len(results) != len(folds):
    raise RuntimeError(
        f'expected {len(folds)} fold results but collected {len(results)}; '
        'a worker may have failed or the pool may not have been joined yet'
    )

# Stack the one-row score frames: one row per fold, one column per model.
result_train = pd.concat([fold_result[0] for fold_result in results], axis=0, sort=True)
result_test = pd.concat([fold_result[1] for fold_result in results], axis=0, sort=True)

In [42]:
# Column-wise mean over folds, shown as a (train, test) pair.
(
    result_train.mean(axis=0),
    result_test.mean(axis=0),
)

In [None]:
# 60 trees, # 50 depth