In [1]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor

#df_13 = pd.read_csv('13_clean.csv')
#df_13.head()

years = ['09', '11', '13']
years_num = [2009, 2011, 2013]

df_allYears = pd.DataFrame()
for year, i in zip(years, years_num):
    df = pd.read_csv(year + '_clean.csv')
    df['YEAR'] = i
    df_allYears = df_allYears.append(df)
    
#choose columns that are independent, e.g. COSTMED, COSTMEDRELAMITPCT, FMTCOSTMEDRELAMIPCT 
#show the same data in different format; ditto for FMTINCRELAMICAT and INCRELAMIPCT
# drop CONTROL as well
cols = ['CONTROL', 'COSTMED', 'FMTCOSTMEDRELAMICAT', 'FMTINCRELAMICAT']
df_toKeep = df_allYears.drop(cols, axis=1)
print(df_toKeep.shape)

#removing columns related to cost since these are related to target variable ZSMHC (monthly housing cost)
cols = ['COSTMEDRELAMIPCT', 'BURDEN', 'OTHERCOST', 'UTILITY']
df_toKeep = df_toKeep.drop(cols, axis=1)
print(df_toKeep.shape)

df_toKeep = df_toKeep.dropna()
print(df_toKeep.shape)

#remove values of FMTMETRO set to 9;
df_toKeep = df_toKeep.loc[~df_toKeep['FMTMETRO'].str.contains('9', regex=False)]


#drop FMTSTATUS - earlier generated ill-defined matrix warning for ridge regression;
#values for this column is constanct for several years
df_toKeep = df_toKeep.drop('FMTSTATUS', axis=1)

#converting REGION from int type to string type
df_toKeep['REGION'] = df_toKeep['REGION'].apply(str)

#convert categorical variables to indicator variables
df_toKeep = pd.get_dummies(df_toKeep, drop_first=True)

print(df_toKeep.columns)

(240072, 33)
(240072, 29)
(148498, 29)
Index(['AGE', 'BEDRMS', 'FMR', 'INCRELAMIPCT', 'IPOV', 'LMED', 'NUNITS', 'PER',
       'ROOMS', 'TOTSAL', 'VALUE', 'ZINC2', 'ZSMHC', 'ABL50', 'ABL80',
       'ABLMED', 'GL50', 'GL80', 'GLMED', 'L50', 'YEAR',
       'FMTBUILT_'1960-1979'', 'FMTBUILT_'1980-1989'', 'FMTBUILT_'1990-1999'',
       'FMTBUILT_'2000-2009'', 'FMTBUILT_'After 2010'',
       'FMTBUILT_'not_defined'', 'FMTSTRUCTURETYPE_'2 2-4 units'',
       'FMTSTRUCTURETYPE_'3 5-19 units'', 'FMTSTRUCTURETYPE_'4 20-49 units'',
       'FMTSTRUCTURETYPE_'5 50+ units'', 'FMTSTRUCTURETYPE_'6 Mobile Home'',
       'REGION_2.0', 'REGION_3.0', 'REGION_4.0', 'FMTMETRO_'Central City'',
       'FMTZADEQ_'2 Moderately Inadequ'', 'FMTZADEQ_'3 Severely Indadequa''],
      dtype='object')


In [2]:
X = df_toKeep.drop(['ZSMHC'], axis=1).values

y = df_toKeep['ZSMHC'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

param_grid = {'max_depth': [3, 5, 7, None], 'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [10, 100]}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

gsc = GridSearchCV(rf, param_grid, cv=3)

gsc.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=-1,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, None],
     

In [3]:
print(gsc.best_params_)
print(gsc.best_score_)

{'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}
0.5409886400229429


In [4]:
gsc.cv_results_

{'mean_fit_time': array([  4.4289515 ,  28.57955257,   0.74353242,   6.39285485,
          0.71794534,   5.55334536,   4.69331495,  46.65495968,
          1.04727054,   9.86921906,   0.9920675 ,   8.72621846,
          6.05841152,  60.0687139 ,   1.30201046,  12.71239138,
          1.18374602,  11.07852888,  13.6378599 , 135.74470663,
          3.12850293,  30.43935847,   2.81323592,  26.81433837]),
 'std_fit_time': array([2.13674150e+00, 1.48503032e-01, 5.56654226e-03, 5.75579739e-02,
        3.72837381e-02, 3.36273894e-02, 1.65456001e-02, 4.11011989e-01,
        1.59450619e-02, 4.14855631e-02, 2.51392769e-02, 1.14410969e-01,
        6.23915436e-02, 2.60462930e-01, 1.53459525e-03, 1.38878185e-01,
        1.90786013e-02, 9.41861755e-02, 3.57867339e-02, 8.65970508e-01,
        1.68661015e-02, 1.93594420e-01, 7.19907055e-02, 1.24742169e-01]),
 'mean_score_time': array([0.16270693, 0.3152411 , 0.11470381, 0.31525739, 0.11482986,
        0.31537747, 0.11582669, 0.38549479, 0.11552024, 0.31