In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib as mpl, matplotlib.pyplot as plt, numpy as np, pandas as pd

In [13]:
df = pd.read_pickle('AmesHousing_Encoded_Combined.pkl')
df.drop(columns=['Lot Frontage'], inplace=True)
cols = list(df.columns)

X, y = df.iloc[:,:-1].values, df.iloc[:,-1].values
rf_reg = RandomForestRegressor(n_estimators=100,max_depth=22, random_state=21)
rf_reg.fit(X,y)

train_r_sq = rf_reg.score(X,y)

print(train_r_sq)

0.9857617750240101


$R^2 = 0.98576$ on training data when including all features

In [33]:
rf_regs = list()
indiv_rf_scores = list()

for i in range(10):
    
    rf_reg = RandomForestRegressor(n_estimators=100,max_depth=22, random_state=21)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    
    print(X_train[0][0])
    
    rf_reg.fit(X_train, y_train)
    
    rf_regs.append(rf_reg)
    
    indiv_rf_scores.append(rf_reg.score(X_test, y_test))
    
indiv_rf_scores

0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0


[0.9036103033327813,
 0.8848048071910736,
 0.872996043925884,
 0.8847456617727617,
 0.8572744722439011,
 0.8859016556781873,
 0.8726224551315038,
 0.891859417386925,
 0.8952296306906703,
 0.8727764554452093]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [35]:
rf_regs

[RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21),
 RandomForestRegressor(max_depth=22, random_state=21)]

In [36]:
rf_regs[0].predict(X_test)[0], rf_regs[1].predict(X_test)[0]

(98079.71, 93934.0)

In [37]:
y_preds = [rf_reg.predict(X_test) for rf_reg in rf_regs]

In [41]:
mean_y_pred = [np.mean([y_pred[i] for y_pred in y_preds]) for i in range(len(y_preds[0]))]

In [43]:
print([int(np.around(item,1)) for item in mean_y_pred[:10]])
print(y_test[:10])

[96423, 128774, 172425, 255045, 104909, 146402, 278404, 257472, 458986, 130636]
[ 89500 120000 172500 278000  97500 148000 183850 270000 445000 129500]


In [44]:
from sklearn.metrics import r2_score

In [45]:
r2_score(mean_y_pred, y_test)

0.9709275503117615

Let's be more careful about making sure we aren't training any of the random forest classifiers on the validation set.

In [94]:
df = pd.read_pickle('AmesHousing_Encoded_Combined.pkl')
df.drop(columns=['Lot Frontage'], inplace=True)

cols = list(df.columns)


X, y = df.iloc[:,:-1].values, df.iloc[:,-1].values
X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.15)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [51]:
rf_reg.fit(X_train, y_train)
rf_reg.score(X_test, y_test)

0.8828720433224921

In [52]:
rf_regs = list()
indiv_rf_scores = list()

for i in range(10):
    
    rf_reg = RandomForestRegressor(n_estimators=100,max_depth=22,random_state=21)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    
    print(X_train[0][0])
    
    rf_reg.fit(X_train, y_train)
    
    rf_regs.append(rf_reg)
    
    indiv_rf_scores.append(rf_reg.score(X_test, y_test))
    
indiv_rf_scores

0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


[0.8984642074366375,
 0.8286158713047747,
 0.8236586340968126,
 0.8795669109113073,
 0.9008677707480733,
 0.8981478799613283,
 0.8946806426593981,
 0.8515995305984716,
 0.9149527029509773,
 0.8890661971753628]

In [55]:
y_preds = [rf_reg.predict(X_valid) for rf_reg in rf_regs]

In [63]:
mean_y_pred = [np.mean([y_pred[i] for y_pred in y_preds]) for i in range(len(y_preds[0]))]

In [62]:
weighted_mean_y_pred = [np.dot([y_pred[i] for y_pred in y_preds], indiv_rf_scores/np.sum(indiv_rf_scores)) for i in range(len(y_preds[0]))]

In [64]:
r2_score(weighted_mean_y_pred, y_valid)

0.854522683680817

In [65]:
r2_score(mean_y_pred, y_valid)

0.8547780137809988

Before moving on to imputing the `Lot Frontage` null values with a similar random forest approach as has been taken up above, let's include polynomial features.

In [66]:
df

Unnamed: 0,MS SubClass_120,MS SubClass_150,MS SubClass_160,MS SubClass_180,MS SubClass_190,MS SubClass_20,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,210,62,0,0,0,0,0,5,2010,215000
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,140,0,0,0,120,0,0,6,2010,105000
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,393,36,0,0,0,0,12500,6,2010,172000
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,4,2010,244000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,212,34,0,0,0,0,0,3,2010,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,474,0,0,0,0,0,0,9,2006,131000
2674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,120,0,0,0,0,0,0,3,2006,142500
2675,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,164,0,0,0,0,0,0,6,2006,131000
2676,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,240,38,0,0,0,0,0,4,2006,170000


In [67]:
from sklearn.preprocessing import PolynomialFeatures

In [95]:
poly = PolynomialFeatures(2, interaction_only=True)
poly_X = poly.fit_transform(X)

In [107]:
len(poly_X[0]), len(poly_X)

(25426, 2276)

In [108]:
len(y)

2276

2678

In [96]:
rf_reg = RandomForestRegressor(n_estimators=100,max_depth=22,random_state=21)
poly_X_train, poly_X_test, y_train, y_test = train_test_split(poly_X,y,test_size=0.2)

In [97]:
num_poly_feats = len(poly_X_train[0])

In [98]:
new_feats = poly.get_feature_names()

In [99]:
poly_df = pd.DataFrame(columns=new_feats)

In [114]:
poly_X_cols = [[poly_X[i][j] for i in range(len(poly_X))] for j in range(len(poly_X[0]))]

In [115]:
from scipy import stats

In [123]:
keep_feats = list()
for index, feat in enumerate(new_feats):
    pearsonr = stats.pearsonr(poly_X_cols[index],y)[0]
    if np.abs(pearsonr) > 0.1:
        keep_feats.append([index, feat])



In [None]:
poly_df

In [76]:
rf_reg.fit(poly_X_train, y_train)

KeyboardInterrupt: 

In [None]:
rf_reg.score(poly_X_test, y_test)

In the next notebook, let's construct product features manually and compute the 