# Ames Housing ML Challenge

# Linear Modeling

In [33]:
import pandas as pd
import numpy as np

In [34]:
housing_train = pd.read_csv('housing_train.csv')
housing_test = pd.read_csv('housing_test.csv')

y_housing = pd.read_csv('y_housing.csv')

In [35]:
print('Full Train dimension:', housing_train.shape)
print('Y train dimension:', y_housing.shape)
print('Test data dimension:', housing_test.shape)



Full Train dimension: (1353, 266)
Y train dimension: (1353, 1)
Test data dimension: (1459, 266)


In [36]:
from sklearn import linear_model
from sklearn.linear_model import ElasticNet, Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import  train_test_split
from sklearn.metrics import mean_squared_error, log_loss

import sklearn.model_selection as ms

from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.svm import SVR

In [37]:
X_train, X_test, y_train, y_test = train_test_split(housing_train, y_housing, test_size = 0.30, random_state = 42)

In [38]:
ridge = linear_model.Ridge(alpha = 1, normalize=True)
lasso = linear_model.Lasso(alpha= 0, normalize = True)
elasticnet = linear_model.ElasticNet(alpha = 0.01, l1_ratio=0.5, normalize=False)
clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)

gbm = ensemble.GradientBoostingRegressor()
rf = ensemble.RandomForestRegressor(n_estimators = 800, min_samples_split=5,
                                    min_samples_leaf = 1, max_features = 'sqrt',
                                    max_depth = 92, bootstrap = False, random_state=42)



In [39]:
housing_train.shape

(1353, 266)

In [40]:
y_housing.shape

(1353, 1)

In [41]:
modelList =[ridge, lasso, elasticnet, clf, gbm, rf]
modelSeries= pd.Series(modelList, index =[ 'Ridge', 'Lasso', 'Elasticnet', 'SGD', 'XGboost', 'Random Forest'])



In [42]:
modelSeries.apply(lambda x:x.fit(X_train, y_train))

  """Entry point for launching an IPython kernel.
  positive)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  """Entry point for launching an IPython kernel.


Ridge            Ridge(alpha=1, copy_X=True, fit_intercept=True...
Lasso            Lasso(alpha=0, copy_X=True, fit_intercept=True...
Elasticnet       ElasticNet(alpha=0.01, copy_X=True, fit_interc...
SGD              SGDRegressor(alpha=0.0001, average=False, earl...
XGboost          ([DecisionTreeRegressor(criterion='friedman_ms...
Random Forest    (DecisionTreeRegressor(criterion='mse', max_de...
dtype: object

In [43]:
ans = pd.concat([modelSeries.apply(lambda x: x.score(X_train,y_train)),modelSeries.apply(lambda x: x.score(X_test,y_test)), 
                                    modelSeries.apply(lambda x: np.sqrt(mean_squared_error(x.predict(X_test), y_test)))],axis=1)

ans.columns = ['train score', 'test score', 'rmse']
ans

Unnamed: 0,train score,test score,rmse
Ridge,0.9006409,0.8879706,0.1201919
Lasso,0.9470344,0.8649913,0.1319441
Elasticnet,0.8690652,0.8592552,0.134718
SGD,-4.6643850000000004e+32,-4.77302e+32,7845233000000000.0
XGboost,0.9601478,0.8788919,0.1249671
Random Forest,0.9943611,0.8753699,0.1267712


## Prediction

In [44]:
import math

X_test['Prediction'] = ridge.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [45]:
X_test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_5,SaleType_6,SaleType_7,SaleType_8,SaleCondition_1,SaleCondition_2,SaleCondition_3,SaleCondition_4,SaleCondition_5,Prediction
49,50,3.044522,4.204693,8.954544,5,2.079442,1966,1966,0.0,6.638568,...,0,0,0,1,0,0,0,1,0,11.758386
638,687,4.110874,4.442651,9.230927,7,1.94591,2007,2007,0.0,0.0,...,0,1,0,0,0,0,0,0,1,12.303117
1033,1110,3.044522,4.682131,9.338118,8,1.791759,2004,2005,3.7612,6.946976,...,0,0,0,1,0,0,0,1,0,12.556005
746,806,3.044522,4.521789,9.41532,7,1.791759,2008,2008,5.549076,0.0,...,0,1,0,0,0,0,0,0,1,12.224043
918,986,5.252273,4.234107,9.294773,5,1.791759,1950,1950,0.0,6.947937,...,0,0,0,0,0,0,0,1,0,11.704229


In [46]:
X_test['Prediction'] = X_test['Prediction'] .apply(lambda x: math.exp(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [47]:
X_test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_5,SaleType_6,SaleType_7,SaleType_8,SaleCondition_1,SaleCondition_2,SaleCondition_3,SaleCondition_4,SaleCondition_5,Prediction
49,50,3.044522,4.204693,8.954544,5,2.079442,1966,1966,0.0,6.638568,...,0,0,0,1,0,0,0,1,0,127821.034665
638,687,4.110874,4.442651,9.230927,7,1.94591,2007,2007,0.0,0.0,...,0,1,0,0,0,0,0,0,1,220381.804965
1033,1110,3.044522,4.682131,9.338118,8,1.791759,2004,2005,3.7612,6.946976,...,0,0,0,1,0,0,0,1,0,283794.332395
746,806,3.044522,4.521789,9.41532,7,1.791759,2008,2008,5.549076,0.0,...,0,1,0,0,0,0,0,0,1,203626.538196
918,986,5.252273,4.234107,9.294773,5,1.791759,1950,1950,0.0,6.947937,...,0,0,0,0,0,0,0,1,0,121082.703983


In [52]:
submission = X_test[['Id', 'Prediction']].sort_values(ascending = True, by ='Id')

In [53]:
submission.head()

Unnamed: 0,Id,Prediction
10,11,129202.34975
23,24,144767.687934
29,30,78341.307299
31,32,137638.991311
32,33,185293.444236


In [54]:
submission.to_csv('submission.csv')

In [None]:
#############################################################################################################

In [None]:
###################################################################################################################

# HyperParameter Tuning

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
y_test_encoded = lab_enc.fit_transform(y_test)
print(y_test_encoded)

print(utils.multiclass.type_of_target(y_test))
print(utils.multiclass.type_of_target(y_test.astype('int')))
print(utils.multiclass.type_of_target(y_test_encoded))

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
mistakes = 0 # the mistakes each individual tree makes
confusion_pair = 0

n_pairs = 0.5 * rf.n_estimators * (rf.n_estimators-1)
for idx, tree in enumerate(rf.estimators_):
    mistakes += confusion_matrix(tree.predict(X_test), y_test)
    for idx2, tree2 in enumerate(rf.estimators_):
            if idx2 == idx: continue
            confusion_pair += confusion_matrix(tree.predict(X_test), tree2.predict(X_test))   
print("Average per Tree Confusion:")
print(mistakes/rf.n_estimators*1.0)
print("Aggregate Pairwise Confusion:")
print(confusion_pair/n_pairs)

In [None]:
en_prediction = rf.predict(X_test)
en_mse = mean_squared_error(en_prediction, y_test)
en_rsme = np.sqrt(en_mse)
print('RF RMSE: %.2f' % en_rsme)

## Hyperparameter tuning RF

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
print('Parameters currently in use:\n')
pprint(rf.get_params())

In [None]:
###### number of trees in random forest
n_estimators =[int(x) for x in np.linspace(start =200, stop = 
                                           2000, num=10)]

In [None]:
##### Number of features to consider at every split
max_features = ['auto', 'sqrt']

In [None]:
#### Max number of levels in tree
max_depth = [int(x) for x in np.linspace(20, 110, num=11)]
max_depth.append(None)

In [None]:
###### min samples split a node
min_samples_split = [2, 5, 10]

In [None]:
#### Minimum number of samples required to split a node
min_samples_leaf = [1, 2, 4]

In [None]:
#### Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
#### Create random grid
random_grid ={'n_estimators':n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'bootstrap':bootstrap}
pprint(random_grid)

In [None]:
#### Using random grid to search for best hyperparameters
### base mode
rf_random = RandomizedSearchCV(estimator = rf, param_distributions =
                              random_grid, n_iter = 100, cv = 3, verbose =2,
                              random_state=42,
                              n_jobs=-1)

In [None]:
#### Value error fix for refit model into rf_random

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)
print(y_train_encoded)

print(utils.multiclass.type_of_target(y_train))
print(utils.multiclass.type_of_target(y_train.astype('int')))
print(utils.multiclass.type_of_target(y_train_encoded))

### New Model Fitting

In [None]:
rf_random.fit(X_train, y_train_encoded)

In [None]:
#### best parameters
rf_random.best_params_

### Ridge hyper parameters

In [None]:
from scipy.stats import uniform as sp_rand
print('Parameters currently in use:\n')
pprint(ridge.get_params())

In [None]:
param_grid ={'alpha':sp_rand()}
model = Ridge()
research = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                             n_iter = 100)

In [None]:
research.fit(X_train, y_train)
print(research)

In [None]:
print(research.best_score_)
print(research.best_estimator_.alpha)