In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
# Directory that holds home_prices_train.csv. The original author's folder stays the
# default; set the HOME_PRICES_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get("HOME_PRICES_DIR", "/Users/Hatim/Desktop/assignment 5")
os.chdir(DATA_DIR)
# The first CSV column is used as the row index.
data = pd.read_csv("home_prices_train.csv",index_col=0)

In [4]:
#removing outliers: keep only rows whose GrLivArea is below 4000
# .copy() makes `data` an independent frame. missing_data_treatment (defined below)
# assigns into it, and assigning into the result of a boolean-mask selection is what
# triggers pandas' SettingWithCopyWarning.
data = data[data["GrLivArea"]<4000].copy()

#treating missing data
def missing_data_treatment(dataframe):
    """Replace NaN in the listed home-price columns with explicit placeholder values.

    The frame is modified in place and nothing is returned. Every column named in
    the table below must exist in ``dataframe``; a missing one raises ``KeyError``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the columns listed in ``fill_plan``.
    """
    fill_plan = (
        # NaN LotFrontage: likely no street is connected to the property, so use 0.
        ("LotFrontage", 0),
        # NaN Alley: there is no alley access.
        ("Alley", "NoAlleyAccess"),
        # NaN in the basement descriptors: the house has no basement.
        ("BsmtQual", "No"),
        ("BsmtCond", "No"),
        ("BsmtExposure", "No"),
        ("BsmtFinType1", "No"),
        ("BsmtFinType2", "No"),
        # NaN Electrical: no electrical system.
        ("Electrical", "NoElectricalSystem"),
        # NaN FireplaceQu: no fireplace present.
        ("FireplaceQu", "NoFireplacePresent"),
        # NaN in the garage fields: no garage.
        ("GarageType", "NoGarage"),
        # NOTE: a text placeholder in this otherwise numeric year column leaves it
        # with mixed value types.
        ("GarageYrBlt", "NoGarage"),
        ("GarageFinish", "NoGarage"),
        ("GarageQual", "NoGarage"),
        ("GarageCond", "NoGarage"),
        # Same reasoning for the remaining optional features.
        ("PoolQC", "NoPool"),
        ("Fence", "NoFence"),
        ("MiscFeature", "NoMisc"),
        # MasVnrType / MasVnrArea also contain NaN. The field already has an explicit
        # "None" level, so NaN is not obviously "None"; but dropping those 8 rows would
        # discard a sale priced at 465000, so they are assumed to be "None" and 0.
        ("MasVnrType", "None"),
        ("MasVnrArea", 0),
    )
    for column, placeholder in fill_plan:
        # Plain column assignment replaces the column outright, so a fill that changes
        # the dtype (e.g. float years -> text) does not depend on how
        # `.loc[:, column] = ...` upcasts in place, which varies across pandas versions.
        dataframe[column] = dataframe[column].fillna(placeholder)
    
#calling the missing data treatment function on the data
# The function assigns the filled columns back into `data` and returns nothing,
# so the result is not captured here.
missing_data_treatment(data)

In [7]:
# Ordinal-encode four categorical columns: each value is replaced by the 0-based
# position of its level in the order listed for that column.
# reorder_categories requires the listed levels to match the levels present in the
# column exactly and raises ValueError otherwise.
category_orders = (
    ('SaleCondition', ['Abnorml', 'Partial', "AdjLand",'Alloca',"Normal","Family"]),
    ('Functional', ['Typ','Min1','Min2','Mod','Maj1','Maj2','Sev']),
    ('ExterQual', ['Ex','Gd','TA','Fa']),
    ('ExterCond', ['Ex','Gd','TA','Fa','Po']),
)
for column_name, level_order in category_orders:
    as_category = data[column_name].astype('category')
    data[column_name] = as_category.cat.reorder_categories(level_order).cat.codes

In [22]:
def gridsearch(X_train,y_train,X_test,y_test,model, params, scoring,cv=None):
    """Run a grid search for ``model`` and print train/test error plus the best parameters.

    A ``GridSearchCV`` is built from ``model``, ``params``, ``scoring`` and ``cv``
    (with ``n_jobs=-1``) and fitted on the training data. The fitted search then
    predicts both sets, and the mean squared error and r2 of each are printed,
    followed by ``best_params_``. Nothing is returned.

    Parameters
    ----------
    X_train, y_train : training features / target passed to ``fit``.
    X_test, y_test : held-out features / target, used only for the printed report.
    model : estimator handed to ``GridSearchCV``.
    params : parameter grid handed to ``GridSearchCV``.
    scoring : scoring argument handed to ``GridSearchCV``; echoed in the report.
    cv : cross-validation argument handed to ``GridSearchCV`` (default ``None``).
    """
    gs_cv = GridSearchCV(model,params,scoring=scoring,n_jobs=-1,cv=cv)
    gs_cv.fit(X_train, y_train)
    y_pred_train = gs_cv.predict(X_train)
    y_pred_test = gs_cv.predict(X_test)
    print("----------------------------TRAINING DATA----------------------------")
    print("mean_squared_error on training set is: "+str(mean_squared_error(y_train,y_pred_train)))
    print("r2_score score on training set is: "+str(r2_score(y_train,y_pred_train)))
    print("----------------------------TEST DATA----------------------------")
    print("mean_squared_error on test set is: "+str(mean_squared_error(y_test,y_pred_test)))
    print("r2_score on test set is: "+str(r2_score(y_test,y_pred_test)))
    print("-----------------------------------------------------------------")
    # str(): `scoring` is forwarded untouched to GridSearchCV, so it may be None or a
    # callable; bare concatenation would raise TypeError here, after the whole search
    # has already run and before the best parameters are printed.
    print("BEST PARAMETERS BASED ON "+str(scoring)+" AS A SCORING PARAMETER")
    print("Best parameters: "+str(gs_cv.best_params_))

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20.
# train_test_split, cross_val_score and GridSearchCV are provided by
# sklearn.model_selection, which this cell already relied on for ShuffleSplit.
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestRegressor
import warnings

# `category` is documented as a single Warning subclass, so register one ignore
# filter per class instead of passing a tuple.
for _ignored_category in (UserWarning, RuntimeWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

In [29]:
# NOTE(review): `scaler` is created here but not used by any cell shown in this notebook.
scaler = StandardScaler()
# Candidate predictors for the random forest (14 columns) and the regression target.
X = data.loc[:, [
    "LotFrontage", "GrLivArea", "OverallQual", "OverallCond", "YearBuilt",
    "Functional", "TotRmsAbvGrd", "LotArea", "ExterQual", "1stFlrSF",
    "GarageArea", "BedroomAbvGr", "KitchenAbvGr", "2ndFlrSF",
]]
y = data.loc[:, "SalePrice"]

In [13]:
# Hold out 20% of the rows for testing; random_state=42 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Base forest handed to the grid search. random_state pins the forest's internal
# randomness so repeated runs give the same scores; the train/test split is already
# seeded with the same value.
rnd_reg = RandomForestRegressor(n_estimators = 500,n_jobs = -1,random_state=42)

In [16]:
# Hyper-parameter grid explored by the random-forest grid searches below.
# max_features=None (consider every feature at each split) stands in for "auto":
# for RandomForestRegressor "auto" meant exactly that, but the string was deprecated
# in scikit-learn 1.1 and removed in 1.3, while None is accepted by every version.
params_grid = {'max_features': [None, 'sqrt', 'log2'],"max_depth":[None,8,9,10,11],
               "min_weight_fraction_leaf":[0.0,0.3,0.5]}

In [23]:
# Search params_grid for the forest on the training split, scoring candidates by r2 with cv=10.
gridsearch(
    X_train, y_train, X_test, y_test,
    model=rnd_reg,
    params=params_grid,
    scoring="r2",
    cv=10,
)

----------------------------TRAINING DATA----------------------------
mean_squared_error on training set is: 102350841.159
r2_score score on training set is: 0.983024529375
----------------------------TEST DATA----------------------------
mean_squared_error on test set is: 644486979.274
r2_score on test set is: 0.877203580469
-----------------------------------------------------------------
BEST PARAMETERS BASED ON r2 AS A SCORING PARAMETER
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_weight_fraction_leaf': 0.0}


In [25]:
#using the above parameters, fitting a random forest regressor to our data
# Fitted on every row of X; oob_score=True supplies the out-of-bag estimate printed below.
# random_state pins the forest's randomness so the OOB score and the importances are
# repeatable and can be compared with the refit that follows.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=None,max_features="sqrt",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X,y)
print("oob_score "+str(rnd_reg.oob_score_))
# Pairs are (importance, column name), so sorted() lists features from least to most important.
ranked_importances = sorted(zip(rnd_reg.feature_importances_,X.columns))
for score, name in ranked_importances:
    print(name,score)

oob_score 0.875514282266
Functional 0.00330416143926
KitchenAbvGr 0.0048387766224
BedroomAbvGr 0.0118692593498
OverallCond 0.0139343820962
LotFrontage 0.025531064328
2ndFlrSF 0.0413537789821
TotRmsAbvGrd 0.0420313260821
LotArea 0.0527372713087
YearBuilt 0.100704981612
GarageArea 0.107900954135
1stFlrSF 0.108530574084
ExterQual 0.117419671145
GrLivArea 0.145417928778
OverallQual 0.224425870037


In [26]:
#dropping Functional, KitchenAbvGr, BedroomAbvGr (the three least important features above)
# The reduced frame gets its own name so X keeps all of its columns: the In [30] cell
# further down refits on X and its recorded importances still list these three
# features, and In [31] drops them from X again.
X_reduced = X.drop(labels=["Functional","KitchenAbvGr","BedroomAbvGr"],axis=1)
# random_state makes the OOB score repeatable, so the comparison with the full-feature
# fit does not depend on run-to-run sampling noise.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=None,max_features="sqrt",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X_reduced,y)
print("oob_score "+str(rnd_reg.oob_score_))

oob_score 0.873954320023


In [27]:
#The OOB score decreased slightly after dropping the three features (0.8755 -> 0.8740 in the recorded run).

In [28]:
# Same grid as before, now ranking candidates by negative mean squared error with cv=10.
# NOTE(review): X_train / X_test come from the earlier train_test_split cell, so a drop
# applied only to X does not change the columns searched here - confirm that is intended.
gridsearch(
    X_train, y_train, X_test, y_test,
    model=rnd_reg,
    params=params_grid,
    scoring="neg_mean_squared_error",
    cv=10,
)

----------------------------TRAINING DATA----------------------------
mean_squared_error on training set is: 103393765.683
r2_score score on training set is: 0.982851554396
----------------------------TEST DATA----------------------------
mean_squared_error on test set is: 651582908.728
r2_score on test set is: 0.875851567537
-----------------------------------------------------------------
BEST PARAMETERS BASED ON neg_mean_squared_error AS A SCORING PARAMETER
Best parameters: {'max_depth': None, 'max_features': 'log2', 'min_weight_fraction_leaf': 0.0}


In [30]:
#using the above parameters (max_features="log2"), fitting a random forest regressor to our data
# random_state pins the forest's randomness so the OOB score and the importances
# printed below are repeatable from run to run.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=None,max_features="log2",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X,y)
print("oob_score "+str(rnd_reg.oob_score_))
# Pairs are (importance, column name); sorted() therefore prints least important first.
ranked_importances = sorted(zip(rnd_reg.feature_importances_,X.columns))
for score, name in ranked_importances:
    print(name,score)

oob_score 0.876406567512
Functional 0.00346708168588
KitchenAbvGr 0.00403101929474
BedroomAbvGr 0.0117242058602
OverallCond 0.0131737346431
LotFrontage 0.0256169001186
2ndFlrSF 0.0408139072587
TotRmsAbvGrd 0.045451604404
LotArea 0.0542070236801
GarageArea 0.0983542421289
YearBuilt 0.0986034924911
1stFlrSF 0.101083665413
ExterQual 0.109192859344
GrLivArea 0.150917791843
OverallQual 0.243362471835


In [31]:
#dropping Functional, KitchenAbvGr, BedroomAbvGr
# errors="ignore": X may already lack these columns (an earlier cell dropped them, or
# this cell is being re-run); a strict drop would raise KeyError in that case.
X = X.drop(labels=["Functional","KitchenAbvGr","BedroomAbvGr"],axis=1,errors="ignore")
# random_state makes the OOB score repeatable so it can be compared with the previous fit.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=None,max_features="log2",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X,y)
print("oob_score "+str(rnd_reg.oob_score_))

oob_score 0.872538381536


In [32]:
#The OOB score decreased after dropping the three features (0.8764 -> 0.8725 in the recorded run).

In [39]:
#selecting the same variables as linear regression
X = data.loc[:, [
    "GrLivArea", "OverallQual", "YearBuilt", "Functional",
    "LotArea", "ExterQual", "1stFlrSF", "2ndFlrSF",
]]
# Re-split so the train/test frames carry the new column set (same 80/20 split and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Grid search on the current train/test split, ranking candidates by r2 with cv=10.
gridsearch(
    X_train, y_train, X_test, y_test,
    model=rnd_reg,
    params=params_grid,
    scoring="r2",
    cv=10,
)

----------------------------TRAINING DATA----------------------------
mean_squared_error on training set is: 182187139.889
r2_score score on training set is: 0.969783223993
----------------------------TEST DATA----------------------------
mean_squared_error on test set is: 667157598.596
r2_score on test set is: 0.872884065923
-----------------------------------------------------------------
BEST PARAMETERS BASED ON r2 AS A SCORING PARAMETER
Best parameters: {'max_depth': 11, 'max_features': 'sqrt', 'min_weight_fraction_leaf': 0.0}


In [36]:
#using the above parameters (max_depth=11, max_features="sqrt"), fitting a random forest regressor to our data
# random_state pins the forest's randomness so the OOB score and the importances
# printed below are repeatable from run to run.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=11,max_features="sqrt",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X,y)
print("oob_score "+str(rnd_reg.oob_score_))
# Pairs are (importance, column name); sorted() therefore prints least important first.
ranked_importances = sorted(zip(rnd_reg.feature_importances_,X.columns))
for score, name in ranked_importances:
    print(name,score)

oob_score 0.867254734592
Functional 0.00623130041241
2ndFlrSF 0.0611593819661
LotArea 0.0763446744623
YearBuilt 0.12278217856
ExterQual 0.138820798522
1stFlrSF 0.153959222408
GrLivArea 0.184001828068
OverallQual 0.256700615601


In [40]:
# Drop the two least important features from the fit above and refit with the same settings.
X = X.drop(labels=["Functional","2ndFlrSF"],axis=1)
# random_state makes the OOB score repeatable, so the before/after comparison does not
# depend on run-to-run sampling noise.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=11,max_features="sqrt",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X,y)
print("oob_score "+str(rnd_reg.oob_score_))

oob_score 0.865071656999


In [41]:
#The OOB score decreased slightly after dropping Functional and 2ndFlrSF (0.8673 -> 0.8651 in the recorded run).

In [42]:
# Grid search ranking candidates by negative mean squared error with cv=10.
# NOTE(review): X_train / X_test were split in the In [39] cell from the 8-column X,
# before Functional and 2ndFlrSF were dropped from X, so this search still sees those
# two columns while the refit that follows does not - confirm that is intended.
gridsearch(
    X_train, y_train, X_test, y_test,
    model=rnd_reg,
    params=params_grid,
    scoring="neg_mean_squared_error",
    cv=10,
)

----------------------------TRAINING DATA----------------------------
mean_squared_error on training set is: 200603136.857
r2_score score on training set is: 0.966728825885
----------------------------TEST DATA----------------------------
mean_squared_error on test set is: 701392183.608
r2_score on test set is: 0.866361227451
-----------------------------------------------------------------
BEST PARAMETERS BASED ON neg_mean_squared_error AS A SCORING PARAMETER
Best parameters: {'max_depth': 10, 'max_features': 'log2', 'min_weight_fraction_leaf': 0.0}


In [43]:
#using the above parameters (max_depth=10, max_features="log2"), fitting a random forest regressor to our data
# random_state pins the forest's randomness so the OOB score and the importances
# printed below are repeatable from run to run.
rnd_reg = RandomForestRegressor(n_estimators = 500,max_depth=10,max_features="log2",n_jobs = -1,oob_score=True,random_state=42)
rnd_reg.fit(X,y)
print("oob_score "+str(rnd_reg.oob_score_))
# Pairs are (importance, column name); sorted() therefore prints least important first.
ranked_importances = sorted(zip(rnd_reg.feature_importances_,X.columns))
for score, name in ranked_importances:
    print(name,score)

oob_score 0.866779951001
LotArea 0.072347515251
YearBuilt 0.131219880901
1stFlrSF 0.136746793843
ExterQual 0.154551865629
GrLivArea 0.218452704252
OverallQual 0.286681240123


In [None]:
#Random Forest Regressor performed better than Linear regression and SVR