<a href="https://colab.research.google.com/github/ilteberkonuralp/Data-Science/blob/master/Housepricepredictionproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict sales prices and practice feature engineering, RFs, and gradient boosting

![houseimg](https://storage.googleapis.com/kaggle-competitions/kaggle/5407/media/housesbanner.png)

# Importing Packages


In [None]:
import numpy as np
import pandas as pd
import keras
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from keras.wrappers.scikit_learn import KerasRegressor

# Reading Data and Splitting into Train and Test Sets


In [None]:
# Load the labelled training data and separate the target from the predictors.
traindata = pd.read_csv("train.csv")
target = traindata['SalePrice']
features = traindata.drop('SalePrice', axis=1)

# test.csv is still read, but it is not used for scoring: no target column is
# ever taken from it, so it cannot supply the y_test that the model cells need.
# NOTE(review): presumably this is the unlabelled competition file - confirm.
X_submission = pd.read_csv("test.csv")

# The estimators used below are fitted directly on these frames, so any
# non-numeric columns are one-hot encoded first. Encoding happens before the
# split so the training and hold-out frames end up with identical columns.
# dtype=float keeps the indicator columns numeric rather than boolean.
features = pd.get_dummies(features, dtype=float)

# Hold out 20% of the labelled rows. Every model cell scores its predictions
# against y_test, so the evaluation set must come with known targets.
# The seed matches the random_state used by the seeded models in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=412)

# Fill missing values with medians computed from the training rows only, so no
# statistic from the hold-out rows influences training.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [None]:
# Report the dimensions of the training predictors, training target and test predictors.
for label, frame in (
    ("train X shape: ", X_train),
    ("train y shape: ", y_train),
    ("test X shape: ", X_test),
):
    print(label, frame.shape)

In [None]:
# Display the dtype of every column in X_train as the cell output.
X_train.dtypes

In [None]:
# Take an independent copy of only the object-dtype columns of X_train.
X_train_obj = X_train.select_dtypes(include="object").copy()
# Show the first rows of that subset as the cell output.
X_train_obj.head()

In [None]:
# Count missing values per column. The DataFrame method is used instead of
# np.sum(...): np.sum forwards axis=None to DataFrame.sum, and pandas has
# deprecated the column-wise meaning of axis=None for that reduction.
X_train.isnull().sum()

# Decision Tree Regressor

In [None]:
# Instantiate a DecisionTreeRegressor 'dt' with a maximum depth of 6
dt = DecisionTreeRegressor(random_state=412, max_depth=6, criterion="squared_error")

# Fit dt to the training set
dt.fit(X_train, y_train)

# Predict the target for the test set
test_predictions = dt.predict(X_test)

# Score the predictions against y_test.
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)
test_mape = mean_absolute_percentage_error(y_test, test_predictions)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)

# Artificial Neural Network Model


In [None]:
def get_nn_model(hiddenLayerOne=500, hiddenLayerTwo=400,
                 dropout=0.3, learnRate=0.3):
    """Build and compile a fully connected regression network.

    The network is Flatten -> Dense/ReLU -> Dropout -> Dense/ReLU -> Dropout
    -> Dense(1), compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        hiddenLayerOne: Number of units in the first hidden Dense layer.
        hiddenLayerTwo: Number of units in the second hidden Dense layer.
        dropout: Rate passed to the Dropout layer after each hidden layer.
        learnRate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Flatten())
    # add two stacks of FC => RELU => DROPOUT
    # No input_shape is given on this Dense layer: it is not the first layer
    # of the model, so a fixed feature count here would not define the model
    # input. The input width is instead taken from the data the model is
    # first called on.
    # NOTE(review): confirm deferred building with the installed Keras version.
    model.add(Dense(hiddenLayerOne, activation="relu"))
    model.add(Dropout(dropout))
    model.add(Dense(hiddenLayerTwo, activation="relu"))
    model.add(Dropout(dropout))
    # add a single-unit output layer with no activation (regression output)
    model.add(Dense(1))
    # compile the model
    model.compile(
        optimizer=Adam(learning_rate=learnRate),
        loss="mean_squared_error")
    # return compiled model
    return model

# Build the network with its default hyperparameters and train it, holding
# back 20% of the training rows for validation during fitting.
model_ann = get_nn_model()
model_ann.fit(x=X_train, y=y_train,
              batch_size=16,
              epochs=10, validation_split=0.2)

# Predict on the test predictors and score against y_test.
test_predictions = model_ann.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)
test_mape = mean_absolute_percentage_error(y_test, test_predictions)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)

## ANN Parameter Tuning


In [None]:
# model = KerasRegressor(build_fn=get_nn_model, verbose=0)
# hiddenLayerOne = np.arange(100,650,100)
# hiddenLayerTwo = np.arange(100,650,100)
# learnRate = np.linspace(0.00001,0.1,10,endpoint=True)
# dropout = np.linspace(0.2,0.9,8,endpoint=True)
# batchSize = [32,64,128,256]
# epochs = [10, 20, 30]
# hiddenLayerTwo=hiddenLayerTwo
# dropout=dropout
# batch_size=batchSize
# epochs=epochs
# validation_split=np.linspace(0.2,0.9,8,endpoint=True)
# # create a dictionary from the hyperparameter grid
# grid = dict(
# 	dropout=[0.8],
# 	epochs=[20],
# 	batch_size=[32],
# 	hiddenLayerOne=hiddenLayerOne,
# 	hiddenLayerTwo=[600],
#   learnRate= learnRate
# )
# print("[INFO] performing grid search...")
# searcher = GridSearchCV(estimator=model, n_jobs=-1, 
# 	cv=KFold(shuffle=True,n_splits=5,random_state=412),
# 	param_grid=grid, scoring="neg_root_mean_squared_error")
# searchResults = searcher.fit(X_train, y_train)
# # summarize grid search information
# bestParams = searchResults.best_params_
# print("[INFO] best param is {}".format(bestParams))

## ANN Best Model Results


In [None]:
# Build the network with the selected hyperparameters.
model_ann_bp = get_nn_model(hiddenLayerOne=500, hiddenLayerTwo=600,
                            dropout=0.8, learnRate=0.01)

# Train it, holding back 20% of the training rows for validation during fitting.
model_ann_bp.fit(x=X_train, y=y_train,
                 batch_size=32,
                 epochs=20, validation_split=0.2)

# Predict on the test predictors and score against y_test.
test_predictions = model_ann_bp.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)
test_mape = mean_absolute_percentage_error(y_test, test_predictions)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)
# NOTE(review): the figures below are presumably output recorded from an
# earlier run - confirm they still apply to the current data.
# RMSE:  1.2557826908392997
# MAE:  1.026227475859509
# MAPE:  5.420278132156377

# XGBoost Model

In [None]:
# Fit an XGBoost regressor with its default settings.
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

# Predict on the test predictors and score against y_test.
test_predicts = model_xgb.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predicts))
test_mae = mean_absolute_error(y_test, test_predicts)
test_mape = mean_absolute_percentage_error(y_test, test_predicts)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)


## XGBoost Parameter Tuning

In [None]:
# # GridSearchCV
# xgb=XGBRegressor()
# parameters = {
#  "eta": np.linspace(0,1,10),
#  "gamma":[0],
#  "max_depth":[2,3,4,5,6]
#  }
# grid = GridSearchCV(estimator=xgb,param_grid=parameters,
#  scoring='neg_mean_squared_error', n_jobs=-1, cv=10, verbose=False )
# grid.fit(X_train,y_train)
# grid.best_estimator_

## XGBoost Best Model Results


In [None]:
# Fit an XGBoost regressor with the selected eta, gamma and max_depth values.
model_xgb_best = XGBRegressor(seed=412, eta=0.1111111111, gamma=0, max_depth=5)
model_xgb_best.fit(X_train, y_train)

# Predict on the test predictors and score against y_test.
test_predicts = model_xgb_best.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predicts))
test_mae = mean_absolute_error(y_test, test_predicts)
test_mape = mean_absolute_percentage_error(y_test, test_predicts)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)

# Support Vector Machine Regression


In [None]:
# Fit a support vector regressor with its default settings.
model_svm = SVR()
model_svm.fit(X_train, y_train)

# Predict on the test predictors and score against y_test.
test_predicts = model_svm.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predicts))
test_mae = mean_absolute_error(y_test, test_predicts)
test_mape = mean_absolute_percentage_error(y_test, test_predicts)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)

## Support Vector Machine Regression Tuning

In [None]:
# params={"kernel":["rbf"],
#         "gamma":["scale"],
#         "C": np.linspace(0.1, 1.0, 50)
#         }
# regr=SVR()
# grid = GridSearchCV(estimator=regr,param_grid=params,
#  scoring='neg_mean_squared_error', n_jobs=-1,
#   cv=KFold(shuffle=True,n_splits=5,random_state=412), verbose=False )
# grid.fit(X_train,y_train)
# grid.best_params_

## Support Vector Machine Best Model Results

In [None]:
# Fit a support vector regressor with the selected C, gamma, kernel and epsilon.
model_svm_best = SVR(C=1, gamma="scale", kernel="rbf", epsilon=0.3)
model_svm_best.fit(X_train, y_train)

# Predict on the test predictors.
test_predicts = model_svm_best.predict(X_test)

# Score against y_test. Despite its name, scores_mse holds the RMSE: the
# square root of the MSE is taken here rather than passing squared=False,
# which newer scikit-learn releases no longer accept.
scores_mse = np.sqrt(mean_squared_error(y_test, test_predicts))
scores_mae = mean_absolute_error(y_test, test_predicts)
scores_mape = mean_absolute_percentage_error(y_test, test_predicts)
print("RMSE", scores_mse)
print("MAE", scores_mae)
print("MAPE", scores_mape)

# Random Forest

In [None]:
# Random Forest Estimation
# random_state is fixed so repeated runs give the same forest; 412 matches the
# seed used by the other seeded models in this notebook.
model_rf = RandomForestRegressor(random_state=412)
model_rf.fit(X_train, y_train)

# Predict on the test predictors and score against y_test.
test_predicts = model_rf.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predicts))
test_mae = mean_absolute_error(y_test, test_predicts)
test_mape = mean_absolute_percentage_error(y_test, test_predicts)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)

## Random Forest Tuning

In [None]:
# rf=RandomForestRegressor()
# rfparams={
#     "n_estimators":np.arange(100,501,50),
#     "max_depth":np.arange(1,51),
#     'min_samples_split': [2, 5, 10],
#     "criterion":["squared_error"],
#     "max_features":np.arange(1,10)
# }
# grid = GridSearchCV(estimator=rf,param_grid=rfparams,
#  scoring='neg_mean_squared_error', n_jobs=-1,
#  cv=KFold(shuffle=True,n_splits=5,random_state=412), verbose=False )
# grid.fit(X_train,y_train)
# grid.best_params_

## Random Forest Best Model Results

In [None]:
# Fit a random forest with the selected n_estimators, max_depth,
# min_samples_split and max_features values.
# random_state is fixed so repeated runs give the same forest; 412 matches the
# seed used by the other seeded models in this notebook.
model_rf_best = RandomForestRegressor(n_estimators=150, max_depth=16,
                                      min_samples_split=2, max_features=9,
                                      random_state=412)
model_rf_best.fit(X_train, y_train)

# Predict on the test predictors and score against y_test.
test_predicts = model_rf_best.predict(X_test)
# RMSE is taken as the square root of the MSE rather than via squared=False,
# which newer scikit-learn releases no longer accept.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predicts))
test_mae = mean_absolute_error(y_test, test_predicts)
test_mape = mean_absolute_percentage_error(y_test, test_predicts)
print("RMSE: ", test_rmse)
print("MAE: ", test_mae)
print("MAPE: ", test_mape)