In [1]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np
import pandas as pd 
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, ExtraTreesRegressor, IsolationForest
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor



import os
for dirname, _, filenames in os.walk('/kaggle/input/house-prices-advanced-regression-techniques'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
df = pd.concat([train, test])

In [None]:
corr = df.corr()
corr.style.background_gradient(cmap = 'coolwarm')


checking co relation among the features

In [None]:
df.duplicated().sum()

Checking for duplicates. 

In [None]:
countmissing = df.isnull().sum().sort_values(ascending=False)
percentmissing = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
wholena = pd.concat([countmissing,percentmissing], axis=1)
print(wholena)

In [3]:
for col in ('Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature'):
    df[col] = df[col].fillna('none')
for col in ('MSZoning', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Electrical','BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional','GarageCars', 'SaleType'):
    df[col] = df[col].fillna(df[col].mode()[0])
df.Utilities= df.Utilities.fillna('AllPub')
df.GarageYrBlt = df.GarageYrBlt.fillna(0)
for col in ('GarageArea', 'MasVnrArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1','LotFrontage'):
    df[col]= df[col].fillna(df[col].mean())

In [4]:
df = pd.get_dummies(df)

One hot encoding using get dummies.

In [5]:
id = df[['Id', 'SalePrice']]
df = df.drop('Id', axis =1)
y = df.SalePrice
y = pd.DataFrame(y)
y = y[y['SalePrice'].notnull()]

In [6]:
trainX = df[df['SalePrice'].notnull()]
del trainX['SalePrice']
testX =  df[df['SalePrice'].isnull()]
del testX['SalePrice']

In [None]:
If = IsolationForest()
If.fit(trainX)
trainX['anamoly'] = If.predict(trainX)
trainX.anamoly.value_counts()

In [7]:
trainx, testx, trainy,testy = train_test_split(trainX, y, train_size = 0.7, random_state = 100)  

Splitting into test and train

In [8]:
trainxcon = trainx.loc[: , ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']]
testxcon = testx.loc[: , ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']]
ss = StandardScaler()
trainxcons = pd.DataFrame(ss.fit_transform(trainxcon), columns = trainxcon.columns)
testxcons = pd.DataFrame(ss.transform(testxcon), columns = testxcon.columns)

Standardizing the test and train set

In [9]:
trainx.drop(columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea'], inplace = True)
testx.drop(columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea'], inplace = True)

In [10]:
trainx.reset_index(inplace = True)
trainxcons.reset_index(inplace = True)
trainx = pd.concat([trainx, trainxcons], axis = 1)

testx.reset_index(inplace = True)
testxcons.reset_index(inplace = True)
testx = pd.concat([testx, testxcons], axis = 1)


Resetting the index and performing outer join

In [11]:
trainx = trainx.iloc[:, 1: ]
testx = testx.iloc[:, 1: ]

In [None]:
model = Ridge()
max_iter = np.array(range(100000, 1000000))
alpha = np.linspace(1, 10, 50)
solver = ['cholesky', 'svd', 'lsqr', 'lbfgs']
param = {'max_iter': max_iter, 'alpha': alpha, 'solver': solver}
rridge = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error', verbose=0)
n_iter = rridge.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rridge.set_params(n_iter=1)
        rridge.fit(trainx, trainy)
        pbar.update()
print(rridge.best_params_)

Used RandomizedsearchCV for parameter tuning

In [12]:
model = Ridge(solver = 'cholesky', max_iter = 625608, alpha =  4.4897959183673475, random_state = 100)
model.fit(trainx, trainy)
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('R2 score:', r2_score(testy, predy))

RMSE: 29488.808323622874
R2 score: 0.864901959519436


# The best model
## Ridge
RMSE: 29488.808323622874
R2 score: 0.864901959519436

In [None]:
model = Lasso()
alpha = np.linspace(1, 10, 10)
max_iter = np.array(range(100000, 1000000))
param = {'alpha': alpha, 'max_iter': max_iter}
rlasso = RandomizedSearchCV(model,param, cv= 20,scoring = 'neg_mean_squared_error', verbose = 0)
n_iter = rlasso.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rlasso.set_params(n_iter=1)
        rlasso.fit(trainx, trainy)
        pbar.update()
print(rlasso.best_params_)


Used RandomizedsearchCV for parameter tuning

In [19]:
model = Lasso(max_iter = 277590, alpha = 9)
model.fit(trainx, trainy)
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('R2 score:', r2_score(testy, predy))

RMSE: 33299.04253159029
R2 score: 0.8277345840924091


## Lasso
RMSE: 33299.04253159029
R2 score: 0.8277345840924091

In [None]:
model = ElasticNet()
alpha = np.linspace(1, 10, 20)
max_iter = np.array(range(100000, 1000000))
l1_ratio = np.linspace(0, 1, 10)
param = {'alpha': alpha, 'max_iter': max_iter, 'l1_ratio': l1_ratio}
relastic = RandomizedSearchCV(model,param, cv= 50,scoring = 'neg_mean_squared_error', verbose = 0)
n_iter = relastic.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        relastic.set_params(n_iter=1)
        relastic.fit(trainx, trainy)
        pbar.update()
print(relastic.best_params_)

Used RandomizedsearchCV for parameter tuning

In [20]:
model = ElasticNet(max_iter = 811878, l1_ratio= 0.7777777777777777, alpha = 3.3684210526315788)
model.fit(trainx, trainy)
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 33613.34037720338
r2 score: 0.8244673330495373


## ElasticNet
RMSE: 33613.34037720338
r2 score: 0.8244673330495373

In [None]:
model = DecisionTreeRegressor()
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(1, 200))
criterion = ['squared_error', 'friedman_mse', 'poisson', 'absolute_error']
param = {'min_samples_split':min_samples_split, 'max_depth': max_depth, 'criterion': criterion}
rdecision = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error', verbose = 0)
n_iter = rdecision.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rdecision.set_params(n_iter=1)
        rdecision.fit(trainx, trainy.values.ravel())
        pbar.update()
print(rdecision.best_estimator_)

Used RandomizedsearchCV for parameter tuning

In [21]:
model = DecisionTreeRegressor(min_samples_split = 37, criterion = 'absolute_error', max_depth = 7, random_state = 100 )
model.fit(trainx, trainy)
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 41067.19478302372
r2 score: 0.7379858892096072


## Decision Tree
RMSE: 41067.19478302372
r2 score: 0.7379858892096072

In [None]:
model = RandomForestRegressor()
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(1, 100))
criterion = ['squared_error', 'friedman_mse', 'poisson', 'absolute_error']
n_estimators = np.array(range(1, 100))
param = {'min_samples_split': min_samples_split, 'max_depth': max_depth, 'criterion': criterion, 'n_estimators': n_estimators}
rrandom = RandomizedSearchCV(model, param, cv = 50, scoring = 'neg_mean_squared_error', verbose = 0)
n_iter = rrandom.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rrandom.set_params(n_iter=1)
        rrandom.fit(trainx, trainy.values.ravel())
        pbar.update()
print(rrandom.best_estimator_)

Used RandomizedsearchCV for parameter tuning

In [22]:
model = RandomForestRegressor(criterion = 'poisson', max_depth = 79, min_samples_split = 73, n_estimators = 71, random_state = 100)
model.fit(trainx, trainy.values.ravel())
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 45409.38570734214
r2 score: 0.6796491663840909


## Random Forest
RMSE: 45409.38570734214
r2 score: 0.6796491663840909

In [None]:
model = ExtraTreesRegressor()
criterion = ['squared_error', 'friedman_mse', 'poisson', 'absolute_error']
min_samples_split = np.array(range(1, 100))
max_depth = np.array(range(1, 100))
n_estimators = np.array(range(1, 100))
param = {'criterion': criterion, 'min_samples_split': min_samples_split, 'max_depth': max_depth, 'n_estimators': n_estimators}
rextra = RandomizedSearchCV(model, param, cv=10, scoring = 'neg_mean_squared_error')
n_iter = rextra.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rextra.set_params(n_iter=1)
        rextra.fit(trainx, trainy.values.ravel())
        pbar.update()
print(rextra.best_params_)

Used RandomizedsearchCV for parameter tuning

In [23]:
model = ExtraTreesRegressor(criterion = 'friedman_mse', max_depth = 44, min_samples_split = 66, n_estimators = 58, random_state = 100)
model.fit(trainx, trainy.values.ravel())
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 37436.73085857816
r2 score: 0.7822639003073216


## Extra Trees
RMSE: 37436.73085857816
r2 score: 0.7822639003073216

In [None]:
model = BaggingRegressor()
n_estimators = np.array(range(1, 100))
max_features = np.array(range(1, 304))
param = {'n_estimators': n_estimators, 'max_features': max_features}
rbagging = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error')
n_iter = rbagging.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rbagging.set_params(n_iter=1)
        rbagging.fit(trainx, trainy.values.ravel())
        pbar.update()
print(rbagging.best_params_)

Used RandomizedsearchCV for parameter tuning

In [24]:
model = BaggingRegressor(max_features = 164, n_estimators = 87)
model.fit(trainx, trainy.values.ravel())
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 29733.765245826977
r2 score: 0.8626481791207108


## Bagging Regressor
RMSE: 29733.765245826977
r2 score: 0.8626481791207108

In [None]:
model = AdaBoostRegressor()
learning_rate = np.linspace(0, 1, 100)
n_estimators = np.array(range(1, 100))
param = {'learning_rate': learning_rate, 'n_estimators': n_estimators}
rada = RandomizedSearchCV(model, param, cv=5, scoring = 'neg_mean_squared_error')
n_iter = rada.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rada.set_params(n_iter=1)
        rada.fit(trainx, trainy.values.ravel())
        pbar.update()
print(rada.best_params_)

Used RandomizedsearchCV for parameter tuning

In [25]:
model = AdaBoostRegressor(n_estimators = 91, learning_rate = 0.36363636363636365)
model.fit(trainx, trainy.values.ravel())
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 36499.585247643045
r2 score: 0.7930285397584124


## AdaBoost
RMSE: 36499.585247643045
r2 score: 0.7930285397584124

In [None]:
model = SVR()
kernel = ['rbf', 'sigmoid', 'linear']
C = np.linspace(.001, 100, 20)
gamma = np.linspace(0.001, 100, 20)
param = {'kernel': kernel, 'C': C, 'gamma': gamma}
rsvr = RandomizedSearchCV(model, param, cv=50, scoring = 'neg_mean_squared_error')
n_iter = rsvr.n_iter
with tqdm(total=n_iter) as pbar:
    for i in range(n_iter):
        rsvr.set_params(n_iter=1)
        rsvr.fit(trainx, trainy.values.ravel())
        pbar.update()
print(rsvr.best_params_)

Used RandomizedsearchCV for parameter tuning

In [26]:
model = SVR(kernel = 'linear', gamma = 94.7368947368421, C = 36.84273684210525)
model.fit(trainx, trainy.values.ravel())
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 37538.95925863853
r2 score: 0.781073133718348


## SVR
RMSE: 37538.95925863853
r2 score: 0.781073133718348

In [None]:
model = XGBRegressor()
booster = ['gbtree', 'dart']
eta = np.linspace(0.01, 0.3, 10)
gamma = np.array(range(1, 100))
max_depth = np.array(range(1, 10))
objective = ['reg:squarederror', 'reg:squaredlogerror']
alpha= np.array(range(1, 50))
reg_lambda = np.linspace(0, 1, 10)
colsample_bytree = np.linspace(0.5,1, 5)
min_child_weight = np.array(range(0,10))
n_estimators = np.array(range(1,100))
param = {'booster': booster, 'eta': eta, 'gamma': gamma, 'max_depth': max_depth, 'objective': objective, 'alpha': alpha,'reg_lambda': reg_lambda,
        'colsample_bytree': colsample_bytree, 'min_child_weight': min_child_weight, 'n_estimators': n_estimators}
rxgb = RandomizedSearchCV(model, param, cv = 50, scoring = 'neg_mean_squared_error')
n_iter = rxgb.n_iter
with tqdm(total = n_iter) as pbar:
    for i in range(n_iter):
        rxgb.set_params(n_iter = 1)
        rxgb.fit(trainx, trainy)
        pbar.update()
print(rxgb.best_params_)




Used RandomizedsearchCV for parameter tuning

In [27]:
model = XGBRegressor(booster = 'dart', eta = .267777, gamma = 95 , max_depth = 9, objective = 'reg:squaredlogerror',reg_lambda = 0.111111, n_estimators = 13, 
                     min_child_weight = 8, alpha = 7, colsample_bytree = 0.625, random_state = 100 )
model.fit(trainx, trainy)
predy = model.predict(testx)
print('RMSE:', mean_squared_error(testy, predy, squared = False))
print('r2 score:', r2_score(testy, predy))

RMSE: 198552.6801665315
r2 score: -5.124718211129857


## still working on xgboost

In [None]:
model = LGBMRegressor()
boosting_type = ['gbdt', 'dart', 'rf']
num_leaves = np.array(range(1, 100))
max_depth = np.array(range(1, 100))
learning_rate = np.linspace(0.1, 1, 10)
n_estimators = np.array(range(1, 200))
param = {'boosting_type': boosting_type, 'num_leaves': num_leaves, 'max_depth': max_depth, 'learning_rate': learning_rate, 'n_estimators': n_estimators}
rlgb = RandomizedSearchCV(model, param, cv= 50, scoring = 'neg_mean_squared_error')
n_iter = rlgb.n_iter
with tqdm(total = n_iter) as pbar:
    for i in range(n_iter):
        rlgb.set_params(n_iter =1)
        rlgb.fit(trainx, trainy)
        pbar.update()
print(rlgb.best_params_)

still working on LGBM

Used RandomizedsearchCV for parameter tuning

In [13]:
testXcon = testX.loc[: , ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']]
testXcons = pd.DataFrame(ss.transform(testXcon), columns = testXcon.columns)
testX.drop(columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea'], inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Standardizing the test data set 

In [14]:
testX.reset_index(inplace = True)
testXcons.reset_index(inplace = True)
testX = pd.concat([testX, testXcons], axis = 1)
testX = testX.iloc[:, 1: ]

Resetting the index and performing outer join

In [17]:
id = id[id.SalePrice.isnull()]  
id['SalePrice'] = model.predict(testX)
id.to_csv('Predcition.csv', index = False)

Saving the data as csv file

In [None]:
model_ridge.predict

mse