This kernel is going to solve <font color="red"><b> House Prices with Advanced Regression Analysis</b></font>, a popular machine learning dataset for <b>Kaggler</b>.<br> 
I am going to share how I work with a dataset step by step  <b>from data preparation and data analysis to statistical tests and implementing machine learning models.</b> <br>
I will also describe the model results along with other tips.<br>
Let's get started.</div>
<br>
If you like this notebook or find this notebook helpful, Please feel free to <font color="red"><b>UPVOTE</b></font> and/or leave a comment.
 
<div> <b>This notebook is always a work in progress. So, please stay tuned for more to come.</b></div>

# Read Data

I'm going to proceed by importing the library whenever necessary, rather than collecting the library imports.<br>
For those of you reading this, <font color="Blue"><b>I will do my best to save your time.</b></font>

In [20]:
import pandas as pd

train = pd.read_csv("D:\edge\train.csv")
test = pd.read_csv("D:\‪edge\test.csv")

OSError: [Errno 22] Invalid argument: 'D:\\edge\train.csv'

In [None]:
train.describe()

In [None]:
def compare_columns(df1, df2):
    df1_columns_set = set(df1.columns)
    df2_columns_set = set(df2.columns)
    print('df1_columns_set - df2_columns_set :', df1_columns_set - df2_columns_set)
    print('df2_columns_set - df1_columns_set :', df2_columns_set - df1_columns_set)

# EDA and Preprocessing

In [None]:
import numpy as np

train["LogSalePrice"] = train['SalePrice'].apply(np.log)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_two(df, feat1_name, feat2_name, shape):
    plt.figure(figsize=shape)
    sns.regplot(data=df, x=feat1_name, y=feat2_name, scatter_kws={'alpha':0.2})
    plt.title(feat1_name+' vs '+feat2_name, fontsize=14)
    plt.show()

In [None]:
feat1_name, feat2_name = 'TotalBsmtSF', 'SalePrice'
plot_two(train, feat1_name, feat2_name, (24,6))

In [None]:
feat1_name, feat2_name = 'TotalBsmtSF', 'LogSalePrice'
plot_two(train, feat1_name, feat2_name, (24,6))

In [None]:
# determine the threshold for missing values
def calc_percent_missing(df):
    nan_percent = {col : df[col].isnull().mean() for col in df.columns}
    return nan_percent

In [None]:
nan_percent = calc_percent_missing(train)
nan_percent = pd.DataFrame(sorted(nan_percent.items(), key=lambda x: x[1], reverse=True))
nan_percent = nan_percent[nan_percent[1] >= 0.005]

In [None]:
import seaborn as sb

correlation_train=train[train.dtypes[train.dtypes != 'object'].index].corr()
sb.set(font_scale=2)
plt.figure(figsize = (50,35))
ax = sb.heatmap(correlation_train, annot=True,annot_kws={"size": 25},fmt='.1f',cmap='PiYG', linewidths=.5)

In [None]:
#box plot overallqual/saleprice
var = 'OverallQual'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);

In [None]:
num_cols = [col for col in train.columns if train[col].dtype in ["float16","float32","float64", "int64", "int32"]]
cat_cols = [col for col in train.columns if train[col].dtype not in ["float16","float32","float64", "int64", "int32"]]

In [None]:
#missing data
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
df_train = train.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)
df_test = test.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)

In [None]:
# Checking the features with NaN remained out
def replace_nan_values(df):
    for col in df:
        df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
replace_nan_values(df_train)

In [None]:
#missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
compare_columns(df_train,df_test)

In [None]:
data = df_train.drop(columns=['SalePrice', 'LogSalePrice'])

In [None]:
compare_columns(data,df_test)

In [None]:
data_train_test = pd.concat([data, df_test])

In [None]:
data_train_test["SqFtPerRoom"] = data_train_test["GrLivArea"] / (data_train_test["TotRmsAbvGrd"] + data_train_test["FullBath"] + data_train_test["HalfBath"] + data_train_test["KitchenAbvGr"])
data_train_test['Total_Home_Quality'] = data_train_test['OverallQual'] + data_train_test['OverallCond']
data_train_test['Total_Bathrooms'] = data_train_test['FullBath'] + 0.5 * data_train_test['HalfBath'] + data_train_test['BsmtFullBath'] + 0.5 * data_train_test['BsmtHalfBath']
data_train_test["HighQualSF"] = data_train_test["1stFlrSF"] + data_train_test["2ndFlrSF"]

In [None]:
# Creating dummy variables from categorical features
data_train_test = pd.get_dummies(data_train_test)

In [None]:
data, df_test = data_train_test[:len(data)], data_train_test[len(data):]

In [None]:
target = df_train['LogSalePrice']
test_id = df_test['Id']

# Modeling

In [None]:
from sklearn.model_selection import RepeatedKFold, KFold, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV

# 5 Fold Cross validation
kf = KFold(n_splits=5, shuffle=True)
cv_scores, cv_std = [], []

In [None]:
# Creation of the RMSE metric:    
def rmse(model):
    return np.sqrt(-cross_val_score(model, data, target, scoring="neg_mean_squared_error", cv=kf))

In [None]:
def apply_learning_algorithm(model):
    score = rmse(model)
    cv_scores.append(score.mean())
    cv_std.append(score.std())

In [None]:
from lightgbm                import LGBMRegressor
from sklearn.svm             import SVR
from sklearn.metrics         import mean_squared_error, mean_absolute_error, mean_squared_log_error
# import warnings
# warnings.simplefilter('ignore')

models = [LGBMRegressor(objective='regression',
                        num_leaves=966,
                        learning_rate=0.01, 
                        n_estimators=920,#720
                        max_bin = 55, 
                        bagging_fraction = 0.8,
                        bagging_freq = 5, 
                        feature_fraction = 0.2319,
                        feature_fraction_seed=9, 
                        bagging_seed=9,
                        min_data_in_leaf =6, 
                        min_sum_hessian_in_leaf = 11),
        SVR(kernel='rbf', C=1000000, epsilon=0.001)]

In [None]:
model_names = ['LGBMRegressor','SupportVectorRegressor']

In [None]:
for model in models:
    apply_learning_algorithm(model)

In [None]:
cv_scores

In [None]:
cv_std

In [None]:
final_cv_score = pd.DataFrame(model_names, columns = ['Regressors'])
final_cv_score['RMSE_mean'] = cv_scores
final_cv_score['RMSE_std'] = cv_std
final_cv_score

In [None]:
# Train-Test split the data
x_train, x_validation, y_train, y_validation = train_test_split(data, target, test_size = 0.1)

In [None]:
best_regressor_name = final_cv_score.sort_values(by=['RMSE_mean']).head(1)['Regressors'].iloc[0]
best_regressor = models[model_names.index(best_regressor_name)]
best_regressor

In [None]:
# The Best Regressor
best_model = best_regressor.fit(x_train, y_train)

In [None]:
# Creation of the RMSE metric:    
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
pred = best_model.predict(x_validation)
score = rmse(y_validation, pred)
score

# Submission

In [None]:
test_pred = best_model.predict(df_test)
submission = pd.DataFrame(test_id, columns = ['Id'])
test_pred = np.expm1(test_pred)
submission['SalePrice'] = test_pred 
submission.head()
submission.to_csv("submission.csv", index = False, header = True)