In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
# Show every column, and at most 100 rows, whenever a DataFrame is displayed
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the datasets using Pandas built in read_csv function

In [None]:
# Read the training and test CSV files into two DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/home-data-for-ml-course/train.csv',
                     '/kaggle/input/home-data-for-ml-course/test.csv')
)

# Introduction to the training dataset

According to the dataset's author, there are 23 nominal (unordered) and 23 ordinal (ordered, e.g. Overall Condition) categorical variables, 20 continuous variables and 14 discrete variables. The **nominal** variables typically identify various types of dwellings, garages, materials, and environmental conditions, while the **ordinal** variables typically rate various items within the property.

So what are they? We can identify these variables simply by reading the documentation, and a quick look at the dataset helps to tell the continuous and discrete variables apart. Note that OverallQual, OverallCond and MSSubClass are stored as numbers, but all three are in fact categorical variables.

In [None]:
# First five rows of the training set, to get a feel for the columns
train.head(n=5)

In [None]:
# Count the object-dtype (non-numeric) columns, then all remaining columns
n_object_cols = train.select_dtypes(include='object').shape[1]
n_other_cols = train.select_dtypes(exclude='object').shape[1]
print(n_object_cols, n_other_cols, sep='\n\n')

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
train.describe(include=None)

In [None]:
# Names of the 23 nominal (unordered) categorical columns
nominal_variables = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'GarageType',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

In [None]:
# Names of the 23 ordinal (ordered) categorical columns
ordinal_variables = [
    'LotShape', 'Utilities', 'LandSlope', 'OverallQual',
    'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'PavedDrive',
]

In [None]:
# Names of the 20 continuous columns (the SalePrice target included)
continuous_variables = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice',
]

In [None]:
# Names of the 14 discrete (count / year / month) columns
discrete_variables = [
    'YearBuilt', 'YearRemodAdd', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
    'MoSold', 'YrSold',
]

# Checking for outliers specifically in the GrLivArea column
As recommended by the dataset's author, observations with GrLivArea > 4000 should be removed as outliers.
Let's draw a quick scatter plot to visualize them. A few outliers are easy to spot in the plot below, so let's drop them from the training set.

In [None]:
# import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline, below the cell that draws them
%matplotlib inline
# Use seaborn's 'darkgrid' style for all plots that follow
sns.set(style='darkgrid')

In [None]:
# Plot scatter plot of SalePrice vs. GrLivArea, with the GrLivArea = 4000
# outlier cut-off drawn as a red vertical line.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# arguments, while the keyword form also works on older releases.
plt.figure(figsize=(10,8))
sns.scatterplot(x=train['GrLivArea'], y=train['SalePrice'])
plt.axvline(x=4000, c='r', linewidth=2)
plt.title('Scatter Plot of Sale Price vs. Ground Living Area')
plt.show()

In [None]:
# Remove the training rows whose GrLivArea exceeds 4000, then report both shapes
outlier_rows = train.index[train['GrLivArea'] > 4000]
train = train.drop(index=outlier_rows)
print(train.shape, test.shape, sep='\n\n')

# Analyzing correlation between target variable and numerical features

From the heatmap below we can clearly see that the variables, OverallQual (0.8), GrLivArea (0.72) and TotalBsmtSF (0.65) show significant correlation with SalePrice.

In [None]:
# Highly correlated features.
# The correlation matrix is computed on the numeric columns only: since
# pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and raises on the text columns of `train`.
correlation = train.select_dtypes(include='number').corr()
# Keep the columns whose absolute correlation with SalePrice is above 0.5
# (SalePrice itself is part of that selection)
top_correlation = correlation.index[correlation["SalePrice"].abs() > 0.5]
plt.figure(figsize=(10,10))
g = sns.heatmap(train[top_correlation].corr(), annot=True, cmap="coolwarm")

# Analyzing and dealing with skewed data when conducting regression analyses (target variable)

Lets first analyze our target variable, which is the SalePrice and see what information that we can gather. For this we can utilize seaborn's distplot function to view the histogram and scipy's stats probplot to view the probability plot. 

*For the most part, the normal P-P plot is better at finding deviations from normality in the center of the distribution, and the normal Q-Q plot is better at finding deviations in the tails. Q-Q plots tend to be preferred in research situations. Both Q-Q and P-P plots can be used for distributions other than normal.*

In [None]:
# Plot histogram and probability plot before log transform.
# sns.histplot(kde=True, stat='density') stands in for the deprecated
# sns.distplot: density-scaled bars with a KDE curve drawn on top.
plt.figure(figsize=(8,10))
plt.subplot(2,1,1)
sns.histplot(train['SalePrice'], bins=30, kde=True, stat='density')
plt.axvline(x=train['SalePrice'].mean(), c='k', linewidth=2)  # mean sale price
plt.title('Histogram of Sale Prices')
plt.show()

# Probability plot of the sample quantiles against a normal distribution
plt.figure(figsize=(10,10))
plt.subplot(2,2,1)
stats.probplot(train['SalePrice'], plot=plt)
plt.title('Probability plot of Sale Prices')
plt.show()

In [None]:
# Sample skewness of the target column
train.loc[:, 'SalePrice'].skew()

We can see that the distribution of SalePrice is heavily right-skewed (it shows a long right tail). Since this project relies on machine learning models that assume normally distributed data, we cannot simply assume that the data we are working with is normally distributed. A good approach is to transform the skewed data. A transformation is useful to stabilize the variance and make the data more normal-distribution-like, which improves the validity of measures of association. Examples of such transformations are the log transformation, the Box-Cox transformation and the square-root transformation.

We can apply a log transform to the target variable and plot the histogram and probability plot again. From the histogram and probability plots below we can see that the sale price is now much closer to a normal distribution, with a skewness of only roughly 0.065. One thing to note is that it is very **IMPORTANT** to also transform the numeric features that are skewed.

We will compare log transformation with box-cox transformation and we will choose the method that yields the better result.

In [None]:
# Copy of the training set whose SalePrice is replaced by its natural log
train_copy1 = train.assign(SalePrice=np.log(train['SalePrice']))

In [None]:
# boxcox1p computes the Box-Cox transform of 1 + x
from scipy.special import boxcox1p

# The choice of lambda is not explored here; it controls how strongly the
# transform changes the skewness of the data
lam = 0.15
# Copy of the training set whose SalePrice is Box-Cox transformed
train_copy2 = train.assign(SalePrice=boxcox1p(train['SalePrice'], lam))

In [None]:
# Plot histogram and probability plots of both transformed targets.
# All four panels are drawn into ONE 2x2 figure: histograms on the top row,
# probability plots on the bottom row, log transform on the left and Box-Cox
# on the right. (Opening a new plt.figure before every subplot scattered the
# panels over four separate, mostly empty figures.)
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
fig, axes = plt.subplots(2, 2, figsize=(20, 10))

transformed_prices = [('Log', train_copy1['SalePrice']),
                      ('Box-Cox', train_copy2['SalePrice'])]

for column, (label, prices) in enumerate(transformed_prices):
    hist_ax, prob_ax = axes[0, column], axes[1, column]

    sns.histplot(prices, bins=30, kde=True, stat='density', ax=hist_ax)
    hist_ax.axvline(prices.mean(), c='k', linewidth=2)  # mean of the transformed prices
    hist_ax.set_title('Histogram of {} Transformed Sale Prices'.format(label))

    # probplot accepts a matplotlib Axes as its `plot` target
    stats.probplot(prices, plot=prob_ax)
    prob_ax.set_title('Probability plot of {} Transformed Sale Prices'.format(label))

fig.tight_layout()
plt.show()

In [None]:
# Skewness of the target after each of the two transformations
log_skew, bc_skew = (copy['SalePrice'].skew() for copy in (train_copy1, train_copy2))

print(f'Log Transform: {log_skew:.3f}\nBox-Cox Transform: {bc_skew:.3f}')

It seems that the log transformation yields a much better (lower) skew. We will proceed with this method for the dependent variable.

In [None]:
# Replace the target by its natural log.
# NOTE: re-running this cell applies the log a second time.
train = train.assign(SalePrice=np.log(train['SalePrice']))

# Handling the null values in both datasets

**TRAIN SET**
- PoolQC, MiscFeature, Alley, Fence, FireplaceQu, GarageCond, GarageType, GarageFinish, GarageQual, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType2, BsmtFinType1, MasVnrType have 'NA' as an option
- LotFrontage has roughly 18% of nan values. We cannot drop this column as it holds some correlation with LotArea (0.426095) and SalePrice (0.351799). Intrinsically we can groupby the neighbourhood and compute the median value.
- In BsmtExposure, there is one row where the exposure is NaN even though the house has a basement, so we will replace this value with 'No'.
- In BsmtFinType2, there is one row where the value is NaN even though the house has a basement. This cell will be replaced with the mode ('Unf').
- Replace all nan values in MasVnrArea to zero.
- Replace missing Electrical row with the mode. ('SBrkr')

**TEST SET**

- LotFrontage has roughly 16% of nan values. We cannot drop this column as it holds high correlation with LotArea (0.644608). We can use the same way to fill the missing rows.
- Replace missing MSZoning rows with the mode. ('RL')
- Replace missing cells in BsmtHalfBath and BsmtFullBath in test set with zero.
- Replace missing cells in Utilities in test set with 'AllPub'.
- Replace missing cells in Functional in test set with 'Typ'. (Assume typical unless deductions are warranted)
- Replace missing cells in BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF with 0 as there is no basement recorded in (ID = 2121).
- For ID = 2218 & 2219, replace BsmtQual missing cells with the mode groupby BsmtCond. ('TA')
- For ID = 2041, 2186 & 2525, replace BsmtCond missing cells with the mode groupby BsmtQual. ('TA')
- For ID = 1488 & 2349, replace BsmtExposure missing cells with 'No'.
- Replace missing cells in Exterior1st and Exterior 2nd with the mode. ('VinylSd')
- Replace missing cell in SaleType with the mode. ('WD')
- Replace missing cell in KitchenQual with the mode. ('TA')
- GarageType has 76 null valued cells while GarageCond has 78 null valued cells. Locate the IDs for the 2 cells
    - ID 2577
    - ID 2127
- For ID = 2127, the GarageYrblt will be replaced with the year of the house was built, GarageFinish will be filled with the mode groupby GarageType, GarageQual and GarageCond will be replaced with the mode grouped by GarageYrBlt and GarageType.
- For ID = 2577, the GarageYrblt will be replaced with the year of the house was built, GarageFinish will be filled with the mode groupby GarageType, GarageQual and GarageCond will be replaced with the mode grouped by GarageYrBlt and GarageType. Replace missing cells in GarageArea and GarageCars with the median values based on the GarageYrblt, GarageFinish, GarageQual and GarageCond. 
- Replace missing MasVnrType cell in ID = 2611 with the mode groupby on MasVnrArea.

In [None]:
# The 19 columns of the train set with the most missing values
train_null_counts = train.isnull().sum()
train_null_counts.sort_values(ascending=False).head(19)

In [None]:
# The 33 columns of the test set with the most missing values
test_null_counts = test.isnull().sum()
test_null_counts.sort_values(ascending=False).head(33)

# Null in train data

In [None]:
# Rows that do have a basement quality recorded, i.e. a basement exists
has_basement = train['BsmtQual'].notnull()

# A basement with a missing exposure is labelled 'No'
train.loc[train['BsmtExposure'].isnull() & has_basement, 'BsmtExposure'] = 'No'

# A basement with a missing second finish type is labelled 'Unf'
train.loc[train['BsmtFinType2'].isnull() & has_basement, 'BsmtFinType2'] = 'Unf'

In [None]:
# Columns in which a missing value is turned into the explicit category 'NA'.
# 'GarageYrBlt' is deliberately not listed here: it is numeric and is filled
# with zeros instead.
col_to_fill_NA = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1',
    'MasVnrType',
]

train = train.fillna(dict.fromkeys(col_to_fill_NA, 'NA'))

In [None]:
# Per-column replacement values for the remaining gaps in the train set
train = train.fillna({
    'GarageYrBlt': 0,
    'MasVnrArea': 0,
    'Electrical': 'SBrkr',
})

In [None]:
# Fill missing LotFrontage values with the median LotFrontage of the same
# Neighborhood.
# transform() returns a Series aligned to train's own index. groupby().apply()
# prepends the group key to the index in pandas >= 2.0, which makes the
# column assignment fail.
train['LotFrontage'] = (train
                        .groupby('Neighborhood')['LotFrontage']
                        .transform(lambda x: x.fillna(x.median()))
                       )

# Null in test data

In [None]:
# Index labels of the two test rows (Id 2127 and Id 2577) that get special treatment
id1 = test.index[test['Id'] == 2127]
id2 = test.index[test['Id'] == 2577]

In [None]:
# Repair the garage fields of the row selected by id1.

# A missing garage year is replaced by the year the house itself was built
test.loc[id1, 'GarageYrBlt'] = test.loc[id1, 'GarageYrBlt'].fillna(test.loc[id1, 'YearBuilt'].values[0])

# Each remaining categorical field is filled with the most frequent value of
# the rows that share the listed grouping columns.
# - transform() keeps the result aligned to test's index; groupby().apply()
#   prepends the group keys in pandas >= 2.0 and breaks the .loc assignment.
# - a group without any observed value has an empty mode(); it is left as NaN
#   instead of raising IndexError.
garage_mode_fills = [
    ('GarageFinish', ['GarageType']),
    ('GarageQual', ['GarageYrBlt', 'GarageType']),
    ('GarageCond', ['GarageYrBlt', 'GarageType']),
]
for target_col, group_cols in garage_mode_fills:
    test.loc[id1, target_col] = (test
                                 .groupby(group_cols)[target_col]
                                 .transform(lambda x: x if x.mode().empty else x.fillna(x.mode().iloc[0]))
                                )

In [None]:
# Repair the garage fields of the row selected by id2.

# A missing garage year is replaced by the year THIS house was built.
# (The fill value used to be read from the id1 row, a copy-paste slip.)
test.loc[id2, 'GarageYrBlt'] = test.loc[id2, 'GarageYrBlt'].fillna(test.loc[id2, 'YearBuilt'].values[0])

# Categorical fields: most frequent value among rows sharing the grouping columns.
# - transform() keeps the result aligned to test's index; groupby().apply()
#   prepends the group keys in pandas >= 2.0 and breaks the .loc assignment.
# - a group without any observed value has an empty mode(); it is left as NaN
#   instead of raising IndexError.
garage_mode_fills = [
    ('GarageFinish', ['GarageType']),
    ('GarageQual', ['GarageYrBlt', 'GarageType']),
    ('GarageCond', ['GarageYrBlt', 'GarageType']),
]
for target_col, group_cols in garage_mode_fills:
    test.loc[id2, target_col] = (test
                                 .groupby(group_cols)[target_col]
                                 .transform(lambda x: x if x.mode().empty else x.fillna(x.mode().iloc[0]))
                                )

# Numeric fields: median among garages with the same year, finish, quality
# and condition (computed after the categorical fields above are filled).
garage_profile = ['GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']
for target_col in ('GarageCars', 'GarageArea'):
    test.loc[id2, target_col] = (test
                                 .groupby(garage_profile)[target_col]
                                 .transform(lambda x: x.fillna(x.median()))
                                )
    # If no comparable garage exists the group median is NaN; fall back to the
    # column-wide median so no gap is left in a numeric feature.
    test.loc[id2, target_col] = test.loc[id2, target_col].fillna(test[target_col].median())

In [None]:
# Rows with a veneer area recorded but no veneer type
id3 = test[test['MasVnrArea'].notnull() & test['MasVnrType'].isnull()].index
# Fill the type with the most frequent type among rows with the same area.
# - transform() keeps the result aligned to test's index; groupby().apply()
#   prepends the group key in pandas >= 2.0 and breaks the .loc assignment.
# - an area group without any observed type has an empty mode(); it is left
#   as NaN instead of raising IndexError.
test.loc[id3, 'MasVnrType'] = (test
                               .groupby('MasVnrArea')['MasVnrType']
                               .transform(lambda x: x if x.mode().empty else x.fillna(x.mode().iloc[0]))
                              )

In [None]:
# Per-column replacement values for the simple gaps in the test set
test = test.fillna({
    'BsmtHalfBath': 0,
    'BsmtFullBath': 0,
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
    'MSZoning': 'RL',
    'Utilities': 'AllPub',
    'Functional': 'Typ',
    'SaleType': 'WD',
    'KitchenQual': 'TA',
    'MasVnrArea': 0,
})

In [None]:
# The row with Id 2121: its missing basement area fields are set to zero
id4 = test.index[test['Id'] == 2121]
basement_area_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
test.loc[id4, basement_area_cols] = test.loc[id4, basement_area_cols].fillna(0)

In [None]:
# Locate the rows where exactly one of BsmtQual / BsmtCond is missing
# (XOR of the two null masks: one is null while the other is not)
id5 = test[test['BsmtCond'].isnull() ^ test['BsmtQual'].isnull()].index

# Fill whichever of the two is missing with 'TA'.
# Each column is filled from ITSELF: BsmtCond used to be assigned from
# BsmtQual, which overwrote valid BsmtCond values with the BsmtQual ones.
test.loc[id5, 'BsmtQual'] = test.loc[id5, 'BsmtQual'].fillna('TA')
test.loc[id5, 'BsmtCond'] = test.loc[id5, 'BsmtCond'].fillna('TA')

In [None]:
# Rows with a basement quality recorded but no exposure: label the exposure 'No'
id6 = test.index[test['BsmtExposure'].isnull() & test['BsmtQual'].notnull()]
test.loc[id6, 'BsmtExposure'] = 'No'

In [None]:
# Turn the remaining gaps of the listed categorical columns into the category 'NA'
test = test.fillna(dict.fromkeys(col_to_fill_NA, 'NA'))

In [None]:
# Remaining missing garage years are set to zero
test = test.fillna({'GarageYrBlt': 0})

In [None]:
# Fill missing LotFrontage values with the median LotFrontage of the same
# Neighborhood.
# transform() returns a Series aligned to test's own index. groupby().apply()
# prepends the group key to the index in pandas >= 2.0, which makes the
# column assignment fail.
test['LotFrontage'] = (test
                       .groupby('Neighborhood')['LotFrontage']
                       .transform(lambda x: x.fillna(x.median()))
                      )

In [None]:
# Remaining null counts per column: train first, then test
print(train.isnull().sum(), test.isnull().sum(), sep='\n\n')

# So now we have sorted all the null values, we can then proceed to create new features
House prices are mostly affected by the location, condition, size and type of the house, the facilities it provides, the year it was built and the year it was sold. Based on these intrinsic features, which we can think of from a very basic point of view, we can now create and simplify the features that we need.

To make things easier, we are going to combine both train and test datasets for this part of the analysis and then separate them again before fitting it to our machine learning models. We can actually perform this step before starting the data cleaning as well. I personally prefer to work the datasets separately when handling null values.

We should also convert the categorical variables - ['MSSubClass', 'OverallQual', 'OverallCond'] from integers to string type. The year and month sold variables are converted into string type as well.

In [None]:
# Combine both data sets (train rows first, then test rows) and drop the 'Id' column.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of train and test would collide.
df = pd.concat([train, test], ignore_index=True)
df = df.drop('Id', axis=1)

In [None]:
# Cast the integer-coded categorical columns and the sale month / year to str
str_columns = ['MSSubClass', 'OverallQual', 'OverallCond', 'MoSold', 'YrSold']
df = df.astype(dict.fromkeys(str_columns, str))

In [None]:
# Aggregate related columns into totals and add 0/1 indicator flags
df = df.assign(
    # summed areas / counts
    TotalHouseSF=lambda d: d['TotalBsmtSF'] + d['1stFlrSF'] + d['2ndFlrSF'],
    TotalPorchSF=lambda d: d['OpenPorchSF'] + d['EnclosedPorch'] + d['3SsnPorch'] + d['ScreenPorch'],
    TotalBathrooms=lambda d: d['BsmtFullBath'] + d['BsmtHalfBath'] + d['FullBath'] + d['HalfBath'],
    # 1 when the underlying quantity is greater than zero, else 0
    HasPool=lambda d: (d['PoolArea'] > 0).astype(int),
    HasGarage=lambda d: (d['GarageArea'] > 0).astype(int),
    HasBasement=lambda d: (d['TotalBsmtSF'] > 0).astype(int),
    HasFireplace=lambda d: (d['Fireplaces'] > 0).astype(int),
    HasWoodDeck=lambda d: (d['WoodDeckSF'] > 0).astype(int),
)

In [None]:
# Columns that were folded into the aggregated totals and are no longer needed
drop_col = [
    '1stFlrSF', '2ndFlrSF',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
]
df = df.drop(columns=drop_col)

# Lets move on to checking skewness found in the feature variables



In [None]:
# Keep only the non-object (numeric) columns
num_df = df.select_dtypes(exclude = 'object')

# Skewness of every numeric column, largest first, as a one-column table
num_skew = num_df.apply(pd.Series.skew).sort_values(ascending=False)
skew_df = num_skew.to_frame(name='Skew')
skew_df

# We can then perform box-cox transformation on the independent variables as well.

Only select the features with relatively high skew (>0.5) and exclude the new features that we created.

In [None]:
# Features whose skewness is larger than 0.5 in absolute value
high_skew_df = skew_df[skew_df['Skew'].abs() > 0.5]

# Leave out the indicator flags created earlier and the year columns
exclude_features = ['HasPool', 'HasGarage', 'HasBasement', 'HasFireplace', 'HasWoodDeck', 'YearBuilt', 'GarageYrBlt']
high_skew_df = high_skew_df.drop(index=exclude_features, errors='ignore')
high_skew_features = high_skew_df.index

# Box-Cox transform (of 1 + x) with a fixed lambda on every remaining feature
lam = 0.15
for skewed_feature in high_skew_features:
    df[skewed_feature] = boxcox1p(df[skewed_feature], lam)

# Perform label encoding to all ordinal variables

In [None]:
# perform label encoding
from sklearn.preprocessing import LabelEncoder

for col in ordinal_variables:
    # LabelEncoder assigns codes in sorted order of the values. Columns that
    # hold numbers as strings (OverallQual / OverallCond after the str cast)
    # would be sorted lexicographically, ranking '10' between '1' and '2'.
    # Such columns are encoded from their numeric value so the order is kept.
    numeric_levels = pd.to_numeric(df[col], errors='coerce')
    if numeric_levels.notna().all():
        df[col] = LabelEncoder().fit_transform(numeric_levels)
    else:
        # NOTE(review): text grades are still encoded in alphabetical order,
        # which is not their quality order; an explicit mapping per column
        # would be needed to make these codes truly ordinal.
        df[col] = LabelEncoder().fit_transform(df[col])

In [None]:
# First five rows of the combined frame
df.head(n=5)

# Perform One Hot Encoding to all nominal variables

In [None]:
# One-hot encode every nominal column and replace it by its indicator columns
from sklearn.preprocessing import OneHotEncoder

# The nominal columns, extended by the sale month and year
nominal_variables = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'GarageType',
    'MiscFeature', 'SaleType', 'SaleCondition',
    'MoSold', 'YrSold',
]

encoded_features = []
for nominal_col in nominal_variables:
    # the encoder expects a 2-D input: one column, one row per observation
    column_values = df[nominal_col].values.reshape(-1, 1)
    indicator_matrix = OneHotEncoder().fit_transform(column_values).toarray()

    # indicator columns are named <column>_1 ... <column>_k
    level_count = df[nominal_col].nunique()
    indicator_names = ['{}_{}'.format(nominal_col, level) for level in range(1, level_count + 1)]

    encoded_features.append(
        pd.DataFrame(indicator_matrix, columns=indicator_names, index=df.index)
    )

df = pd.concat([df] + encoded_features, axis=1).drop(nominal_variables, axis=1)



In [None]:
# First five rows of the fully encoded frame
df.head(n=5)

# So now we have our dataset ready, we will fit several regression models and predict the housing sale prices.

The models used are:
* LinearRegression
* Lasso
* Ridge
* RandomForestRegressor
* GradientBoostingRegressor

The models are kept at their default settings, as I will not go into the details of parameter tuning here. Tuning can be done using GridSearchCV, and I will probably do so in the near future.

In [None]:
# Now we can split the data into train and test sets again.
# The train rows were placed first when the frames were combined, so the split
# point is simply the number of train rows. Deriving it from `train` (instead
# of hard-coding 1456) keeps the split right if the outlier rule changes.
n_train_rows = len(train)
df_train = df.iloc[:n_train_rows]
df_test = df.iloc[n_train_rows:]

assert len(df_test) == len(test), 'combined frame does not split back into train and test'

In [None]:
# Target vector and the two feature matrices (SalePrice removed from both)
y_train = df_train['SalePrice']
X_train, X_test = (part.drop(columns='SalePrice') for part in (df_train, df_test))

In [None]:
# Both feature matrices must report the same number of columns
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# import necessary libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split

In [None]:
# Create one instance of every imported regression model.
# The two tree ensembles are stochastic; a fixed random_state makes their
# cross-validation scores reproducible from one run to the next.
RANDOM_STATE = 42

lin = LinearRegression()
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor(random_state=RANDOM_STATE)
gb = GradientBoostingRegressor(random_state=RANDOM_STATE)

In [None]:
# Define a function to calculate the cross-validated RMSE of a model

kfold = KFold(n_splits=10)
def rmsle_cv(model):
    """Return the per-fold RMSE of `model` under 10-fold cross-validation.

    The model is scored on the notebook-level X_train / y_train with the
    'neg_mean_squared_error' scorer; the absolute value of each fold score
    is taken before the square root.
    """
    folds = KFold(n_splits=10)
    neg_mse_per_fold = cross_val_score(model, X_train, y_train,
                                       scoring="neg_mean_squared_error", cv=folds)
    return np.sqrt(np.abs(neg_mse_per_fold))

In [None]:
# Linear Regression: mean RMSE over the folds
score = rmsle_cv(lin)
print(f'Linear Regression score: {score.mean():.3f}')

In [None]:
# Ridge Regression: mean RMSE over the folds
score = rmsle_cv(ridge)
print(f'Ridge Regression score: {score.mean():.3f}')

In [None]:
# Lasso Regression: mean RMSE over the folds
score = rmsle_cv(lasso)
print(f'Lasso Regression score: {score.mean():.3f}')

In [None]:
# Gradient Boosting: mean RMSE over the folds
score = rmsle_cv(gb)
print(f'Gradient Boosting score: {score.mean():.3f}')

In [None]:
# Random Forest: mean RMSE over the folds
score = rmsle_cv(rf)
print(f'Random Forest score: {score.mean():.3f}')

So it seems like ridge regression is performing the best with the lowest RMSE. We will proceed with fitting the model and predict the housing sale prices.

In [None]:
# Fit the ridge model on the full training set (fit() hands back the fitted estimator)
ridge_model = ridge.fit(X=X_train, y=y_train)

# Predicted (still transformed) sale prices for the test set
X_pred = ridge_model.predict(X=X_test)

Now we have to transform the predicted sale prices back to their initial scale.

In [None]:
# Undo the target transform. SalePrice was transformed with np.log, whose
# inverse is np.exp. np.expm1 is the inverse of np.log1p and would return
# every price one unit too low.
# NOTE: re-running this cell exponentiates the predictions a second time.
X_pred = np.exp(X_pred)

In [None]:
# Build the submission table (one predicted price per test Id) and save it as CSV

output = pd.DataFrame({'Id': test['Id'], 'SalePrice': X_pred})
output.to_csv('submission.csv', index=False)