<a href="https://colab.research.google.com/github/hfernandescfc/House_Predictions/blob/main/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm, skew #for some statistics
pd.set_option('display.max_columns', None)
import missingno as msno


In [2]:
# Location of the Kaggle CSV files (Colab's default working directory) and their names.
path, train, test = '/content/', 'train.csv', 'test.csv'

In [3]:
# Read both CSVs, then work exclusively on copies so the raw frames stay untouched.
train_df, test_df = (pd.read_csv(path + file_name) for file_name in (train, test))

copy_train, copy_test = train_df.copy(), test_df.copy()

# Convenience list for steps that must be applied to both data sets.
dfs_copied = [copy_train, copy_test]

FileNotFoundError: ignored

# FRAMEWORK

1. Understand Data
2. Data Preprocessing
3. EDA
4. Feature Engineering*
5. Model Deployment
6. Evaluate
7. Redeploy

## Understanding Data

In [None]:
# Report the shape of each working copy, each followed by a horizontal rule.
separator = "\n" + "-" * 100

print(f"Train data frame shape:{copy_train.shape}")
print(separator)
print(f"Test data frame shape:{copy_test.shape}")
print(separator)

In [None]:
print(copy_train.columns)
print('\n')
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
copy_train.info()

In [None]:
# Preview the first rows of the training copy (rendered as the cell output).
copy_train.head()

## Basic Cleaning

In [None]:
dtypes = ['category', 'int', 'float64']

# Drop the row identifier from both frames in place. The result of an in-place
# drop is None, so it must not be assigned back; errors='ignore' keeps the cell
# re-runnable once 'Id' is already gone.
for df in dfs_copied:
  df.drop(columns='Id', inplace=True, errors='ignore')


# Text columns (still object dtype at this point) are the categorical features.
Categorical = copy_train.select_dtypes(include='object')

# Everything that is not a text column; compare against the column *names*.
Numerical = copy_train.columns[~copy_train.columns.isin(Categorical.columns)]

# Cast the text columns to pandas' category dtype in both frames.
for d in dfs_copied:
  for c in Categorical.columns:
    d[c] = d[c].astype('category')

## Dealing with Missing data

In [None]:
# Count nulls per column once, then derive the absolute and relative views from it.
null_counts = copy_train.isnull().sum()

missing_totals = null_counts.sort_values(ascending=False)

# Fraction (0-1) of rows that are missing for each column.
missing_percentuals = (null_counts / len(copy_train)).sort_values(ascending=False)

missing_df = pd.concat(
    [missing_totals, missing_percentuals],
    axis=1,
    keys=['Values Missing', 'Percent Missing'],
)

# Only columns with at least one missing value are of interest.
has_missing = missing_df['Values Missing'] > 0

missing_columns = missing_df[has_missing].index

display(missing_df[has_missing])

print('\n')

print(missing_columns)

Although it is good practice to keep as much data as possible, I decided to remove the features with more than 50% of their values missing.

In [None]:
# Maximum tolerated fraction of missing values per feature.
Threeshold = 0.5

# Features whose missing fraction exceeds the limit are removed from the training copy.
over_limit = missing_df['Percent Missing'] > Threeshold
Beyond_Threeshold = missing_df.loc[over_limit].index

copy_train = copy_train.drop(columns=Beyond_Threeshold)

print(copy_train.columns)



In [None]:
dtypes = ['category', 'int', 'float']

# copy_train was rebound to a new frame when the sparse features were dropped,
# so refresh the helper list; otherwise it keeps pointing at the old frame.
dfs_copied = [copy_train, copy_test]

# Text features may already have been cast to 'category' by the cleaning cell,
# so look for both dtypes when rebuilding the categorical/numerical split.
Categorical = copy_train.select_dtypes(include=['object', 'category'])

# Compare against the column *names*, not against the DataFrame itself.
Numerical = copy_train.columns[~copy_train.columns.isin(Categorical.columns)]

# Make sure every categorical feature uses the category dtype in both frames
# (a no-op for columns that were already converted).
for d in dfs_copied:
  for c in Categorical.columns:
    d[c] = d[c].astype('category')

Now, I'll proceed to analyse each feature individually in order to preserve as much information as I can

In [None]:
def order_mapping(df, columns, dict):
  """Replace an ordinal text column with numeric scores, in place.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that is modified in place.
  columns : str
      Name of the single column to encode.
  dict : mapping
      Label -> score lookup. Missing values are treated as the label "NA";
      labels that are absent from the mapping receive the score 0.

  Notes
  -----
  * Works for both categorical and plain object/string columns.
  * The result is always a regular numeric column (int64 when every score is
    a whole number), never a categorical one.
  * Calling it again on an already-encoded (numeric) column is a no-op, so the
    notebook cell can be re-run safely.
  """
  column = df[columns]

  # Already encoded on a previous run: mapping again would zero everything out.
  if pd.api.types.is_numeric_dtype(column):
    return

  # Work on plain Python objects so the outcome does not depend on how
  # Categorical.map decides between a categorical and a plain result.
  labels = column.astype(object)
  labels = labels.where(labels.notna(), "NA")

  scores = labels.map(dict).fillna(0)

  # Unmapped labels force a float result; restore integers when nothing is lost.
  if pd.api.types.is_float_dtype(scores) and scores.eq(scores.round()).all():
    scores = scores.astype('int64')

  df[columns] = scores


### FireplaceQu

In [None]:
# Ordinal quality scale shared by the quality/condition features.
# The data set spells "Excellent" as 'Ex'; a key of 'EX' never matches, which
# would silently score the best grade as 0.
grade_dict = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

# Houses without a fireplace have a missing FireplaceQu and are scored 0.
order_mapping(copy_train, 'FireplaceQu', grade_dict)

fig = px.box(copy_train, x = 'FireplaceQu', y = 'SalePrice', points = 'all')

fig.show()


### LotFrontage

In [None]:
#copy_train.groupby(by='Neighborhood')['LotFrontage'].median()

# Kept only so that any later cell calling figure() keeps working; the figure
# size is now passed to plt.subplots directly.
from matplotlib.pyplot import figure

# Size the two-panel figure itself; a separate figure(...) call would open a
# second, empty figure instead of resizing this one.
fig, axs = plt.subplots(ncols = 2, figsize=(8, 4), dpi=100)

sns.boxplot(data = copy_train, x = 'Neighborhood', y = 'LotFrontage', ax = axs[0])

sns.countplot(data = copy_train, x = 'Neighborhood', ax = axs[1])

# Rotate the existing tick labels. Overwriting them with the Neighborhood
# column would label the ticks with the first rows' values, not the categories.
for ax in axs:
  ax.tick_params(axis='x', labelrotation=90)

fig.tight_layout()

plt.show()

# Impute missing LotFrontage with the median of the house's neighborhood.
copy_train["LotFrontage"] = copy_train.groupby("Neighborhood", observed=True)["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

#copy_train = copy_train.drop(columns='LotFrontage')

### Garage Features

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Ordinal quality scale for the garage grades. The data set spells "Excellent"
# as 'Ex'; a key of 'EX' never matches and would score the best grade as 0.
Garage_dict = {'NA': 0, 'Po':1, 'Fa':2, 'TA' : 3, 'Gd':4, 'Ex' :5}

order_mapping(copy_train, 'GarageCond', Garage_dict)

order_mapping(copy_train, 'GarageQual', Garage_dict)

# Missing build year is filled with 0.
copy_train['GarageYrBlt'] = copy_train['GarageYrBlt'].fillna(0)

enc = OneHotEncoder()

# Nominal garage features: make the missing value an explicit "NA" category.
garages = ['GarageType', 'GarageFinish']

for col in garages:
  # Guard the category insertion so the cell can be re-run without a ValueError.
  if "NA" not in copy_train[col].cat.categories:
    copy_train[col] = copy_train[col].cat.add_categories("NA")
  copy_train[col] = copy_train[col].fillna(value="NA")

### Basement Features

In [None]:
bsmt_cols = ['BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']

# Graded basement features are encoded as ordinal scores.
ord_bsmt = ['BsmtQual', 'BsmtCond']

# Remaining basement features stay nominal; a list comprehension keeps their
# order deterministic (a set difference would not).
bsmt_cat = [col for col in bsmt_cols if col not in ord_bsmt]

for col in ord_bsmt :
  order_mapping(copy_train, col, grade_dict)

# Make the missing value an explicit "NA" category.
for col in bsmt_cat:
  # Guard the category insertion so the cell can be re-run without a ValueError.
  if "NA" not in copy_train[col].cat.categories:
    copy_train[col] = copy_train[col].cat.add_categories("NA")
  copy_train[col] = copy_train[col].fillna(value="NA")

In [None]:
#transformed_copy_train.head()

### Masonry

In [None]:
# Missing veneer type becomes its own "NA" category.
veneer_type = copy_train['MasVnrType'].cat.add_categories("NA")
copy_train['MasVnrType'] = veneer_type.fillna('NA')


def _fill_with_group_median(area):
  """Fill the gaps of one neighborhood's values with that neighborhood's median."""
  return area.fillna(area.median())


# Missing veneer area is imputed with the median of the house's neighborhood.
copy_train['MasVnrArea'] = copy_train.groupby('Neighborhood')['MasVnrArea'].transform(
    _fill_with_group_median)



### Electrical

In [None]:
# Impute missing Electrical entries with the most frequent value of the column.
most_common_electrical = copy_train['Electrical'].mode()[0]
copy_train['Electrical'] = copy_train['Electrical'].fillna(most_common_electrical)

In [None]:
# Remaining null count per column, largest first.
copy_train.isna().sum().sort_values(ascending=False)

No more missing values :)

## EDA

This section is where we can get a better grasp of the data. The goal here is to check the relationship between the features so we can properly build the right model for the kind of data that is available.

In [None]:
# Separate the target from the predictors.
Y_train = copy_train['SalePrice']
X_train = copy_train.drop(columns='SalePrice')

# Split the predictors by dtype for the plots below.
Numerical = X_train.select_dtypes(include=['int', 'float'])
Categorical = X_train.select_dtypes(include='category')



In [None]:
# Summary statistics of the numeric predictors (rendered as the cell output).
Numerical.describe()

In [None]:
# One scatter plot of SalePrice against each numeric predictor.
for _, feature in Numerical.items():
  sns.scatterplot(x=feature, y=Y_train)
  plt.show()

Try to add another column of plots showing the countplot of each categorical feature

In [None]:
# Remove the houses with an above-ground living area larger than 4500.
oversized_rows = copy_train[copy_train['GrLivArea'] > 4500].index
copy_train = copy_train.drop(index=oversized_rows)

In [None]:
# One box plot of SalePrice per level of each categorical predictor.
for feature_name, feature in Categorical.items():
  sns.boxplot(x=feature, y=Y_train)
  plt.title(feature_name)
  plt.xticks(rotation = 90)
  plt.show()

From the scatter and box plots, it seems that there may be some significant outliers present in our data. We will come back to this if our residual plots indicate that they are affecting our models.

In [None]:
# Pearson correlation between the numeric predictors only. Categorical columns
# cannot be converted to floats, and recent pandas versions raise instead of
# silently skipping them.
corr = X_train.select_dtypes(include='number').corr()

plt.rcParams["figure.figsize"] = (20, 10)

# Hide the upper triangle (and the diagonal), which only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

f = sns.heatmap(corr, square = True, cmap="BuGn", linewidths = 0.2, mask = mask)

f.set_title('Pearson Correlation Matrix', fontsize=25)

plt.show()

In [None]:
# Parameters of the normal distribution that best fits the target.
mu, sigma = norm.fit(Y_train)

fig, ax = plt.subplots()

# histplot replaces the deprecated distplot; the density scale makes the
# histogram comparable with the fitted probability density below.
sns.histplot(Y_train, stat='density', kde=True, ax=ax)

# Draw the fitted normal curve explicitly and attach the legend label to it,
# so the label cannot end up on the KDE line.
grid = np.linspace(Y_train.min(), Y_train.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black',
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))

ax.set_title('Sale Price Distribution', fontsize=20)

ax.legend(loc='best')

plt.show()

The histogram of SalePrice shows that the distribution is skewed to the right. Although normality is not an assumption for the linear models that will be deployed in this notebook, it might be beneficial to transform the dependent variable in order to achieve better predictions.

In [None]:
# Correlate numeric columns only; categorical columns cannot be converted to
# floats, and recent pandas versions raise instead of silently skipping them.
numeric_corr = copy_train.select_dtypes(include='number').corr()

# Correlation of every numeric feature with the target, strongest first.
y_corr = numeric_corr['SalePrice'].sort_values(ascending=False)

# Ten most positively correlated predictors, excluding the target's own entry.
top_10_y = y_corr.drop('SalePrice').nlargest(10)

print(top_10_y)

From a simple analysis of the features it is clear that OverallQual and GrLivArea are the predictors with the highest linear correlation to SalePrice. 

### Feature Engineering

I will not try to do anything fancy here. Although I believe I could achieve better results by combining some of the predictors, the focus here will be on adjusting the data to fit our linear models.

In [None]:
# DataFrame.info() prints its report itself and returns None; passing that
# None to display() only adds a stray "None" to the output.
copy_train.info()

In [None]:
# Force BsmtCond to a plain 64-bit integer column.
# NOTE(review): presumably needed because order_mapping left this column with a
# non-integer dtype - confirm against the copy_train.info() output above.
copy_train['BsmtCond'] = copy_train['BsmtCond'].astype('int64')

In [None]:
# Encode the remaining graded (ordinal) features with the shared quality scale.
remaining_ordinal = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']

for ordinal_col in remaining_ordinal:
  order_mapping(copy_train, ordinal_col, grade_dict)


In [None]:
# One-hot encode the remaining categorical variables.

Categorical = copy_train.select_dtypes(include='category').columns

# sparse_threshold=0 always yields a dense array, which pd.DataFrame can wrap
# directly regardless of how sparse the one-hot block turns out to be.
transformer = make_column_transformer((OneHotEncoder(),
                                       Categorical),
                                       remainder = 'passthrough',
                                       sparse_threshold = 0)

transformed = transformer.fit_transform(copy_train)

# get_feature_names() was removed from scikit-learn; prefer the replacement
# and fall back to the old method only on versions that lack it.
if hasattr(transformer, 'get_feature_names_out'):
  raw_feature_names = transformer.get_feature_names_out()
else:
  raw_feature_names = transformer.get_feature_names()

# Strip the "<transformer>__" prefix so pass-through columns keep their
# original names (e.g. 'SalePrice'), which the following cells rely on.
feature_names = [str(name).split('__', 1)[-1] for name in raw_feature_names]

transformed_copy_train = pd.DataFrame(transformed, columns = feature_names)

In [None]:
# Print the column/dtype summary of the encoded frame.
transformed_copy_train.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Rebuild the design matrix and target from the encoded frame.
Y_train = transformed_copy_train['SalePrice']
X_train = transformed_copy_train.drop(columns='SalePrice')

# Some linear models need every feature's variance on the same order of
# magnitude, so the numeric predictors are standardised.
scale = StandardScaler()

numeric_cols = Numerical.columns

X_train[numeric_cols] = scale.fit_transform(X_train[numeric_cols])

### Model Deployment

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    """Return the per-fold RMSE of `model` from 5-fold cross-validation.

    The model is evaluated on the notebook-level X_train / Y_train.
    scikit-learn reports the negated MSE, so the sign is flipped before
    taking the square root.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, Y_train, scoring="neg_mean_squared_error", cv=5
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
model_ridge = Ridge()

# Candidate regularisation strengths.
alphas = [0.1, 1, 5, 10, 50]

# Mean cross-validated RMSE for each candidate alpha, in the same order.
cv_ridge = []
for alpha in alphas:
    cv_ridge.append(rmse_cv(Ridge(alpha = alpha)).mean())

In [None]:
# Index the scores by their alpha so the plot's x axis shows the alpha values.
cv_ridge = pd.Series(cv_ridge, index = alphas)
ax = cv_ridge.plot(title = "Ridge RMSE per alpha")
ax.set_xlabel("alpha")
ax.set_ylabel("rmse")

In [None]:
# Fit the final ridge model on the full training set with alpha = 10.
model_ridge = Ridge(alpha = 10).fit(X_train, Y_train)

model_ridge


In [None]:
# In-sample predictions of the fitted ridge model and their errors.
preds = model_ridge.predict(X_train)

residuals = Y_train - preds

res_frame = pd.DataFrame({'preds':preds, 'residuals': residuals})

plt.rcParams['figure.figsize'] = (6.0, 6.0)

# Plot the model's own residuals directly. sns.residplot would first fit a
# new regression of y on x and show *its* residuals, which is not what the
# already-computed ridge residuals need.
ax = sns.scatterplot(x = preds, y = residuals)

# Reference line at zero error.
ax.axhline(0, linestyle = '--', color = 'grey')

ax.set_title('Residuals plot')

ax.set_ylabel('Residuals')

ax.set_xlabel('Predictions')

plt.show()

Overall, the residuals plot indicates that the error terms have non-constant variance, which means heteroscedasticity. To tackle this problem, the common approach is to transform the response variable Y. It also suggests that it's possible to increase the model's performance by removing outliers.

In [None]:
# Replace the target with its natural logarithm.
log_sale_price = np.log(copy_train['SalePrice'])
copy_train['SalePrice'] = log_sale_price