In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Part 1 Data Exploration
# Based on 
# https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook

In [None]:
# Load the training split from the Kaggle input directory into df_train.
df_train = pd.read_csv(
    filepath_or_buffer='../input/house-prices-advanced-regression-techniques/train.csv',
)

In [None]:
# OverallQual, YearBuilt, TotalBsmtSF, GrLivArea might be important features
# Analyzing our target, SalePrice

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target.
df_train.SalePrice.describe()

In [None]:
# Distribution of the target variable SalePrice (histogram).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here because the later cells rely on its `fit=` option.
sns.distplot(df_train.SalePrice)

In [None]:
# Report how asymmetric (skewness) and heavy-tailed (kurtosis) SalePrice is.
print('Skewness: {}'.format(df_train.SalePrice.skew()))
print('Kurtosis: {}'.format(df_train.SalePrice.kurtosis()))

In [None]:
# SalePrice's relationship with numerical variables

In [None]:
# Scatter of above-ground living area against sale price; the y-axis is
# capped at 800000 so the plots in this section share the same scale.
data = df_train[['SalePrice', 'GrLivArea']]
data.plot(kind='scatter', x='GrLivArea', y='SalePrice', ylim=(0, 800000))

In [None]:
# Scatter of total basement area against sale price (same y-range as the
# GrLivArea plot).
data = df_train[['SalePrice', 'TotalBsmtSF']]
data.plot(kind='scatter', x='TotalBsmtSF', y='SalePrice', ylim=(0, 800000))

In [None]:
# SalePrice's relationship with categorical features

In [None]:
# Box plot of SalePrice for each OverallQual level.
data = df_train[['SalePrice', 'OverallQual']]
f, ax = plt.subplots(figsize=(8, 6))
# sns.boxplot returns the Axes it drew on; draw on the axes created above.
fig = sns.boxplot(x='OverallQual', y='SalePrice', data=data, ax=ax)
fig.set_ylim(0, 800000);

In [None]:
# Box plot of SalePrice for each construction year; the figure is wide and
# the x tick labels are rotated because there is one box per year.
data = df_train[['SalePrice', 'YearBuilt']]
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x='YearBuilt', y='SalePrice', data=data, ax=ax)
fig.set_ylim(0, 800000);
plt.xticks(rotation=90);

In [None]:
# objective analysis of the features

In [None]:
# correlation matrix (heatmap style)
# Only numeric (and boolean) columns are correlated. At this point df_train
# is still the raw CSV frame (categoricals are only encoded by get_dummies at
# the end of the notebook), and DataFrame.corr() in pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them.
corrmat = df_train.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# vmax=.8 saturates the colour scale so moderate correlations stay visible.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Zoomed correlation matrix: the 10 variables most correlated with SalePrice
# (SalePrice itself included, since its self-correlation is the largest).
cols = corrmat['SalePrice'].nlargest(10).index
# rowvar=False: each column of the value matrix is one variable.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cbar=True,
    square=True,
    xticklabels=cols.values,
    yticklabels=cols.values,
)
plt.show()

In [None]:
# scatter plots between 'SalePrice' and correlated variables
sns.set()  # reset the seaborn theme (undoes the enlarged font_scale set earlier)
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the per-facet size in inches; it replaces the former `size`
# keyword, which current seaborn releases no longer accept.
sns.pairplot(df_train[cols], height=2.5)
plt.show()

In [None]:
# Deal with missing data

In [None]:
# Missing-data summary: per column, the number of nulls ('Total') and the
# fraction of rows that are null ('Percent'), each sorted from worst to best.
null_mask = df_train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
# dealing with missing data
# - delete the features that have more than 15% missing values
# - GarageX variables are related to GarageCars, so delete them (same for BsmtX)
# - MasVnrX are not essential and are strongly correlated with 'YearBuilt' and
#   'OverallQual' (which have already been considered), so delete them
# - Electrical has 1 missing value: delete the observation and keep the variable

# Drop every column with more than one missing value. `columns=` is used
# because the positional axis argument (`drop(labels, 1)`) was removed in
# pandas 2.0.
df_train = df_train.drop(columns=missing_data[missing_data['Total'] > 1].index)
# Drop the row(s) where Electrical is missing.
df_train = df_train.dropna(subset=['Electrical'])
# Sanity check: the largest per-column null count should now be 0.
df_train.isnull().sum().max()

In [None]:
# Deal with outliers

In [None]:
#Univariate Analysis

In [None]:
# Standardizing the data
# StandardScaler expects a 2-D array, so the column is reshaped to (n, 1).
# The reshape is done on the underlying NumPy array because indexing a pandas
# Series with [:, np.newaxis] is no longer supported.
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
# Sort once by the single column and slice both tails from the same result.
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [None]:
# Bivariate Analysis

In [None]:
# Bivariate analysis: SalePrice against GrLivArea, used to spot outliers.
data = df_train[['SalePrice', 'GrLivArea']]
data.plot(kind='scatter', x='GrLivArea', y='SalePrice', ylim=(0, 800000));

In [None]:
# The two points with unusually large GrLivArea don't follow the general trend
# (they may be agricultural areas), so they are deleted.
# The two points with unusually high SalePrice do follow the trend; they look
# like special cases and are kept.
#
# The rows are removed by their fixed Id values rather than by "top 2 by
# GrLivArea", so re-running this cell cannot remove further rows.
# Presumably these are the two Ids at the top of
# df_train.sort_values(by='GrLivArea', ascending=False) - verify against the data.
df_train = df_train.drop(index=df_train.index[df_train['Id'].isin([1299, 524])])

In [None]:
# Bivariate analysis: SalePrice against TotalBsmtSF.
data = df_train[['SalePrice', 'TotalBsmtSF']]
data.plot(kind='scatter', x='TotalBsmtSF', y='SalePrice', ylim=(0, 800000));

In [None]:
# Getting hard core
# (testing for the assumptions underlying the statistical bases for multivariate analysis)

In [None]:
# Search for normality

In [None]:
# Normality check for SalePrice: histogram with a fitted normal curve, then a
# normal probability (Q-Q) plot on a separate figure.
sns.distplot(df_train.SalePrice, fit=norm);
fig = plt.figure()
res = stats.probplot(x=df_train.SalePrice, dist='norm', plot=plt)

In [None]:
# SalePrice is positively skewed, so replace it with its natural log and
# repeat the normality plots.
# Caution: this overwrites the column, so running the cell twice applies the
# log twice.
df_train['SalePrice'] = df_train['SalePrice'].pipe(np.log)
sns.distplot(df_train.SalePrice, fit=norm);
fig = plt.figure()
res = stats.probplot(x=df_train.SalePrice, dist='norm', plot=plt)

In [None]:
# Normality check for GrLivArea: histogram with a fitted normal curve, then a
# normal probability plot on a separate figure.
sns.distplot(df_train.GrLivArea, fit=norm);
fig = plt.figure()
res = stats.probplot(x=df_train.GrLivArea, dist='norm', plot=plt)

In [None]:
# GrLivArea is positively skewed as well: replace it with its natural log and
# repeat the normality plots.
# Caution: this overwrites the column, so running the cell twice applies the
# log twice.
df_train['GrLivArea'] = df_train['GrLivArea'].pipe(np.log)
sns.distplot(df_train.GrLivArea, fit=norm);
fig = plt.figure()
res = stats.probplot(x=df_train.GrLivArea, dist='norm', plot=plt)

In [None]:
# Normality check for TotalBsmtSF: histogram with a fitted normal curve, then
# a normal probability plot on a separate figure.
sns.distplot(df_train.TotalBsmtSF, fit=norm);
fig = plt.figure()
res = stats.probplot(x=df_train.TotalBsmtSF, dist='norm', plot=plt)

In [None]:
# A significant number of observations have TotalBsmtSF == 0 (houses without
# a basement), and log(0) is undefined, so the column cannot be
# log-transformed as a whole.
# Instead:
#   1. create a binary indicator HasBsmt (1 = has a basement, 0 = none);
#   2. log-transform only the non-zero observations, leaving zeros untouched.
# This transforms the data without losing the basement / no-basement effect.

# Build the 0/1 flag directly from the comparison.
df_train['HasBsmt'] = (df_train['TotalBsmtSF'] > 0).astype('int64')
# Cast to float up front so the float logs written below do not rely on an
# implicit dtype upcast (relevant if the column was read as an integer).
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype('float64')
has_bsmt = df_train['HasBsmt'] == 1
# Caution: running this cell twice applies the log twice to these rows.
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])

In [None]:
# Normality plots for TotalBsmtSF, restricted to rows whose value is > 0.
sns.distplot(df_train.loc[df_train['TotalBsmtSF'] > 0, 'TotalBsmtSF'], fit=norm);
fig = plt.figure()
res = stats.probplot(x=df_train.loc[df_train['TotalBsmtSF'] > 0, 'TotalBsmtSF'], dist='norm', plot=plt)

In [None]:
# Search for homoscedasticity

In [None]:
# Homoscedasticity check: GrLivArea against SalePrice (both log-transformed
# in the cells above).
# The log transformation made the original cone shape disappear.
plt.scatter(x=df_train['GrLivArea'], y=df_train['SalePrice']);

In [None]:
# Homoscedasticity check: TotalBsmtSF against SalePrice, restricted to rows
# whose TotalBsmtSF is > 0.
plt.scatter(
    x=df_train.loc[df_train['TotalBsmtSF'] > 0, 'TotalBsmtSF'],
    y=df_train.loc[df_train['TotalBsmtSF'] > 0, 'SalePrice'],
);

In [None]:
# Deal with dummy variables

In [None]:
# Convert the categorical variables into dummy (one-hot indicator) columns.
df_train = pd.get_dummies(df_train)
# Preview only the first rows: the encoded frame is very wide, and displaying
# the whole frame adds nothing over a sample.
df_train.head()