In [214]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from scipy.stats import norm, skew
from sklearn.model_selection import train_test_split
from scipy.special import boxcox1p
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing

### Exploratory Data Analysis

#### Read data

In [215]:
# Load the training and test tables from the local data/ directory
df_train, df_test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
# Keep the test Ids aside as a plain list for the submission file
Ids = df_test.Id.tolist()

#### Correlation coefficient

In [216]:
# Compute the Kendall rank correlation matrix of the numeric columns.
# The numeric columns are selected explicitly: since pandas 2.0, DataFrame.corr
# raises on string columns instead of silently skipping them.
corr = df_train.select_dtypes(include="number").corr(method="kendall")
cutoff = 0.05
# Columns whose absolute correlation with SalePrice is below the cutoff.
# Computed once and reused so train and test always drop the same set.
weak_corr = corr.SalePrice[np.abs(corr.SalePrice) < cutoff]
df_train = df_train.drop(columns=weak_corr.index)
df_test = df_test.drop(columns=weak_corr.index)
print(weak_corr)

Id             -0.012030
MSSubClass     -0.003979
BsmtFinSF2     -0.030710
BsmtHalfBath   -0.009962
PoolArea        0.047800
MoSold          0.049471
YrSold         -0.021796
Name: SalePrice, dtype: float64


#### Jointplot

In [217]:
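# Jointplot of each numeric feature against SalePrice, used to spot outliers by eye.
# Left disabled. NOTE(review): the snippet refers to `df`, which is not defined in the
# cells shown here (presumably df_train) -- confirm before re-enabling.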
# num_features = df.dtypes[df.dtypes!=object].index.tolist()
# for col in num_features:
#     sns.jointplot(x=col, y="SalePrice", data=df, kind = 'reg')

#### Drop outliers

In [218]:
# Drop outliers: a training row is removed when any listed column exceeds its
# upper bound (bounds presumably read off the jointplots -- confirm).
outlier_upper_bounds = {
    "SalePrice": 700000,
    "LotFrontage": 300,
    "LotArea": 15000,
    "MasVnrArea": 1300,
    "BsmtFinSF1": 5000,
    "TotalBsmtSF": 5000,
    "1stFlrSF": 4000,
    "GrLivArea": 4000,
    "BsmtFullBath": 2.5,
    "BedroomAbvGr": 7,
    "KitchenAbvGr": 2.5,
    "3SsnPorch": 400,
    "ScreenPorch": 400,
    "MiscVal": 2200,
    "OpenPorchSF": 500,
    "EnclosedPorch": 500,
}
rows_to_drop = []
for column, upper_bound in outlier_upper_bounds.items():
    # NaN compares as False, so rows with a missing value are never flagged here
    exceeds = df_train[column] > upper_bound
    rows_to_drop.extend(df_train.index[exceeds].values)
df_train = df_train.drop(rows_to_drop)

# Target vector, taken after the outlier rows are gone
Y = df_train.SalePrice.values
# Stack train features on top of test so later preprocessing sees both
features_train = df_train.drop(columns=['SalePrice'])
df_all = pd.concat([features_train, df_test])

#### Data cleaning

In [219]:
# Columns containing NaN: count, share of rows, and dtype, most-missing first
null_mask = df_all.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Every column of the mask has len(df_all) entries, so this is the missing fraction
percent = total / len(null_mask)
d_type = df_all.dtypes[total.index.tolist()]
pd.concat([total, percent, d_type], axis=1, keys=['Total', 'Percent', "Dtype"])

Unnamed: 0,Total,Percent,Dtype
PoolQC,2779,0.998204,object
MiscFeature,2688,0.965517,object
Alley,2590,0.930316,object
Fence,2230,0.801006,object
FireplaceQu,1390,0.499282,object
LotFrontage,444,0.159483,float64
GarageYrBlt,152,0.054598,float64
GarageCond,152,0.054598,object
GarageFinish,152,0.054598,object
GarageQual,152,0.054598,object


In [208]:
# Fillna
# String-valued columns: NaN is replaced by the literal category 'None'.
cat_cols_fillna = ["PoolQC","MiscFeature","Alley","Fence","FireplaceQu","GarageCond","GarageFinish","GarageQual","GarageType","BsmtExposure","BsmtCond","BsmtQual","BsmtFinType1","BsmtFinType2","MasVnrType","MSZoning","Functional","Utilities","Electrical","Exterior1st","KitchenQual","Exterior2nd","SaleType"]
# Numeric columns: NaN is replaced by the column mean over df_all (train + test rows).
non_cat_cols_fillna = ["LotFrontage","GarageYrBlt","MasVnrArea","BsmtFullBath","TotalBsmtSF","BsmtUnfSF","BsmtFinSF1","GarageCars","GarageArea"]
# Assign the filled column back rather than calling fillna(inplace=True) on
# df_all[col]: the in-place call on a selected column is chained assignment, which
# pandas warns about and which does not update df_all under Copy-on-Write.
for col in cat_cols_fillna:
    df_all[col] = df_all[col].fillna('None')
for col in non_cat_cols_fillna:
    df_all[col] = df_all[col].fillna(df_all[col].mean())

In [209]:
# OverallCond is stored as a number but handled as a category: cast it to string
df_all['OverallCond'] = df_all['OverallCond'].astype(str)

# Columns to replace with integer codes (one LabelEncoder per column)
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'OverallCond',
)

for c in cols:
    # Fit on the combined train + test values so both share one code mapping
    lbl = LabelEncoder()
    df_all[c] = lbl.fit_transform(df_all[c].values)

#### Box-Cox Transform
Reduce the skewness (and excess kurtosis) of the numeric features with a Box-Cox transform.

In [210]:
# Skewness of every non-object column, largest positive skew first
numeric_feats = df_all.dtypes[df_all.dtypes != "object"].index
skewed_feats = pd.Series(
    {name: skew(df_all[name].dropna()) for name in numeric_feats}
).sort_values(ascending=False)
skewness = skewed_feats.to_frame(name='Skew')
skewness.head(10)

Unnamed: 0,Skew
MiscVal,26.028143
LowQualFinSF,13.44122
3SsnPorch,11.060406
LandSlope,5.0655
KitchenAbvGr,4.120462
EnclosedPorch,3.994448
ScreenPorch,3.886563
LotArea,2.872434
MasVnrArea,2.443863
OpenPorchSF,2.40954


In [211]:
# Transform
# Keep only the features whose absolute skewness exceeds 0.75.
# The filter is applied to the 'Skew' column: masking the whole DataFrame
# (skewness[abs(skewness) > 0.75]) only turns the non-matching entries into NaN
# and keeps every row, so every numeric feature would be transformed.
skewness = skewness[skewness['Skew'].abs() > 0.75]
skewed_features = skewness.index
lamda_ = 0.15
# boxcox1p(x, lmbda) computes the Box-Cox transform of 1 + x.
# NOTE: this cell is not idempotent -- re-running it transforms the columns again.
for feat in skewed_features:
    df_all[feat] = boxcox1p(df_all[feat], lamda_)

#### Prepare train test data

In [212]:
# Preparing train test data
# One-hot encode the remaining string columns on the combined frame so the train
# and test parts end up with the same dummy columns.
df_all = pd.get_dummies(df_all)
# df_all was built as [train rows, test rows]; Y has one entry per train row,
# so its length gives the split position.
n_train = len(Y)
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]
Y_train = Y.astype(np.float64)
# Fit the scaler on the training rows only and apply the same scaling to the
# test rows. Scaling X_train alone leaves X_test on the raw feature scale, so the
# model would be asked to predict on inputs unlike those it was trained on.
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(df_train.values)
X_test = scaler.transform(df_test.values)
df_train.head()

#### Train and predict

In [213]:
# Candidate regressors keyed by name; only the selected entry is fitted
switcher = {
    "LinearRegressor": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
}
# fit() returns the estimator itself, so selection and training can be chained
model = switcher["GradientBoostingRegressor"].fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Write the submission file: one predicted SalePrice per test Id
res = pd.DataFrame({"Id": Ids, "SalePrice": Y_pred})
res.to_csv("res.csv", index=False)