# Pre-processing
blah blah write some explanation

## Import

In [None]:
# Basic Libraries
import math
import numpy as np
import pandas as pd
import seaborn as sb
import researchpy as rp
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from scipy import stats
from scipy.stats import skew 
from sklearn.model_selection import train_test_split
# machine learning library
from sklearn.linear_model import LinearRegression
from sklearn import svm
# data normalization from sklearn
from sklearn.preprocessing import MinMaxScaler
# data standardization from sklearn
from sklearn.preprocessing import StandardScaler
# computational time
import time

In [2]:
#importing the csv dataset
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Four Standard Basic Optimisations
In order to determine which learning algorithm is the most accurate for this application, we will run each one on the dataset with a few basic optimisations.
For the purpose of preliminary comparisons, we will process the data in 4 general ways and save them to individual csv files. The 4 general ways are as follows (in order):
<br><br>
<u>ONE</u>
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Feature selection

<u>TWO</u>
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- <b>Dealing with highly-skewed continuous data</b>
- Feature selection

<u>THREE</u>
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Dealing with highly-skewed continuous data
- <b>Normalisation</b>
- Feature selection

<u>FOUR</u>
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Dealing with highly-skewed continuous data
- <b>Z-scoring</b>
- Feature selection

<br><br>
(Note: Feature selection is done at the end of each version. The rest will build upon ONE or TWO)

## Standard optimisation ONE
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Feature selection

### Dealing with null data

FROM EDA:

LotFrontage /number of null data: 259

MasVnrArea /number of null data: 8

Electrical /number of null data: 1

GarageYrBlt /number of null data: 81


For feature with too many missing values, remove this feature entirely. 
- Set treshold to be 10% of all data points

For feature with only a few missing values, remove that data point.

In [3]:
nullData = [['LotFrontage', 259], ['MasVnrArea', 8], ['Electrical', 1], ['GarageYrBlt', 81]]
n = len(train)
treshold = 0.1
drop = []

print('Drop feature - too many nulls:')
for i in nullData:
    if i[1]/n > treshold: # Arbitrary treshold: 10%
        print(i[0])
        train.drop(columns=[i[0]], inplace=True)
    else:
        drop.append(i[0])
        
print('Remove data point:')
print(drop)
train.dropna(subset=drop, inplace=True)

train.head()

Drop feature - too many nulls:
LotFrontage
Remove data point:
['MasVnrArea', 'Electrical', 'GarageYrBlt']


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
print('Number of columns:', len(train.columns))
print('Number of data:', len(train))

Number of columns: 80
Number of data: 1370


### Dealing with highly skewed categorical features

As identifited from data exploration, we removed categorical variables with one category of data occuping >= 90% of data. 

In [5]:
categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond','Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
#skewness of categorical variables
max_percent = []
catogorical_skewed=[]
for i in categorical: 
    if rp.summary_cat(train[i])["Percent"].max() >= 90: 
        max_percent.append(rp.summary_cat(train[i])["Percent"].max())
print ("The number of variables with one category of data which occupies >= 90% of data =", len(max_percent))
#highly skewed categorical variables
for i in categorical: 
    if rp.summary_cat(train[i])["Percent"].max() >= 90:
        catogorical_skewed.append(i)
        print (i,"/ratio of the dominant category = ", rp.summary_cat(train[i])["Percent"].max()/100)

The number of variables with one category of data which occupies >= 90% of data = 15
Street /ratio of the dominant category =  0.9964
LandContour /ratio of the dominant category =  0.9015000000000001
Utilities /ratio of the dominant category =  0.9993000000000001
LandSlope /ratio of the dominant category =  0.9467
Condition2 /ratio of the dominant category =  0.9898
RoofMatl /ratio of the dominant category =  0.9818000000000001
BsmtCond /ratio of the dominant category =  0.9246
Heating /ratio of the dominant category =  0.981
CentralAir /ratio of the dominant category =  0.9495999999999999
Electrical /ratio of the dominant category =  0.9226000000000001
Functional /ratio of the dominant category =  0.9336
GarageQual /ratio of the dominant category =  0.9504
GarageCond /ratio of the dominant category =  0.9612999999999999
PavedDrive /ratio of the dominant category =  0.9372
MiscFeature /ratio of the dominant category =  0.9216


In [6]:
for i in catogorical_skewed:
    train.drop(columns=[i], inplace=True)

In [7]:
for i in catogorical_skewed:
    categorical.remove(i)
#skewness of categorical variables
max_percent = []
for i in categorical: 
    if rp.summary_cat(train[i])["Percent"].max() >= 90: 
        max_percent.append(rp.summary_cat(train[i])["Percent"].max())
print ("The number of variables with one category of data which occupies >= 90% of data =", len(max_percent))
#highly skewed categorical variables
for i in categorical: 
    if rp.summary_cat(train[i])["Percent"].max() >= 90: 
         print (i,"/ratio of the dominant category = ", rp.summary_cat(train[i])["Percent"].max()/100)

The number of variables with one category of data which occupies >= 90% of data = 0


### One Hot Encoding

In [8]:
train = pd.get_dummies(train, columns= categorical, prefix= categorical)
train.head()

Unnamed: 0,Id,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,8450,2003,2003,196.0,706,0,150,856,856,...,0,0,0,1,0,0,0,0,1,0
1,2,9600,1976,1976,0.0,978,0,284,1262,1262,...,0,0,0,1,0,0,0,0,1,0
2,3,11250,2001,2002,162.0,486,0,434,920,920,...,0,0,0,1,0,0,0,0,1,0
3,4,9550,1915,1970,0.0,216,0,540,756,961,...,0,0,0,1,1,0,0,0,0,0
4,5,14260,2000,2000,350.0,655,0,490,1145,1145,...,0,0,0,1,0,0,0,0,1,0


In [9]:
print('Shape:', train.shape)
print('Dtypes:\n', train.dtypes)

Shape: (1370, 248)
Dtypes:
 Id                         int64
LotArea                    int64
YearBuilt                  int64
YearRemodAdd               int64
MasVnrArea               float64
                          ...   
SaleCondition_AdjLand      uint8
SaleCondition_Alloca       uint8
SaleCondition_Family       uint8
SaleCondition_Normal       uint8
SaleCondition_Partial      uint8
Length: 248, dtype: object


In [10]:
# create a checkpoint
train1 = train.copy()

### Feature selection
Choosing variables that is correlated to SalePrice with absolute value more than 0.2

In [11]:
# variables most closely related to SalePrice
corr = train1.corr()['SalePrice']
# print(corr)
feature_select=[]
corrshape=corr.nlargest(corr.shape[0])[1:corr.shape[0]].shape[0]
# print(corrshape)
for i in range(corrshape-1):
    if (abs(corr.nlargest(corrshape)[1:corrshape][i])>0.2):
        feature_select.append(corr.nlargest(corrshape)[1:corrshape].axes[0][i])
        
print('Number of features selected = ', len(feature_select))        
print(corr.nlargest(corrshape)[1:corrshape])

Number of features selected =  68
GrLivArea            0.709783
GarageCars           0.636173
GarageArea           0.607197
TotalBsmtSF          0.603284
1stFlrSF             0.596087
                       ...   
OverallQual_5       -0.383080
GarageType_Detchd   -0.406550
BsmtQual_TA         -0.456964
GarageFinish_Unf    -0.485273
KitchenQual_TA      -0.527689
Name: SalePrice, Length: 246, dtype: float64


In [12]:
#export file to csv
train_ONE = train1[feature_select+['Id', 'SalePrice']].copy()
train_ONE.to_csv('train_ONE.csv', index=False)

## Standard optimisation TWO
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Dealing with highly-skewed continuous data
- Feature selection

### Dealing with highly-skewed continuous data

#### Removing outliers with IQR
~~After doing the log transformation,~~ we selected continuous variables that still gives a skewness value of more than 1. <br> 
We then used the interquartile range (IQR) method to remove the outliers.


In [14]:
# identifying continuous variables with high skewness
continuous = ['LotArea', 'YearBuilt', 'YearRemodAdd', 
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr','TotRmsAbvGrd', 'Fireplaces', 
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
array1 = skew(train1[continuous])
#the top 15 features which are the most skewed
index1 = array1.argsort()[-15:][::-1]
continuous_skewed=[]
for i in index1:
    print (continuous[i],": skewness = ", array1[i])
    if array1[i]>=1:
        continuous_skewed.append(continuous[i])

MiscVal : skewness =  24.732979451706033
PoolArea : skewness =  14.342881566937836
LotArea : skewness =  12.025446327344339
LowQualFinSF : skewness =  10.66505349858299
3SsnPorch : skewness =  9.96262046395852
KitchenAbvGr : skewness =  5.129053439295553
BsmtFinSF2 : skewness =  4.174330005566947
ScreenPorch : skewness =  3.967379533802487
BsmtHalfBath : skewness =  3.89811918894981
EnclosedPorch : skewness =  3.214887613130617
MasVnrArea : skewness =  2.588332838278803
OpenPorchSF : skewness =  2.2638025759950504
BsmtFinSF1 : skewness =  1.6941364533315026
TotalBsmtSF : skewness =  1.6330286577439608
WoodDeckSF : skewness =  1.5027101898713007


In [15]:
print('train1 shape\t\t:',train1.shape)
# calculate interquartile range
q25, q75 = train1[continuous_skewed].quantile(0.25), train1[continuous_skewed].quantile(0.75)
iqr = q75 - q25
# calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off

train1_iqr = train1[~((train1>upper)).any(axis=1)]
print('train1_iqr shape\t:', train1_iqr.shape)

train1 shape		: (1370, 248)
train1_iqr shape	: (688, 248)


In [16]:
#calculating skewness of continuous variables
array = skew(train1_iqr[continuous])
#the top 3 features which are the most skewed
print('Top 3 features which are the most skewed:')
index = array.argsort()[-3:][::-1]
for i in index: 
    print (continuous[i],": skewness = ", array[i])

Top 3 features which are the most skewed:
MasVnrArea : skewness =  1.1633497486834365
OpenPorchSF : skewness =  1.0088075868431694
WoodDeckSF : skewness =  0.77571907189836


In [None]:
# features_liqr = train1_iqr.copy()
# features_liqr.drop(['Id', 'SalePrice'], axis=1, inplace=True)
# labels_liqr = train1_iqr['SalePrice']
# train_X_liqr, test_X_liqr, train_Y_liqr, test_Y_liqr = train_test_split(features_liqr,labels_liqr,test_size=0.2, random_state=0)

In [None]:
#investigating the skewness of data features (visually)
f, axes = plt.subplots(len(continuous)+1, 3, figsize=(20, 200))
colors = ["r", "g", "b","r","g"]
colors = colors*7

count = 0
for i in continuous+['SalePrice']:
    sb.boxplot(x = train1_iqr[i], color = colors[count], ax = axes[count,0])
    sb.histplot(x = train1_iqr[i], color = colors[count], ax = axes[count,1])
    sb.violinplot(x = train1_iqr[i], color = colors[count], ax = axes[count,2])
    count += 1

In [17]:
# create a checkpoint
train2 = train1_iqr.copy()

### Feature selection
Choosing variables that is correlated to SalePrice with absolute value more than 0.2

In [18]:
# variables most closely related to SalePrice
corr = train1_iqr.corr()['SalePrice']
# print(corr)
feature_select=[]
corrshape=corr.nlargest(corr.shape[0])[1:corr.shape[0]].shape[0]
# print(corrshape)
for i in range(corrshape-1):
    if (abs(corr.nlargest(corrshape)[1:corrshape][i])>0.2):
        feature_select.append(corr.nlargest(corrshape)[1:corrshape].axes[0][i])
        
print('Number of features selected = ', len(feature_select))        
print(corr.nlargest(corrshape)[1:corrshape])

Number of features selected =  73
GrLivArea            0.764786
GarageCars           0.699105
GarageArea           0.671034
TotRmsAbvGrd         0.639797
FullBath             0.632455
                       ...   
GarageType_Detchd   -0.452965
Foundation_CBlock   -0.478446
BsmtQual_TA         -0.527170
GarageFinish_Unf    -0.544653
KitchenQual_TA      -0.571333
Name: SalePrice, Length: 211, dtype: float64


In [19]:
#export file to csv
train_TWO = train1_iqr[feature_select+['Id', 'SalePrice']].copy()
train_TWO.to_csv('train_TWO.csv', index=False)

## Standard optimisation THREE
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Dealing with highly-skewed continuous data
- Normalisation
- Feature selection

### Normalisation (Min-Max scaling)

Transforming values so that they range between 0 and 1.

In [21]:
features3 = train2.copy()
features3.drop(['Id', 'SalePrice'], axis=1, inplace=True)

replace = np.array(train2[['Id', 'SalePrice']])
# drop Id and SalePrice cause we are not gonna be scaling them. Add back later

In [22]:
# fit scaler on training data
norm = MinMaxScaler().fit(features3)
# transform training data
features_norm = norm.transform(features3)

In [23]:
# final transformed data as dataframe with Id and SalePrice added in
train3 = pd.DataFrame(features_norm, columns=features3.columns)
train3['Id'] = replace.T[0]
train3['SalePrice']= replace.T[1]
train3.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id,SalePrice
0,0.436748,0.946154,0.883333,0.457944,0.436072,0.0,0.073457,0.419197,0.247191,0.560735,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,208500
1,0.607782,0.930769,0.866667,0.378505,0.300185,0.0,0.212537,0.450539,0.285038,0.568615,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3,223500
2,0.791644,0.923077,0.833333,0.817757,0.404571,0.0,0.239961,0.560725,0.418096,0.691399,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5,250000
3,0.536559,0.953846,0.916667,0.434579,0.845584,0.0,0.15524,0.825661,0.742756,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,7,307000
4,0.604728,0.653846,0.25,0.0,0.559605,0.0,0.065622,0.509305,0.356002,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,11,129500


In [None]:
# train_X_norm, test_X_norm, train_Y_norm, test_Y_norm = train_test_split(features_norm,labels,test_size=0.2, random_state=0)

In [None]:
# #investigating the skewness of data features (visually)
# f, axes = plt.subplots(len(continuous), 3, figsize=(20, 200))
# colors = ["r", "g", "b","r","g"]
# colors = colors*7
# count = 0
# for i in continuous:
#     col_ind=features.columns.get_indexer([i])[0]
#     sb.boxplot(x = features3[:,col_ind], color = colors[count], ax = axes[count,0])
#     sb.histplot(x = features3[:,col_ind], color = colors[count], ax = axes[count,1])
#     sb.violinplot(x = features3[:,col_ind], color = colors[count], ax = axes[count,2])
#     count += 1 

In [None]:
# no need to checkpoint for three

### Feature selection
Choosing variables that is correlated to SalePrice with absolute value more than 0.2

In [24]:
# variables most closely related to SalePrice
corr = train3.corr()['SalePrice']
# print(corr)
feature_select=[]
corrshape=corr.nlargest(corr.shape[0])[1:corr.shape[0]].shape[0]
# print(corrshape)
for i in range(corrshape-1):
    if (abs(corr.nlargest(corrshape)[1:corrshape][i])>0.2):
        feature_select.append(corr.nlargest(corrshape)[1:corrshape].axes[0][i])
        
print('Number of features selected = ', len(feature_select))        
print(corr.nlargest(corrshape)[1:corrshape])

Number of features selected =  73
GrLivArea            0.764786
GarageCars           0.699105
GarageArea           0.671034
TotRmsAbvGrd         0.639797
FullBath             0.632455
                       ...   
GarageType_Detchd   -0.452965
Foundation_CBlock   -0.478446
BsmtQual_TA         -0.527170
GarageFinish_Unf    -0.544653
KitchenQual_TA      -0.571333
Name: SalePrice, Length: 211, dtype: float64


In [28]:
#export file to csv
train_THREE = train3[feature_select+['Id', 'SalePrice']].copy()
train_THREE.to_csv('train_THREE.csv', index=False)

## Standard optimisation FOUR
- Dealing with null data
- Dealing with highly-skewed categorical data
- One Hot Encoding
- Dealing with highly-skewed continuous data
- Z-Scoring
- Feature selection

### Applying Standardization (Z-Score method)

Transforming data into a distribution with a mean of 0 and standard deviation of 1.

In [None]:
# # This is the prof's one
# print('mean before:\t',np.mean(train2,axis=0))
# print('std before:\t',np.std(train2,axis=0))

# #YOUR CODE HERE
# train2 -= train2.mean(axis=0)
# train2 /= np.std(train2,axis=0)
# print('mean after:\t',np.mean(train2,axis=0))
# print('std after:\t',np.std(train2,axis=0))

# plt.figure(2,figsize=(6,6))
# plt.scatter(train2[:,0], train2[:,1], s=60, c='r', marker='+') 
# plt.title('Data normalized by z-scoring')
# plt.show()

In [30]:
features4 = train2.copy()

# transform
for i in continuous:
    # fit on training data column
    scale = StandardScaler().fit(train[[i]])
    # transform training data column
    features4[i] = scale.transform(features4[[i]])
    
features4.drop(['Id', 'SalePrice'], axis=1, inplace=True)
# same as before in THREE
# replace = np.array(train2[['Id', 'SalePrice']]) 

In [31]:
train4 = pd.DataFrame(features4, columns=features3.columns)
train4['Id'] = replace.T[0]
train4['SalePrice']= replace.T[1]
train4.head()

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Id,SalePrice
0,-0.218825,1.029146,0.864397,0.471073,0.548776,-0.293876,-0.947855,-0.499247,-0.830872,1.139894,...,0,1,0,0,0,0,1,0,1,208500
2,0.05502,0.961026,0.815486,0.286813,0.069283,-0.293876,-0.308055,-0.352256,-0.664743,1.167208,...,0,1,0,0,0,0,1,0,3,223500
4,0.349403,0.926967,0.717665,1.305662,0.437621,-0.293876,-0.181898,0.16451,-0.080693,1.592849,...,0,1,0,0,0,0,1,0,5,250000
6,-0.059017,1.063205,0.962218,0.416879,1.993793,-0.293876,-0.571635,1.407046,1.344389,-0.803943,...,0,1,0,0,0,0,1,0,7,307000
10,0.05013,-0.265118,-0.994206,-0.591131,0.984678,-0.293876,-0.9839,-0.076647,-0.353249,-0.803943,...,0,1,0,0,0,0,1,0,11,129500


In [None]:
# train_X_stan, test_X_stan, train_Y_stan, test_Y_stan = train_test_split(features_stan,labels,test_size=0.2, random_state=0)

In [None]:
# #investigating the skewness of data features (visually)
# f, axes = plt.subplots(len(continuous), 3, figsize=(20, 200))
# colors = ["r", "g", "b","r","g"]
# colors = colors*7
# count = 0
# for i in continuous:
#     sb.boxplot(x = features4[i], color = colors[count], ax = axes[count,0])
#     sb.histplot(x = features4[i], color = colors[count], ax = axes[count,1])
#     sb.violinplot(x = features4[i], color = colors[count], ax = axes[count,2])
#     count += 1 

In [None]:
# #calculating skewness of continuous variables
# array = skew(features4[continuous])
# #the top 10 features which are the most skewed
# index = array.argsort()[-10:][::-1]
# for i in index: 
#     print (continuous[i],": skewness = ", array[i])

### Feature selection
Choosing variables that is correlated to SalePrice with absolute value more than 0.2

In [32]:
# variables most closely related to SalePrice
corr = train4.corr()['SalePrice']
# print(corr)
feature_select=[]
corrshape=corr.nlargest(corr.shape[0])[1:corr.shape[0]].shape[0]
# print(corrshape)
for i in range(corrshape-1):
    if (abs(corr.nlargest(corrshape)[1:corrshape][i])>0.2):
        feature_select.append(corr.nlargest(corrshape)[1:corrshape].axes[0][i])
        
print('Number of features selected = ', len(feature_select))        
print(corr.nlargest(corrshape)[1:corrshape])

Number of features selected =  73
GrLivArea            0.764786
GarageCars           0.699105
GarageArea           0.671034
TotRmsAbvGrd         0.639797
FullBath             0.632455
                       ...   
GarageType_Detchd   -0.452965
Foundation_CBlock   -0.478446
BsmtQual_TA         -0.527170
GarageFinish_Unf    -0.544653
KitchenQual_TA      -0.571333
Name: SalePrice, Length: 220, dtype: float64


In [33]:
#export file to csv
train_FOUR = train4[feature_select+['Id', 'SalePrice']].copy()
train_FOUR.to_csv('train_FOUR.csv', index=False)

## Features

Create a feature dataset that does not consist of Id and SalePrice (label: value to be predicted)


In [None]:
# features = train.copy()
# features.drop(['Id', 'SalePrice'], axis=1, inplace=True)

# labels = train['SalePrice']

In [None]:
# train_X, test_X, train_Y, test_Y = train_test_split(features,labels,test_size=0.2, random_state=0)

# Can ignore the rest from here lmao

## Feature Selection

Choosing variables that is correlated to SalePrice with absolute value more than 0.2

In [None]:
# # variables most closely related to SalePrice
# corr = train.corr()['SalePrice']
# feature_select=[]
# corrshape=corr.nlargest(corr.shape[0])[1:corr.shape[0]].shape[0]
# for i in range(corrshape-1):
#     if (abs(corr.nlargest(corrshape)[1:corrshape][i])>0.2):
#         feature_select.append(corr.nlargest(corrshape)[1:corrshape].axes[0][i])
# print(corr.nlargest(corrshape)[1:corrshape])
# print(len(feature_select))

In [None]:
# features_fs1 = train[feature_select].copy()
# labels_fs1 = train['SalePrice']
# train_X_fs1, test_X_fs1, train_Y_fs1, test_Y_fs1 = train_test_split(features_fs1,labels_fs1,test_size=0.2, random_state=0)

In [None]:
# # data normalization after feature selection 1
# # fit scaler on training data
# norm = MinMaxScaler().fit(features[feature_select])
# # transform training data
# features_fs1_norm = norm.transform(features[feature_select])

# train_X_fs1_norm, test_X_fs1_norm, train_Y_fs1_norm, test_Y_fs1_norm = train_test_split(features_fs1_norm,labels_fs1,test_size=0.2, random_state=0)

In [None]:
# # data standardization after feature selection 1
# features_fs1_stan = features_stan[feature_select].copy()
# train_X_fs1_stan, test_X_fs1_stan, train_Y_fs1_stan, test_Y_fs1_stan = train_test_split(features_fs1_stan,labels_fs1,test_size=0.2, random_state=0)

In [None]:
# # variables most closely related to SalePrice after removing outliers via IQR
# corr = train_out_2.corr()['SalePrice']
# feature_select2=[]
# corrshape=corr.nlargest(corr.shape[0])[1:corr.shape[0]].shape[0]
# for i in range(corrshape-1):
#     if (abs(corr.nlargest(corrshape)[1:corrshape][i])>0.2):
#         feature_select2.append(corr.nlargest(corrshape)[1:corrshape].axes[0][i])
# print(corr.nlargest(corrshape)[1:corrshape])
# print(len(feature_select))

In [None]:
# features_fs2 = train_out_2[feature_select2].copy()
# labels_fs2 = train_out_2['SalePrice']
# train_X_fs2, test_X_fs2, train_Y_fs2, test_Y_fs2 = train_test_split(features_fs2,labels_fs2,test_size=0.2, random_state=0)

In [None]:
# # data normalization after feature selection 2
# # fit scaler on training data
# norm = MinMaxScaler().fit(features_liqr[feature_select2])
# # transform training data
# features_fs2_norm = norm.transform(features_liqr[feature_select2])
# train_X_fs2_norm, test_X_fs2_norm, train_Y_fs2_norm, test_Y_fs2_norm = train_test_split(features_fs2_norm,labels_fs2,test_size=0.2, random_state=0)

In [None]:
# # data standardization after feature selection 2
# features_stan=train_out_2.copy()
# for i in continuous:
#     # fit on training data column
#     scale = StandardScaler().fit(train_out_2[[i]])
#     # transform training data column
#     features_stan[i] = scale.transform(features_stan[[i]])
# features_fs2_stan=features_stan[feature_select2].copy()
# train_X_fs2_stan, test_X_fs2_stan, train_Y_fs2_stan, test_Y_fs2_stan = train_test_split(features_fs2_stan,labels_fs2,test_size=0.2, random_state=0)

## Pre-processing v1 - Removing outliers with log transformation: Continuous Variables 

Using logarithmic transformation, it may help correct the distribution of variables.

In [None]:
# # identifying continuous variables with high skewness
# continuous = ['LotArea', 'YearBuilt', 'YearRemodAdd', 
#        'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
#        'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
#        'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr','TotRmsAbvGrd', 'Fireplaces', 
#        'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
#        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
# array1 = skew(train[continuous])
# #the top 15 features which are the most skewed
# index1 = array1.argsort()[-15:][::-1]
# continuous_skewed=[]
# for i in index1:
#     print (continuous[i],": skewness = ", array1[i])
#     if array1[i]>=1:
#         continuous_skewed.append(continuous[i])

In [None]:
# #applying log transformation for selected variables
# train_out=train.copy()
# for i in continuous_skewed:
#     train_out[i] = train_out[i].map(lambda l: np.log(l) if l > 0 else 0)

In [None]:
# #recalculating skewness of continuous variables
# array2 = skew(train_out[continuous])
# #the top 10 features which are the most skewed
# index2 = array2.argsort()[-15:][::-1]
# continuous_skewed=[]
# for i in index2: 
#     print (continuous[i],": skewness = ", array2[i])
#     if array2[i] >= 1:
#         continuous_skewed.append(continuous[i])

In [None]:
# #revert the log transformation if the skewness increases after log transformation
# for i in index2: 
#     print (continuous[i],": skewness = ", array1[i],"\t, skewness2 = ", array2[i])
#     if array2[i] >= array1[i]:
#         train_out[continuous[i]] = train[continuous[i]]

In [None]:
# #recalculating skewness of continuous variables
# array2 = skew(train_out[continuous])
# #the top 10 features which are the most skewed
# index2 = array2.argsort()[-15:][::-1]
# for i in index2: 
#     print (continuous[i],": skewness = ", array2[i])

## Pre-processing v4 - combine v1 and v2

In [None]:
# features_liqr = train_out_2.copy()
# features_liqr.drop(['Id', 'SalePrice'], axis=1, inplace=True)
# labels_liqr = train_out_2['SalePrice']
# # data normalization from sklearn
# # fit scaler on training data
# norm = MinMaxScaler().fit(features_liqr)
# # transform training data
# features_norm = norm.transform(features_liqr)
# train_X_liqr_norm, test_X_liqr_norm, train_Y_liqr_norm, test_Y_liqr_norm = train_test_split(features_norm,labels_liqr,test_size=0.2, random_state=0)

In [None]:
# #investigating the skewness of data features (visually)
# f, axes = plt.subplots(len(continuous), 3, figsize=(20, 200))
# colors = ["r", "g", "b","r","g"]
# colors = colors*7
# count = 0
# for i in continuous:
#     col_ind=features.columns.get_indexer([i])[0]
#     sb.boxplot(x = features_norm[:,col_ind], color = colors[count], ax = axes[count,0])
#     sb.histplot(x = features_norm[:,col_ind], color = colors[count], ax = axes[count,1])
#     sb.violinplot(x = features_norm[:,col_ind], color = colors[count], ax = axes[count,2])
#     count += 1 

## Pre-processing v5 - combine v1 and v3

In [None]:
# # data standardization 
# features_liqr_stan=train_out_2.copy()
# for i in continuous:
#     # fit on training data column
#     scale = StandardScaler().fit(train_out_2[[i]])
#     # transform training data column
#     features_liqr_stan[i] = scale.transform(features_liqr_stan[[i]])
# labels_liqr_stan = train_out_2['SalePrice']
# features_liqr_stan.drop(['Id', 'SalePrice'], axis=1, inplace=True)

# train_X_liqr_stan, test_X_liqr_stan, train_Y_liqr_stan, test_Y_liqr_stan = train_test_split(features_liqr_stan,labels_liqr_stan,test_size=0.2, random_state=0)

In [None]:
# #investigating the skewness of data features (visually)
# f, axes = plt.subplots(len(continuous), 3, figsize=(20, 200))
# colors = ["r", "g", "b","r","g"]
# colors = colors*7
# count = 0
# for i in continuous:
#     sb.boxplot(x = features_stan[i], color = colors[count], ax = axes[count,0])
#     sb.histplot(x = features_stan[i], color = colors[count], ax = axes[count,1])
#     sb.violinplot(x = features_stan[i], color = colors[count], ax = axes[count,2])
#     count += 1 

In [None]:
# #calculating skewness of continuous variables
# array = skew(features_stan[continuous])
# #the top 10 features which are the most skewed
# index = array.argsort()[-10:][::-1]
# for i in index: 
#     print (continuous[i],": skewness = ", array[i])