## Sample 2 of Exploratory Data Analysis
### EDA of a dataset of house prices and various attributes

In [31]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split as tts
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

path = os.getcwd()
df = pd.read_csv(path+'/Data/House_Price.csv')

#   splitting dataset into 70-30 train-test
train, test = tts(df, test_size=0.3, random_state=309)

Data preprocessing steps

In [32]:
##  Delete duplicate data-points
train = train.drop_duplicates()
test = test.drop_duplicates()

##  handle missing data - remove columns with >50% NA
def remove_cols(dataset):
    remove_cols = []
    null_columns = dataset.isna().sum()
    null_columns = null_columns[null_columns != 0].sort_values(ascending=False)
    null_columns = null_columns.to_dict()

    for key, value in null_columns.items():
        pct = value / len(dataset)
        if pct >= 0.5:
            remove_cols.append(key)

    dataset = dataset.drop(columns=remove_cols)
    return dataset

test = remove_cols(test)
train = remove_cols(train)

In [33]:
from sklearn.preprocessing import OrdinalEncoder
#   encode FireplaceQu into ordinal values e.g. 0 = none 5 = best fireplace
enc = OrdinalEncoder(categories=[[0,'Po', 'Fa', 'TA', 'Gd', 'Ex']])
#   replace NAs with 0
X = np.array(train['FireplaceQu'].fillna(0))
X = X.reshape(-1,1)
train['FireplaceQu'] = enc.fit_transform(X)


#   encode Bsmt Qual/Cond into ordinal values
X = np.array(train['BsmtQual'].fillna(0))
X = X.reshape(-1,1)
train['BsmtQual'] = enc.fit_transform(X)
X = np.array(train['BsmtCond'].fillna(0))
X = X.reshape(-1,1)
train['BsmtCond'] = enc.fit_transform(X)

#   encode Garage Qual/Cond into ordinal values
X = np.array(train['GarageQual'].fillna(0))
X = X.reshape(-1,1)
train['GarageQual'] = enc.fit_transform(X)
X = np.array(train['GarageCond'].fillna(0))
X = X.reshape(-1,1)
train['GarageCond'] = enc.fit_transform(X)

#   replace NAs with 0 in LotFrontage
train['LotFrontage'] = train['LotFrontage'].fillna(0)
train['LotFrontage']

#   drop 'GarageType', 'GarageYrBlt', 'GarageFinish'
garage_drop = ['GarageType', 'GarageYrBlt', 'GarageFinish']
train = train.drop(columns=garage_drop)

#   drop 'BsmtExposure', 'BsmtFinType1', 'BsmtFintype2'
bsmt_drop = ['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
train = train.drop(columns=bsmt_drop)

#   replace NAs in 'MasVnrType' and 'MasVnrArea'
train['MasVnrType'] = train['MasVnrType'].fillna('None')
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

#   replace NAs in 'Electrical' with most common
train['Electrical'] = train['Electrical'].fillna('SBrkr')

In [34]:
#   TEST Set
#   encode FireplaceQu into ordinal values e.g. 0 = none 5 = best fireplace
enc = OrdinalEncoder(categories=[[0,'Po', 'Fa', 'TA', 'Gd', 'Ex']])
#   replace NAs with 0
X = np.array(test['FireplaceQu'].fillna(0))
X = X.reshape(-1,1)
test['FireplaceQu'] = enc.fit_transform(X)


#   encode Bsmt Qual/Cond into ordinal values
X = np.array(test['BsmtQual'].fillna(0))
X = X.reshape(-1,1)
test['BsmtQual'] = enc.fit_transform(X)
X = np.array(test['BsmtCond'].fillna(0))
X = X.reshape(-1,1)
test['BsmtCond'] = enc.fit_transform(X)

#   encode Garage Qual/Cond into ordinal values
X = np.array(test['GarageQual'].fillna(0))
X = X.reshape(-1,1)
test['GarageQual'] = enc.fit_transform(X)
X = np.array(test['GarageCond'].fillna(0))
X = X.reshape(-1,1)
test['GarageCond'] = enc.fit_transform(X)

#   replace NAs with 0 in LotFrontage
test['LotFrontage'] = test['LotFrontage'].fillna(0)
test['LotFrontage']

#   drop 'GarageType', 'GarageYrBlt', 'GarageFinish'
garage_drop = ['GarageType', 'GarageYrBlt', 'GarageFinish']
test = test.drop(columns=garage_drop)

#   drop 'BsmtExposure', 'BsmtFinType1', 'BsmtFintype2'
bsmt_drop = ['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
test = test.drop(columns=bsmt_drop)

#   replace NAs in 'MasVnrType' and 'MasVnrArea'
test['MasVnrType'] = test['MasVnrType'].fillna('None')
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)

#   replace NAs in 'Electrical' with most common
test['Electrical'] = test['Electrical'].fillna('SBrkr')

In [35]:
from sklearn.preprocessing import MinMaxScaler
#   data normalisation for our top 5 features 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF'
top5features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF']
X = train[top5features]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
train[top5features] = X

#   now test set
X = test[top5features]
X = scaler.fit_transform(X)
test[top5features] = X

#   remove redundant columns
redundant_columns = ['Id', 'MSZoning', 'Street', 'LotShape', 'LandContour','LandSlope','Condition1','Condition2']
train = train.drop(columns=redundant_columns)
test = test.drop(columns=redundant_columns)

In [38]:
#   saving processed data to CSV
train.to_csv('training-processed.csv', index=False)
test.to_csv('test-processed.csv', index=False) 

dimensionality reduction techniques

In [None]:
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
##  dimensionality reduction / feature selection

#   select for numeric only features
numer = train.select_dtypes(include=['float','int'])
price = train['SalePrice']
#   scale all features to 0-1
scaler = StandardScaler()
numer = scaler.fit_transform(numer)
numer = pd.DataFrame(data=numer)

#   principal components analysis
pca = decomposition.PCA(n_components=5)
pca.fit(numer)
pca_X = pca.transform(numer)
df = pd.DataFrame(data = pca_X,columns = ['PC1', 'PC2','PC3','PC4','PC5'])
target = pd.Series(price, name='saleprice')
result_df = pd.concat([df, target], axis=1)
print(result_df.head())
print()

#   independent components analysis
ica = decomposition.FastICA(n_components=5,whiten='unit-variance', max_iter=1000)
ica.fit(numer)
ica_X = ica.transform(numer)
df = pd.DataFrame(data = ica_X,columns = ['PC1', 'PC2','PC3','PC4','PC5'])
result_df = pd.concat([df, target], axis=1)
print(result_df.head())


data mining preprocessed data using machine learning methods.

Using ordinary linear regression and ridge regression (with alpha=0.5 ) for predicting the house prices. Comparing their results regarding the mean squared errors on the training set and the test set.

In [72]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

#   linear regression
reg = linear_model.LinearRegression()
train_X = train.iloc[:,0:len(train.columns)-1]
train_X = train.select_dtypes(include=['float64','int'])
train_Y = train.iloc[:,len(train.columns)-1]
#train_X
reg.fit(train_X, train_Y)
test_X = test.iloc[:,0:len(train.columns)-1]
test_X = test.select_dtypes(include=['float64','int'])
predict_test = reg.predict(test_X)

test_Y = test.iloc[:,len(train.columns)-1]
print("LR Mean squared error: %.2f" % mean_squared_error(predict_test, test_Y))

#   ridge regression
ridge = linear_model.Ridge(alpha=0.5)
ridge.fit(train_X, train_Y)
ridge_pred = ridge.predict(test_X)
print("Ridge reg Mean squared error: %.2f" % mean_squared_error(ridge_pred, test_Y))

LR Mean squared error: 0.00
Ridge reg Mean squared error: 0.00


Using Random Forest, comparing the results of linear regression and ridge regression

In [109]:
from sklearn.ensemble import RandomForestClassifier

rf_cls = RandomForestClassifier(max_depth=2, random_state=0)
rf_cls.fit(train_X, train_Y)
rf_pred = rf_cls.predict(test_X)
print("Random Forest Mean squared error: %.2f" % mean_squared_error(rf_pred, test_Y))

Random Forest Mean squared error: 6313780108.17
(438,)
(438,)
