In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

In [6]:
import os
os.chdir(os.getcwd().replace('/notebooks', ''))

In [7]:
# Load the data
filepath = '../data/raw/train.csv'
data = pd.read_csv(filepath, index_col='Id')

# Fill the missing

## Numerical Features

In [8]:
data['LotFrontage'] = data['LotFrontage'].fillna(data[data['LotFrontage'] < 300]['LotFrontage'].mean())

data['GarageYrBlt'] = data['GarageYrBlt'].fillna(data['GarageYrBlt'].interpolate())
data['MasVnrArea'] = data['MasVnrArea'].fillna(0)

In [9]:
for col in ['LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'MasVnrType']:
    print(data[col].isnull().sum())

0
0
0
872


## Categorical Features

In [10]:
data['MasVnrType'] = data['MasVnrType'].fillna('None')
data.drop(['MiscFeature','PoolQC','Fence','Alley'],axis=1, inplace=True)

In [11]:
lst_of_missing = []
for col in data.columns:
    if data[col].isnull().sum() > 0:
        lst_of_missing.append(col)
        print(f'{col:<13}: {data[col].isnull().sum(): <4} missing values - {data[col].isnull().sum() / len(data) * 100:.2f}% - {data[col].dtype}')

BsmtQual     : 37   missing values - 2.53% - object
BsmtCond     : 37   missing values - 2.53% - object
BsmtExposure : 38   missing values - 2.60% - object
BsmtFinType1 : 37   missing values - 2.53% - object
BsmtFinType2 : 38   missing values - 2.60% - object
Electrical   : 1    missing values - 0.07% - object
FireplaceQu  : 690  missing values - 47.26% - object
GarageType   : 81   missing values - 5.55% - object
GarageFinish : 81   missing values - 5.55% - object
GarageQual   : 81   missing values - 5.55% - object
GarageCond   : 81   missing values - 5.55% - object


In [12]:
print(data['SalePrice'].max())
print(data['SalePrice'].min())

755000
34900


In [13]:
def fill_missing(df, col, i):
    # Step 1: Set point value
    point = {}
    for type in df[col].unique():
        num = df[df[col] == type][col].count()
        if num > 0:
            point[type] = 1/num
        else:
            point[type] = 0
    # Step 2: Find mode value for each range
    mode = {}
    for i in range(5):
        rang = [i*144020, (i+1)*144020]
        data = df[col][df['SalePrice'].between(rang[0], rang[1])]
        max = 0
        for type in data.unique():
            if data[data == type].count()*point[type] > max:
                mode[i] = type
                max = data[data == type].count()*point[type]
    # Step 3: Fill missing value
    if i in data[data[col].isnull()].index:
        for c in mode.keys():
            if df['SalePrice'][i] in range(c*144020, (c+1)*144020):
                df[col][i] = mode[c]

for col in lst_of_missing:
    data[col] = data[col].apply(lambda x: fill_missing(data, col, x) if x == 'Nan' else x)


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemodAdd 

# Encoding

In [15]:
label_encoders = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = label_encoders.fit_transform(data[col])

In [16]:
data.select_dtypes(include=['object'])

1
2
3
4
5
...
1456
1457
1458
1459
1460


In [28]:
data.to_csv(r'../data/afterprocessed/processed_pre_norm.csv')

In [31]:
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,3,65.0,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
2,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
3,60,3,68.0,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
4,70,3,60.0,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
5,60,3,84.0,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


# Normalization

## Use StandardScaler
- StandardScaler is used to scale the data to have a mean of 0 and a standard deviation of 1.

In [17]:
scaler = StandardScaler()
data_to_scale = data.drop('SalePrice', axis=1)
data_to_scale = pd.DataFrame(scaler.fit_transform(data_to_scale), columns=data_to_scale.columns,index=data.index)
data_to_scale['SalePrice'] = data['SalePrice']
data_to_scale.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.073375,-0.045532,-0.226101,-0.207142,0.064238,0.750731,0.314667,-0.02618,0.60467,-0.225716,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.313867,0.208502,208500
2,-0.872563,-0.045532,0.45519,-0.091886,0.064238,0.750731,0.314667,-0.02618,-0.628316,-0.225716,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439,0.313867,0.208502,181500
3,0.073375,-0.045532,-0.089843,0.07348,0.064238,-1.378933,0.314667,-0.02618,0.60467,-0.225716,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.313867,0.208502,223500
4,0.309859,-0.045532,-0.453198,-0.096897,0.064238,-1.378933,0.314667,-0.02618,-1.861302,-0.225716,...,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,0.313867,-3.426284,140000
5,0.073375,-0.045532,0.636868,0.375148,0.064238,-1.378933,0.314667,-0.02618,-0.628316,-0.225716,...,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.313867,0.208502,250000


In [36]:
data_to_scale.to_csv(r'../data/afterprocessed/processed_post_norm.csv')

## Use MinMaxScaler
- MinMaxScaler is used to scale the data to have a range of 0 to 1.

In [18]:
scaler2 = MinMaxScaler()
data_normalized = pd.DataFrame(scaler2.fit_transform(data), columns=data.columns)
data_normalized.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.235294,0.75,0.150685,0.03342,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.090909,0.5,1.0,0.8,0.241078
1,0.0,0.75,0.202055,0.038795,1.0,1.0,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.363636,0.25,1.0,0.8,0.203583
2,0.235294,0.75,0.160959,0.046507,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.727273,0.5,1.0,0.8,0.261908
3,0.294118,0.75,0.133562,0.038561,1.0,0.0,1.0,0.0,0.0,0.0,...,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0,1.0,0.0,0.145952
4,0.235294,0.75,0.215753,0.060576,1.0,0.0,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.8,0.298709


In [29]:
data_normalized.to_csv(r'../data/afterprocessed/processed_post_norm.csv')

# Dimensionality reduction using PCA
- PCA is used to reduce the number of features in the dataset (Use StandardScaler).

In [20]:
pca = PCA(n_components=30)  
features_pca = pca.fit_transform(data_standardized)
print(features_pca)

[[ 1.99318404 -0.06443223 -2.20461466 ... -0.14565682 -0.19883158
  -0.68005967]
 [ 0.23829931 -1.67171958  1.68412438 ...  0.18115273 -0.25851309
   0.91353205]
 [ 2.57091604 -0.15033723 -1.62237786 ... -0.2928107   0.36518579
  -0.46572825]
 ...
 [ 1.82751614  2.85479899  0.08870374 ...  1.09803642  0.61088865
  -1.23594137]
 [-2.97283216 -2.43265229  2.43129568 ...  1.64788013  0.68183193
  -0.23802351]
 [-0.9184661  -2.36390564  1.978655   ...  2.51765906  1.41658365
   0.38997494]]
