In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.simplefilter(action='ignore')

In [2]:
# Read the raw training data from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Train Test Split

Before beginning to engineer our features, it is important to separate our data into training and testing sets. This avoids over-fitting caused by information leakage.

In [132]:
# Predictors are every column except the row identifier and the target;
# the target is the sale price.
x = df.drop(columns=['Id', 'SalePrice'])
y = df['SalePrice']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    random_state=0,
)
display(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1168, 80)

(292, 80)

(1168,)

(292,)

### Missing Values - Categorical Variables

In [13]:
# Collect the object-dtype (categorical) columns of the training set, then
# show the fraction of missing values in each of them.

cat_vars = x_train.columns[x_train.dtypes == 'O'].tolist()
x_train[cat_vars].isnull().mean()

MSZoning         0.000000
Street           0.000000
Alley            0.939212
LotShape         0.000000
LandContour      0.000000
Utilities        0.000000
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
BldgType         0.000000
HouseStyle       0.000000
RoofStyle        0.000000
RoofMatl         0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
MasVnrType       0.005137
ExterQual        0.000000
ExterCond        0.000000
Foundation       0.000000
BsmtQual         0.023973
BsmtCond         0.023973
BsmtExposure     0.023973
BsmtFinType1     0.023973
BsmtFinType2     0.024829
Heating          0.000000
HeatingQC        0.000000
CentralAir       0.000000
Electrical       0.000856
KitchenQual      0.000000
Functional       0.000000
FireplaceQu      0.471747
GarageType       0.049658
GarageFinish     0.049658
GarageQual       0.049658
GarageCond       0.049658
PavedDrive       0.000000
PoolQC      

In [14]:
# Treat missing categorical values as their own explicit 'Missing' label,
# in both the training and the test frame.

for frame in (x_train, x_test):
    frame[cat_vars] = frame[cat_vars].fillna('Missing')

# Sanity check: list any categorical training column that still holds NaNs
# (an empty list means the fill worked).

x_train[cat_vars].columns[x_train[cat_vars].isnull().any()].tolist()

### Missing Values - Numerical Variables

To engineer missing values in numerical variables, we will:

    - add a binary missing value indicator variable
    - and then replace the missing values in the original variable with the mode

In [31]:
# Numerical (non-object) training columns that contain at least one missing
# value, followed by the fraction of missing values in each.

num_vars = x_train.columns[(x_train.dtypes != 'O') & x_train.isnull().any()].tolist()
x_train[num_vars].isnull().mean()

LotFrontage    0.181507
MasVnrArea     0.005137
GarageYrBlt    0.049658
dtype: float64

In [28]:
#num_vars = [col for col in x_test.columns if x_test[col].dtypes != 'O' and x_test[col].isnull().sum()>0]
#x_test[num_vars].isnull().mean()

In [33]:
# Impute the numerical columns listed in `num_vars` (columns with NaNs in the
# training set). For every such column we:
#   1. add a binary `<column>_na` indicator (1 where the value was missing), and
#   2. replace the missing values with the column's mode.
# The mode is learned from the training set only and reused for the test set,
# so no information from the test data influences the imputation.
# Note: `num_vars` is derived from x_train, so a column that has NaNs only in
# x_test is not handled by this loop.
for each in num_vars:

    mode_train = x_train[each].mode()[0]

    # Build the indicators first, while the NaNs are still present.
    x_train[each+'_na'] = np.where(x_train[each].isnull(), 1, 0)
    x_test[each+'_na'] = np.where(x_test[each].isnull(), 1, 0)

    x_train[each] = x_train[each].fillna(mode_train)
    # Fill the test column from the test column itself. Assigning the train
    # column here would align on the (disjoint) row index and wipe the values.
    x_test[each] = x_test[each].fillna(mode_train)

# Both results should report 0.0 for every imputed column.
display(x_train[num_vars].isnull().mean(), x_test[num_vars].isnull().mean())

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

### Categorical Variables - Replacing with Rare Labels

In [103]:
# Collapse infrequent categories into a single 'Rare' label.
# A label is "frequent" when it covers at least 1% of the training rows; the
# set of frequent labels is learned on x_train and applied to both frames, so
# train and test share the same vocabulary. Test labels that never occur in
# the training data are not frequent by definition and also become 'Rare'.
for each in cat_vars:

    # Share of training rows carried by each label of this column.
    temp = x_train[each].value_counts()/len(x_train)
    frequent_labels = temp[temp>=0.01].index

    # `isin` performs the per-row membership test; everything that is not a
    # frequent label is replaced, all other values are kept unchanged.
    x_train[each] = np.where(x_train[each].isin(frequent_labels), x_train[each], 'Rare')
    x_test[each] = np.where(x_test[each].isin(frequent_labels), x_test[each], 'Rare')

### Categorical Variables - Encoding

Next, we need to transform the strings of the categorical variables into numbers. We will do so in a way that captures the monotonic relationship between the label and the target: labels are ranked by the mean sale price observed for them in the training set.

In [116]:
# Replace each categorical label with its rank when the labels are sorted by
# the mean target value (SalePrice) observed in the training set: 0 for the
# label with the lowest mean price, 1 for the next one, and so on.
#
# Only the categorical columns are encoded; numerical columns are left as they
# are. The target lives in y_train (it was removed from x_train at the split),
# so the means are computed by grouping y_train on the training column; both
# share the same row index.
# The mapping is learned on the training data and reused for the test data.
# A test label that does not occur in the training column has no entry in the
# mapping and becomes NaN.

for each in cat_vars:
    ordered_categorical = y_train.groupby(x_train[each]).mean().sort_values().index
    ordered_label = {value: count for count, value in enumerate(ordered_categorical,0)}
    x_train[each] = x_train[each].map(ordered_label)
    x_test[each] = x_test[each].map(ordered_label)

### Feature Scaling

In [120]:
# 'Id' is already removed when the predictors are built at the train/test
# split, so tolerate its absence instead of raising a KeyError.
x_train = x_train.drop(columns=['Id'], errors='ignore')
x_test = x_test.drop(columns=['Id'], errors='ignore')

# Learn the per-feature min/max on the training data only, then apply that
# same transformation to both sets. Re-fitting on x_test would scale the two
# sets differently and leak test-set statistics into the preprocessing.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_new = scaler.transform(x_train)
x_test_new = scaler.transform(x_test)