## Importing Libraries

In [1527]:
import pandas as pd
import numpy as np
import seaborn as sns
import pylab
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from feature_engine.encoding import RareLabelEncoder

## Constant Declaration

In [1528]:
class FeatureCategory:
    CATEGORICAL = 1
    NUMERICAL = 2
    ORDINAL = 3

target_name = 'SalePrice'
    
num_features = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', \
                'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', \
                 '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', \
                 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', \
                 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', \
                 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', \
                 'MiscVal', 'MoSold', 'YrSold']

cat_features = ['MSZoning', 'Street', 'Alley', \
                'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', \
                'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', \
                'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', \
                'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', \
                'BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', \
                'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', \
                'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', \
                'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition' ]

ord_features = ['MSSubClass', 'OverallQual', 'OverallCond' ]


## Classes

In [1529]:
class FeatureAnalysis:
    
    def validate_imputation(self, x_train, x_test, features):
        features_to_impute = self.analyse_missing_features(x_train, features, True)
        features_no_impute = self.analyse_missing_features(x_train, features, False)

        test_features_to_impute = self.analyse_missing_features(x_test, features, True)
        test_features_no_impute = self.analyse_missing_features(x_test, features, False)
        
        print(f'Number of train features to impute = {len(features_to_impute)}')
        print(f'Number of train features without missing values = {len(features_no_impute)}')
        print(f'Number of test features to impute = {len(test_features_to_impute)}')
        print(f'Number of test features without missing values = {len(test_features_no_impute)}')
        
        if (not len(features_to_impute) == len(test_features_to_impute)):
            print(f'Train and test feature number to impute is NOT the same')
            return False
        
       
        if (not features_to_impute.index.equals(test_features_to_impute.index)):
            print(f'Train and test features to impute are NOT the same')
            return False
        
        print('\n')
        return True
        
    
    def analyse_missing_features(self, df, feature_list, to_impute):
        if (to_impute):
            return df[feature_list] \
                    .isnull().mean() \
                    .loc[lambda x : x > 0] \
                    .sort_values(ascending=False)
        else:
            return df[feature_list] \
                    .isnull().mean() \
                    .loc[lambda x : x == 0] \
                    .sort_values(ascending=False)
            
            
    
    def unique_values(self, df, feature_name):
        feature_values = df[feature_name].unique()
        feature_values.sort()
        return feature_values
    
    def draw_feature_plots(self, df, feature_name, target_name, is_categorical):
        feature = df[feature_name]
        fig, axs = plt.subplots(2, 2)
        fig.suptitle(f'"{feature_name}" feature analysis')

        axs[0, 0].hist(feature)
        
        if (is_categorical == False):
            axs[0, 1].boxplot(feature)
    
        stats.probplot(df[feature_name], dist='norm', plot=axs[1, 0])
        
        if (not (target_name == '')):
            axs[1, 1].scatter(feature, df[target_name])

        plt.show()
    

## Loading training and testing sets

In [1530]:
train_df = pd.read_csv('../datasets/house_prices/train.csv')
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [1531]:
x_train, x_test, y_train, y_test = train_test_split(train_df, train_df[target_name], test_size=0.3, random_state=123)

In [1532]:
print('Number of FEATURES = ' + str(len(num_features) + len(cat_features) + len(ord_features)))
print('Number of ROWS = ' + str(len(train_df)))
print('Number of the NUMERICAL features = ' + str(len(num_features)))
print('Number of the CATEGORICAL features = ' + str(len(cat_features)))
print('Number of the ORDINAL features = ' + str(len(ord_features)))
print(f'Train X dataset size = {len(x_train)}')
print(f'Test X dataset size = {len(x_test)}')
print('All FEATURES = ' + str(x_train.columns.values))

Number of FEATURES = 79
Number of ROWS = 1460
Number of the NUMERICAL features = 33
Number of the CATEGORICAL features = 43
Number of the ORDINAL features = 3
Train X dataset size = 1022
Test X dataset size = 438
All FEATURES = ['Id' 'MSSubClass' 'MSZoning' 'LotFrontage' 'LotArea' 'Street' 'Alley'
 'LotShape' 'LandContour' 'Utilities' 'LotConfig' 'LandSlope'
 'Neighborhood' 'Condition1' 'Condition2' 'BldgType' 'HouseStyle'
 'OverallQual' 'OverallCond' 'YearBuilt' 'YearRemodAdd' 'RoofStyle'
 'RoofMatl' 'Exterior1st' 'Exterior2nd' 'MasVnrType' 'MasVnrArea'
 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond' 'BsmtExposure'
 'BsmtFinType1' 'BsmtFinSF1' 'BsmtFinType2' 'BsmtFinSF2' 'BsmtUnfSF'
 'TotalBsmtSF' 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' '1stFlrSF'
 '2ndFlrSF' 'LowQualFinSF' 'GrLivArea' 'BsmtFullBath' 'BsmtHalfBath'
 'FullBath' 'HalfBath' 'BedroomAbvGr' 'KitchenAbvGr' 'KitchenQual'
 'TotRmsAbvGrd' 'Functional' 'Fireplaces' 'FireplaceQu' 'GarageType'
 'GarageYrBlt' '

In [1533]:
fa = FeatureAnalysis()

## Feature imputation

In [1534]:
imputed_features = []
imputed_train_df = pd.DataFrame()
imputed_test_df = pd.DataFrame()
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
frequent_cat_imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')

### Numerical feature imputation

In [1535]:
num_features_to_impute = fa.analyse_missing_features(x_train, num_features, True)
num_features_no_impute = fa.analyse_missing_features(x_train, num_features, False)

is_valid = fa.validate_imputation(x_train, x_test, num_features)

if(is_valid):
    print('Numberical feature imputation is VALID')
else:
    print('Numerical feature imputaion ERROR')

imputed_train_df = pd.concat([imputed_train_df, x_train[num_features_no_impute.index]])
imputed_test_df =  pd.concat([imputed_test_df, x_test[num_features_no_impute.index]])

imputed_features = imputed_features + num_features_no_impute.index.values.tolist()

print(f'Number of imputed features is: {len(imputed_features)}')
num_features_to_impute

Number of train features to impute = 3
Number of train features without missing values = 30
Number of test features to impute = 3
Number of test features without missing values = 30


Numberical feature imputation is valid
Number of imputed features is: 30


LotFrontage    0.159491
GarageYrBlt    0.059687
MasVnrArea     0.003914
dtype: float64

In [1536]:
len(imputed_train_df)

1022

In [1537]:
median_imputer.fit(x_train[num_features_to_impute.index])
num_features_imputed = median_imputer.transform(x_train[num_features_to_impute.index])
num_features_imputed_df = pd.DataFrame(num_features_imputed, columns=num_features_to_impute.index)

for column in num_features_imputed_df.columns:
    imputed_train_df[column] = num_features_imputed_df[column]
    
#print(str(len(imputed_train_df.columns)))
imputed_train_df.head()

Unnamed: 0,LotArea,YearBuilt,MoSold,MiscVal,PoolArea,ScreenPorch,3SsnPorch,EnclosedPorch,OpenPorchSF,WoodDeckSF,...,1stFlrSF,TotalBsmtSF,BsmtUnfSF,BsmtFinSF2,BsmtFinSF1,YearRemodAdd,YrSold,LotFrontage,GarageYrBlt,MasVnrArea
376,8846,1996,7,0,0,0,0,0,0,0,...,914,870,572,0,298,1996,2006,68.0,1967.0,0.0
250,5350,1940,5,450,0,0,0,0,0,263,...,1306,728,728,0,0,1966,2010,60.0,1916.0,0.0
228,8521,1967,5,0,0,0,0,0,0,0,...,912,912,70,0,842,1967,2010,68.0,1954.0,0.0
40,8658,1965,12,0,0,0,0,0,138,0,...,1324,1088,445,0,643,1965,2006,60.0,1985.0,0.0
428,6762,2007,9,0,0,0,0,0,54,105,...,1208,1208,544,0,664,2007,2007,70.0,2003.0,120.0


In [1538]:
 imputed_features = imputed_features + num_features_to_impute.index.values.tolist()
num_features_to_impute = fa.analyse_missing_features(imputed_train_df, num_features, True)

if(len(num_features_to_impute) == 0):
    print('All numberical features were imputed successfully')
else:
    print('Warning ! Not all numberical features were imputed')
    
print(f'Number of impute features is: {len(imputed_features)}')

Number of impute features is: 33


### Categorical feature imputation

In [1539]:
cat_features_no_impute = fa.analyse_missing_features(x_train, cat_features, False)
cat_features_no_impute.index

Index(['MSZoning', 'Exterior1st', 'SaleType', 'PavedDrive', 'Functional',
       'KitchenQual', 'CentralAir', 'HeatingQC', 'Heating', 'Foundation',
       'ExterCond', 'ExterQual', 'Exterior2nd', 'RoofMatl', 'Street',
       'RoofStyle', 'HouseStyle', 'BldgType', 'Condition2', 'Condition1',
       'Neighborhood', 'LandSlope', 'LotConfig', 'Utilities', 'LandContour',
       'LotShape', 'SaleCondition'],
      dtype='object')

In [1540]:
#Printing all categorical features to impute
cat_features_to_impute = fa.analyse_missing_features(x_train, cat_features, True)

cat_features_no_impute = fa.analyse_missing_features(x_train, cat_features, False)

for feature_name in cat_features_no_impute.index:
    imputed_train_df[feature_name] = x_train[feature_name]

cat_features_to_impute

PoolQC          0.995108
MiscFeature     0.963796
Alley           0.931507
Fence           0.804305
FireplaceQu     0.473581
GarageType      0.059687
GarageFinish    0.059687
GarageQual      0.059687
GarageCond      0.059687
BsmtExposure    0.027397
BsmtQual        0.026419
BsmtCond        0.026419
BsmtFinType1    0.026419
BsmtFinType2    0.026419
MasVnrType      0.003914
Electrical      0.000978
dtype: float64

In [1541]:
#Dropping features such that missing value percent is greater than 20%
not_imputable_cat_features = cat_features_to_impute.loc[lambda x : x >= 0.1]
cat_features_to_impute = cat_features_to_impute.loc[lambda x : x < 0.1]

print(f'Number of the not imputable feature is {len(not_imputable_cat_features)}')

cat_features_to_impute

Number of the not imputable feature is 5


GarageType      0.059687
GarageFinish    0.059687
GarageQual      0.059687
GarageCond      0.059687
BsmtExposure    0.027397
BsmtQual        0.026419
BsmtCond        0.026419
BsmtFinType1    0.026419
BsmtFinType2    0.026419
MasVnrType      0.003914
Electrical      0.000978
dtype: float64

In [1542]:
for feature_name in cat_features_to_impute.index:
    
    most_frequent_category = x_train[feature_name].mode()
    imputed_train_df[feature_name] = x_train[feature_name]
    imputed_train_df[imputed_train_df[feature_name].isnull()] = most_frequent_category[0]
    

In [1543]:
cat_features_to_impute = fa.analyse_missing_features(imputed_train_df, cat_features_to_impute.index, True)

if(len(cat_features_to_impute) == 0):
    print('All categorical features were imputed successfully')
else:
    print('Warning ! Not all categorical features were imputed')


All categorical features were imputed successfully


In [1544]:
print('Listing not imputable categorical features')
not_imputable_cat_feature_list = not_imputable_cat_features.index.values.tolist()


Listing not imputable categorical features


In [1545]:
imputed_cat_features = []

for feature_name in cat_features:
    if(not (feature_name in not_imputable_cat_feature_list)):
        imputed_cat_features.append(feature_name)

    
imputed_features = imputed_features + imputed_cat_features
print(f'Number of impute features is: {len(imputed_features)}')

Number of impute features is: 71


### Ordianl feature imputation

In [1546]:
ord_features_to_impute = fa.analyse_missing_features(x_train, ord_features, True)
ord_features_no_impute = fa.analyse_missing_features(x_train, ord_features, False)

if (len(ord_features_to_impute) == 0):
    print('No missing values were found for the ordinal features')

No missing values were found for the ordinal features


In [1547]:
# Ordinal features do not have missing values
# Adding all ordinal features to the imputed dataframe

for feature_name in ord_features_no_impute.index:
    imputed_train_df[feature_name] = x_train[feature_name]
    
imputed_features = imputed_features + ord_features_no_impute.index.values.tolist()
print(f'Number of impute features is: {len(imputed_features)}')

Number of impute features is: 74


## Categorical feature encoding

### Rare categories handling

In [1548]:
for feature_name in imputed_cat_features:
    feature_cat_info = imputed_train_df.groupby(feature_name)[feature_name].count() / len(imputed_train_df)
    feature_cat_info = feature_cat_info.sort_values(ascending=False)
    
    all_cat_number = len(feature_cat_info)
    rare_cat_number = len(feature_cat_info.loc[lambda x : x > 0.05])
    non_rare_cat_number = all_cat_number - rare_cat_number

    print(f'Feature "{feature_name}" has {rare_cat_number} rare categories and {non_rare_cat_number} NON rare') 

Feature "MSZoning" has 3 rare categories and 7 NON rare
Feature "Street" has 2 rare categories and 5 NON rare
Feature "LotShape" has 3 rare categories and 6 NON rare
Feature "LandContour" has 2 rare categories and 7 NON rare
Feature "Utilities" has 2 rare categories and 5 NON rare
Feature "LotConfig" has 3 rare categories and 7 NON rare
Feature "LandSlope" has 2 rare categories and 6 NON rare
Feature "Neighborhood" has 6 rare categories and 24 NON rare
Feature "Condition1" has 2 rare categories and 12 NON rare
Feature "Condition2" has 2 rare categories and 11 NON rare
Feature "BldgType" has 3 rare categories and 7 NON rare
Feature "HouseStyle" has 4 rare categories and 9 NON rare
Feature "RoofStyle" has 3 rare categories and 8 NON rare
Feature "RoofMatl" has 2 rare categories and 10 NON rare
Feature "Exterior1st" has 6 rare categories and 11 NON rare
Feature "Exterior2nd" has 6 rare categories and 14 NON rare
Feature "MasVnrType" has 3 rare categories and 2 NON rare
Feature "ExterQual"

In [1549]:
rare_label_encoder = RareLabelEncoder(tol=0.05, n_categories=2, variables=imputed_cat_features, replace_with='Rare')
rare_label_encoder.fit(imputed_train_df[imputed_cat_features])

rare_cat_encoded_df = rare_label_encoder.transform(imputed_train_df[imputed_cat_features])

for feature_name in imputed_cat_features:
    imputed_train_df[feature_name] = rare_cat_encoded_df[feature_name]
    rare_cat_rows_percent = round(len(imputed_train_df \
        .loc[imputed_train_df[feature_name] == 'Rare'][feature_name]) / len(imputed_train_df), 3)
    print(f'Feature "{feature_name}" has {rare_cat_rows_percent} rows')


Feature "MSZoning" has 0.088 rows
Feature "Street" has 0.037 rows
Feature "LotShape" has 0.068 rows
Feature "LandContour" has 0.126 rows
Feature "Utilities" has 0.033 rows
Feature "LotConfig" has 0.112 rows
Feature "LandSlope" has 0.076 rows
Feature "Neighborhood" has 0.537 rows
Feature "Condition1" has 0.158 rows
Feature "Condition2" has 0.045 rows
Feature "BldgType" has 0.096 rows
Feature "HouseStyle" has 0.11 rows
Feature "RoofStyle" has 0.054 rows
Feature "RoofMatl" has 0.047 rows
Feature "Exterior1st" has 0.137 rows
Feature "Exterior2nd" has 0.144 rows
Feature "MasVnrType" has 0.012 rows
Feature "ExterQual" has 0.079 rows
Feature "ExterCond" has 0.043 rows
Feature "Foundation" has 0.038 rows
Feature "BsmtQual" has 0.061 rows
Feature "BsmtCond" has 0.108 rows
Feature "BsmtExposure" has 0.031 rows
Feature "BsmtFinType1" has 0.005 rows
Feature "BsmtFinType2" has 0.114 rows
Feature "Heating" has 0.044 rows
Feature "HeatingQC" has 0.058 rows
Feature "CentralAir" has 0.075 rows
Feature 