## Importing Libraries

In [None]:
import pandas as pd
import numpy as np

print(np.version.version)

import seaborn as sns
import pylab
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset

from torcheval.metrics import R2Score

## Constant Declaration

In [None]:
class FeatureCategory:
    """String labels for the kinds of features handled in this notebook.

    The values are interpolated into the progress messages printed by
    ``FeatureAnalysis.has_features_to_impute``.
    """

    CATEGORICAL = 'CATEGORICAL'
    NUMERICAL = 'NUMERICAL'
    # Must be a plain string: a trailing comma here would create a 1-tuple.
    ORDINAL = 'ORDINAL'
    ALL = 'ALL'

# Name of the column used as the prediction target.
target_name = 'SalePrice'

# Columns handled as numerical features.
num_features = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold',
]

# Columns handled as categorical features.
cat_features = [
    'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]

# Columns handled as ordinal features.
ord_features = ['MSSubClass', 'OverallQual', 'OverallCond']


## Helper Classes

In [None]:
class FeatureAnalysis:
    """Helpers for inspecting missing values and plotting single features."""

    def assign_df(self, df, df_to_append):
        """Copy every column of ``df_to_append`` into ``df`` in place.

        Values are copied positionally (through ``.values``), so the index of
        ``df_to_append`` is ignored.
        """
        for column in df_to_append.columns:
            df[column] = df_to_append[column].values

    def has_features_to_impute(self, df, features, feature_category, is_test):
        """Report whether any of ``features`` still has missing values in ``df``.

        Prints a message naming the split ('test' when ``is_test`` is truthy,
        otherwise 'train') and ``feature_category``.

        Returns:
            True if at least one feature has missing values, False otherwise.
        """
        set_type = 'test' if is_test else 'train'
        features_to_impute = self.analyse_missing_features(df, features, True)

        if len(features_to_impute) == 0:
            print(f'All {set_type} {feature_category} features were imputed successfully')
            return False

        # Name the split in the warning as well, so consecutive train/test
        # calls can be told apart.
        print(f'Warning ! Not all {set_type} {feature_category} features were imputed')
        return True

    def validate_imputation(self, x_train, x_test, features):
        """Check that train and test have missing values in the same features.

        Prints the number of features with and without missing values for
        both splits.

        Returns:
            True when both splits need imputation for exactly the same set of
            features, False otherwise.
        """
        features_to_impute = self.analyse_missing_features(x_train, features, True)
        features_no_impute = self.analyse_missing_features(x_train, features, False)

        test_features_to_impute = self.analyse_missing_features(x_test, features, True)
        test_features_no_impute = self.analyse_missing_features(x_test, features, False)

        print(f'Number of train features to impute = {len(features_to_impute)}')
        print(f'Number of train features without missing values = {len(features_no_impute)}')
        print(f'Number of test features to impute = {len(test_features_to_impute)}')
        print(f'Number of test features without missing values = {len(test_features_no_impute)}')

        if len(features_to_impute) != len(test_features_to_impute):
            print(f'Train and test feature number to impute is NOT the same')
            return False

        # Compare as sets: each result is ordered by its own missing ratio,
        # so the same features can legitimately appear in a different order.
        if set(features_to_impute.index) != set(test_features_to_impute.index):
            print(f'Train and test features to impute are NOT the same')
            return False

        print('\n')
        return True

    def analyse_missing_features(self, df, feature_list, to_impute):
        """Return the missing-value ratio of the selected features.

        Args:
            df: DataFrame to inspect.
            feature_list: Column names to look at.
            to_impute: When truthy, keep only features with missing values;
                otherwise keep only features without any.

        Returns:
            Series indexed by feature name holding the fraction of missing
            values, sorted in descending order.
        """
        missing_ratio = df[feature_list].isnull().mean()
        selected = missing_ratio > 0 if to_impute else missing_ratio == 0
        return missing_ratio[selected].sort_values(ascending=False)

    def unique_values(self, df, feature_name):
        """Return the sorted distinct values of column ``feature_name``."""
        feature_values = df[feature_name].unique()
        feature_values.sort()
        return feature_values

    def draw_feature_plots(self, df, feature_name, target_name, is_categorical):
        """Show a 2x2 grid of diagnostic plots for one feature.

        The grid holds a histogram, a box plot (skipped when
        ``is_categorical`` is truthy), a normal probability plot and a
        scatter plot against ``target_name`` (skipped when ``target_name``
        is the empty string).
        """
        feature = df[feature_name]
        fig, axs = plt.subplots(2, 2)
        fig.suptitle(f'"{feature_name}" feature analysis')

        axs[0, 0].hist(feature)

        if not is_categorical:
            axs[0, 1].boxplot(feature)

        stats.probplot(feature, dist='norm', plot=axs[1, 0])

        if target_name != '':
            axs[1, 1].scatter(feature, df[target_name])

        plt.show()
    

## Loading training and testing sets

In [None]:
# Load the full labelled dataset; it is split into train/test parts in a later cell.
train_df = pd.read_csv('../datasets/house_prices/train.csv')
train_df.head()

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
# The whole frame is passed as X, so x_train / x_test still contain the target column.
x_train, x_test, y_train, y_test = train_test_split(train_df, train_df[target_name], test_size=0.3, random_state=123)

In [None]:
# Summary of the feature groups and of the train/test split sizes.
print(f'Number of FEATURES = {len(num_features) + len(cat_features) + len(ord_features)}')
print(f'Number of ROWS = {len(train_df)}')
print(f'Number of the NUMERICAL features = {len(num_features)}')
print(f'Number of the CATEGORICAL features = {len(cat_features)}')
print(f'Number of the ORDINAL features = {len(ord_features)}')
print('Train X dataset size = ' + str(len(x_train)))
print('Test X dataset size = ' + str(len(x_test)))
print(f'All FEATURES = {str(x_train.columns.values)}')

In [None]:
# Shared helper instance used by the imputation cells below.
fa = FeatureAnalysis()

## Feature imputation

In [None]:
# Shared state for the imputation steps below.
imputed_features = []              # names of the features already handled
imputed_train_df = pd.DataFrame()  # filled column by column via fa.assign_df
imputed_test_df = pd.DataFrame()
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
frequent_cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

### Numerical feature imputation

In [None]:
# Split the numerical features by whether the training split has missing values.
num_features_no_impute = fa.analyse_missing_features(x_train, num_features, False)
num_features_to_impute = fa.analyse_missing_features(x_train, num_features, True)

# Train and test must need imputation for the same features.
is_valid = fa.validate_imputation(x_train, x_test, num_features)
print('Numberical feature imputation is VALID' if is_valid else 'Numerical feature imputaion ERROR')

# Features without missing values are copied over unchanged.
fa.assign_df(imputed_train_df, x_train[num_features_no_impute.index])
fa.assign_df(imputed_test_df, x_test[num_features_no_impute.index])
imputed_features = [*imputed_features, *num_features_no_impute.index.values.tolist()]

print(f'Number of imputed features is: {len(imputed_features)}')
num_features_to_impute

In [None]:
# Fit the median imputer on the training split only, then apply it to both splits.
median_imputer.fit(x_train[num_features_to_impute.index])

num_features_imputed = median_imputer.transform(x_train[num_features_to_impute.index])
test_num_features_imputed = median_imputer.transform(x_test[num_features_to_impute.index])

# Wrap the transformed values back into DataFrames carrying the original column names.
num_features_imputed_df = pd.DataFrame(num_features_imputed, columns=num_features_to_impute.index)
test_num_features_imputed_df = pd.DataFrame(test_num_features_imputed, columns=num_features_to_impute.index)

# Add the imputed columns to the accumulated train/test frames.
fa.assign_df(imputed_train_df, num_features_imputed_df)
fa.assign_df(imputed_test_df, test_num_features_imputed_df)

imputed_train_df.head()

In [None]:
# Record the numerical features that were just imputed.
imputed_features = imputed_features + num_features_to_impute.index.values.tolist()

# Confirm that no numerical feature has missing values left in either split.
fa.has_features_to_impute(imputed_train_df, num_features, FeatureCategory.NUMERICAL, False)
fa.has_features_to_impute(imputed_test_df, num_features, FeatureCategory.NUMERICAL, True)
    
print(f'Number of impute features is: {len(imputed_features)}')

# Columns that still contain missing values (the result is not assigned to anything).
imputed_train_df.isnull().mean()[lambda x : x > 0]
len(imputed_train_df)

### Categorical feature imputation

In [None]:
# Categorical features that have no missing values in the training split.
cat_features_no_impute = fa.analyse_missing_features(x_train, cat_features, False)
cat_features_no_impute.index

In [None]:
# Split the categorical features by whether the training split has missing values.
cat_features_no_impute = fa.analyse_missing_features(x_train, cat_features, False)
cat_features_to_impute = fa.analyse_missing_features(x_train, cat_features, True)

# Train and test must need imputation for the same features.
is_valid = fa.validate_imputation(x_train, x_test, cat_features)
print('Categorical feature imputation is VALID' if is_valid else 'Categorical feature imputaion ERROR')

# Features without missing values are copied over unchanged.
fa.assign_df(imputed_train_df, x_train[cat_features_no_impute.index])
fa.assign_df(imputed_test_df, x_test[cat_features_no_impute.index])

# Display the categorical features that still need imputation.
cat_features_to_impute

In [None]:
# Setting aside features whose missing-value ratio is 10% or more; only the
# features below that ratio are kept for imputation.
# NOTE(review): this comment used to say 20% while the code uses 0.1 - confirm the intended cut-off.
not_imputable_cat_features = cat_features_to_impute.loc[lambda x : x >= 0.1]
cat_features_to_impute = cat_features_to_impute.loc[lambda x : x < 0.1]

print(f'Number of the not imputable feature is {len(not_imputable_cat_features)}')

cat_features_to_impute

In [None]:
# Fill missing categorical values with the most frequent category.
for feature_name in cat_features_to_impute.index:
    # The mode is taken from the training split and reused for the test split.
    most_frequent_category = x_train[feature_name].mode()
    fill_value = most_frequent_category[0]

    for target_df, source_df in ((imputed_train_df, x_train), (imputed_test_df, x_test)):
        target_df[feature_name] = source_df[feature_name].values
        missing_rows = target_df[feature_name].isnull()
        target_df.loc[missing_rows, feature_name] = fill_value
    

In [None]:
# Display the categorical features selected for imputation with their missing ratios.
cat_features_to_impute

In [None]:
print('Listing not imputable categorical features')
not_imputable_cat_feature_list = not_imputable_cat_features.index.values.tolist()

# Show the list that the message above announces.
not_imputable_cat_feature_list


In [None]:
# Categorical features that are kept, i.e. everything except the not-imputable ones.
imputed_cat_features = [
    feature_name
    for feature_name in cat_features
    if feature_name not in not_imputable_cat_feature_list
]

imputed_features = imputed_features + imputed_cat_features
print(f'Number of impute features is: {len(imputed_features)}')

In [None]:
# Confirm that no kept categorical feature has missing values left in either split.
fa.has_features_to_impute(imputed_train_df, imputed_cat_features, FeatureCategory.CATEGORICAL, False)
fa.has_features_to_impute(imputed_test_df, imputed_cat_features, FeatureCategory.CATEGORICAL, True)

In [None]:
# Preview the imputed training frame.
imputed_train_df.head()

### Ordinal feature imputation

In [None]:
# Split the ordinal features by whether the training split has missing values.
ord_features_to_impute = fa.analyse_missing_features(x_train, ord_features, True)
ord_features_no_impute = fa.analyse_missing_features(x_train, ord_features, False)

# Nothing is printed when some ordinal feature does have missing values.
if (len(ord_features_to_impute) == 0):
    print('No missing values were found for the ordinal features')

In [None]:
# Check every column collected so far for remaining missing values, in both splits.
fa.has_features_to_impute(imputed_train_df, imputed_train_df.columns, FeatureCategory.ALL, False)
fa.has_features_to_impute(imputed_test_df, imputed_test_df.columns, FeatureCategory.ALL, True)

In [None]:
# Ordinal features are expected to have no missing values.
# Adding the ordinal features without missing values to the imputed dataframes.
# Only the columns in ord_features_no_impute are copied; a feature with missing
# values would be left out here.

fa.assign_df(imputed_train_df, x_train[ord_features_no_impute.index])
fa.assign_df(imputed_test_df, x_test[ord_features_no_impute.index])
    
imputed_features = imputed_features + ord_features_no_impute.index.values.tolist()
print(f'Number of impute features is: {len(imputed_features)}')

## Categorical feature encoding

### Rare categories handling

In [None]:
# Report, per categorical feature, how many categories are rare and how many are not.
for feature_name in imputed_cat_features:
    # Share of training rows that falls into each category of this feature.
    feature_cat_info = imputed_train_df.groupby(feature_name)[feature_name].count() / len(imputed_train_df)
    feature_cat_info = feature_cat_info.sort_values(ascending=False)

    all_cat_number = len(feature_cat_info)
    # Categories covering more than 5% of the rows are the NON rare ones;
    # everything else is rare.
    non_rare_cat_number = len(feature_cat_info.loc[lambda x : x > 0.05])
    rare_cat_number = all_cat_number - non_rare_cat_number

    print(f'Feature "{feature_name}" has {rare_cat_number} rare categories and {non_rare_cat_number} NON rare') 

In [None]:
# Fit the rare-label encoder on the training split, then apply it to both splits.
# It is configured with tol=0.05 and the replacement label 'Rare'.
rare_label_encoder = RareLabelEncoder(tol=0.05, n_categories=2, variables=imputed_cat_features, replace_with='Rare')
rare_label_encoder.fit(imputed_train_df[imputed_cat_features])

rare_cat_encoded_df = rare_label_encoder.transform(imputed_train_df[imputed_cat_features])
test_rare_cat_encoded_df = rare_label_encoder.transform(imputed_test_df[imputed_cat_features])

# Overwrite the categorical columns with their rare-label-encoded versions.
fa.assign_df(imputed_train_df, rare_cat_encoded_df[imputed_cat_features])
fa.assign_df(imputed_test_df, test_rare_cat_encoded_df[imputed_cat_features])

### Categorical feature encoding

In [None]:
# Encoder for the kept categorical features; it is fitted together with the target in the next cell.
cat_encoder = OrdinalEncoder(encoding_method='ordered', variables=imputed_cat_features)

In [None]:
# Give y_train the same index as imputed_train_df (this overwrites y_train's index in place).
y_train.index = imputed_train_df.index

# Fit on the training features and target only.
cat_encoder.fit(imputed_train_df, y_train)

# Replace both frames with their encoded versions.
imputed_train_df = cat_encoder.transform(imputed_train_df)
imputed_test_df = cat_encoder.transform(imputed_test_df)

In [None]:
# Preview the encoded categorical columns.
imputed_train_df[imputed_cat_features].head()

## Feature Scaling

In [None]:
# Min-max scaler with default settings; fitted in the next cell.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training frame only.
scaler.fit(imputed_train_df)

In [None]:
# Per-column maxima recorded by the fitted scaler.
print('Scalling maximum values.')
scaler.data_max_

In [None]:
# Per-column minima recorded by the fitted scaler.
print('Scalling minimum values.')
scaler.data_min_

In [None]:
# Apply the train-fitted scaler to both splits and restore the column names.
scalled_train_df = pd.DataFrame(scaler.transform(imputed_train_df), columns=imputed_train_df.columns)
scalled_test_df = pd.DataFrame(scaler.transform(imputed_test_df), columns=imputed_test_df.columns)

scalled_train_df.head()

## Feature Selection

In [None]:
# Feature selector driven by a linear regression estimator; fitted in the next cell.
selector = SelectFromModel(estimator=LinearRegression())

In [None]:
# Give y_train the same index as scalled_train_df (this overwrites y_train's index in place).
y_train.index = scalled_train_df.index

# Fit the selector on the scaled training data and keep the names of the columns it supports.
selector.fit(scalled_train_df, y_train)
selected_features_index = scalled_train_df.columns[selector.get_support()]
selected_features = selected_features_index.values

selected_features

In [None]:
# Preview the scaled training data restricted to the selected features.
scalled_train_df[selected_features].head()

## Linear Regression model training

In [None]:
# Train a linear regression on the selected features and predict the held-out split.
regressor = LinearRegression()

regressor.fit(scalled_train_df[selected_features], y_train)
predicated_y_test = regressor.predict(scalled_test_df[selected_features])


In [None]:
# The fitted coefficients, one per selected feature.
print('Coefficients: \n', regressor.coef_)

In [None]:
# The mean squared error of the linear regression on the held-out split.
print('The mean squered error is:')
mean_squared_error(y_test, predicated_y_test)

In [None]:
# The R2 score of the linear regression on the held-out split.
print('The R squered score is:')
r2_score(y_test, predicated_y_test)

## Neural Network model training

### Network configuration classes

In [None]:
class LayerType:
    """Display labels for the role a layer plays in the network.

    The values are embedded in the text produced by ``LayerItem.__str__``.
    """

    INPUT = 'Input'
    OUTPUT = 'Output'
    HIDDEN = 'Hidden'

class NetConfiguration:
    """Layer layout plus training hyper-parameters for a network.

    Args:
        layers: Object describing the layers (a ``LayerConfiguration`` in
            this notebook).
        learning_rate: Optimizer learning rate; defaults to 0.
        epochs: Number of training epochs; defaults to 0.
        batch_size: Mini-batch size; defaults to 0.

    The hyper-parameters can be passed to the constructor or assigned as
    attributes afterwards.
    """

    def __init__(self, layers, learning_rate=0, epochs=0, batch_size=0):
        self.layers = layers
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
    
    
class LayerItem:
    """Description of one layer: its name, type label and in/out sizes."""

    def __init__(self, name, layer_type, input_size, output_size):
        self.name, self.type = name, layer_type
        self.input_size, self.output_size = input_size, output_size

    def __str__(self) -> str:
        # Rendered as Layer(<name>)=<type>(<input_size>, <output_size>)
        sizes = f'({self.input_size}, {self.output_size})'
        return f'Layer({self.name})={self.type}{sizes}'
        
class LayerConfiguration:
    """Fluent builder that collects the layers of a network.

    Every ``use_*`` method records a ``LayerItem`` and returns ``self`` so
    calls can be chained. Layers are remembered in the order they were
    added; the input, output and hidden layers are also tracked separately.
    """

    def __init__(self):
        self.__input_layer = None
        self.__output_layer = None
        self.__hidden_layers = []
        self.__layers = []

    def __add_and_get(self, layer_name, layer_type, input_size, output_size):
        """Create a layer item, record it in insertion order and return it."""
        created = LayerItem(layer_name, layer_type, input_size, output_size)
        self.__layers.append(created)
        return created

    def use_input_layer(self, input_size, output_size):
        """Register the input layer (named 'input')."""
        self.__input_layer = self.__add_and_get('input', LayerType.INPUT, input_size, output_size)
        return self

    def use_output_layer(self, input_size, output_size):
        """Register the output layer (named 'output')."""
        self.__output_layer = self.__add_and_get('output', LayerType.OUTPUT, input_size, output_size)
        return self

    def use_hidden_layer(self, input_size, output_size):
        """Register one hidden layer, named 'hidden_<k>' with k counted from 1."""
        position = len(self.__hidden_layers) + 1
        created = self.__add_and_get(f'hidden_{position}', LayerType.HIDDEN, input_size, output_size)
        self.__hidden_layers.append(created)
        return self

    def use_hidden_layers(self, size, units):
        """Register ``units`` hidden layers, each mapping ``size`` to ``size``."""
        for _ in range(units):
            self.use_hidden_layer(size, size)

        return self

    def get_input_layer(self):
        """Return the registered input layer, or None if there is none."""
        return self.__input_layer

    def get_output_layer(self):
        """Return the registered output layer, or None if there is none."""
        return self.__output_layer

    def get_hidden_layers(self):
        """Return the list of registered hidden layers."""
        return self.__hidden_layers

    def __str__(self) -> str:
        # Each layer is rendered on its own line, prefixed by ' \n'.
        return ''.join(f' \n{str(layer)}' for layer in self.__layers)
        

### Network configuration

In [None]:
# The network takes every column of the scaled training frame as input.
input_layer_size = len(scalled_train_df.columns)
hidden_layer_size = 64
hidden_layers = 2

# Layout: input layer -> `hidden_layers` hidden layers of equal size -> single-output layer.
layer_configuration = LayerConfiguration()
layer_configuration \
    .use_input_layer(input_layer_size, hidden_layer_size) \
    .use_hidden_layers(hidden_layer_size, hidden_layers) \
    .use_output_layer(hidden_layer_size, 1)

# Training hyper-parameters.
net_config = NetConfiguration(layer_configuration)
net_config.learning_rate = .001
net_config.epochs = 100
net_config.batch_size = 64


print(str(layer_configuration))

In [None]:
class HousePricesExperiment:
    """Mini-batch training loop for a regression model.

    Args:
        config: Object exposing ``epochs`` and ``batch_size``.
        model: The network to train.
        loss_method: Callable computing the loss from (predictions, targets).
        optimizer: Optimizer that updates the model's parameters.
    """

    def __init__(self, config, model, loss_method, optimizer):
        self.epochs = config.epochs
        self.batch_size = config.batch_size

        # Network to train
        self.model = model

        # Loss function
        self.loss_method = loss_method

        # Optimizer
        self.optimizer = optimizer

    def train(self, train_df, targets):
        """Train the model and score it on the training data.

        Args:
            train_df: DataFrame of input features, one row per sample.
            targets: Target values, one per row of ``train_df``; anything
                with a ``tolist()`` method (e.g. a pandas Series).

        Returns:
            A tuple ``(losses, predictions, accuracy)``: the mean batch loss
            per epoch, the model output for the whole training set after the
            last epoch, and the R2 score of those predictions.
        """
        train_tensor = torch.tensor(train_df.values).float()
        # One target per row, as an (n, 1) float tensor. Built directly from
        # the values so the method does not rely on any notebook global.
        targets_tensor = torch.tensor(targets.tolist()).float().reshape(-1, 1)

        print(f'Features tensor size is {train_tensor.size()}')
        print(f'Targets tensor size is {targets_tensor.size()}\n')

        train_dataset = TensorDataset(train_tensor, targets_tensor)
        # drop_last=True discards a final batch smaller than batch_size.
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True)

        losses = torch.zeros(self.epochs)
        for epochi in range(self.epochs):

            batch_losses = []

            # Loop over the training data batches
            for X, y in train_loader:

                # Forward step
                predictions = self.model(X)

                # Loss calculation
                loss = self.loss_method(predictions, y)
                batch_losses.append(loss.item())

                # Backward step
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # With fewer rows than batch_size there are no batches at all;
            # record NaN explicitly instead of averaging an empty list.
            losses[epochi] = np.mean(batch_losses) if batch_losses else float('nan')

        # Final evaluation pass only: no gradients are needed here.
        with torch.no_grad():
            predictions = self.model(train_tensor)

        r2s_metric = R2Score()
        r2s_metric.update(predictions, targets_tensor)
        accuracy = r2s_metric.compute()

        return losses, predictions, accuracy

class HousePricesNet(nn.Module):
    """Fully connected network assembled from a layer configuration.

    ``config.layers`` must provide ``get_input_layer()``,
    ``get_hidden_layers()`` and ``get_output_layer()``; every layer they
    return exposes ``input_size`` and ``output_size``.
    """

    def __init__(self, config):
        super().__init__()

        layers = config.layers

        first = layers.get_input_layer()
        self.input = nn.Linear(first.input_size, first.output_size)

        self.hidden = nn.ModuleList(
            [nn.Linear(item.input_size, item.output_size) for item in layers.get_hidden_layers()]
        )

        last = layers.get_output_layer()
        self.output = nn.Linear(last.input_size, last.output_size)

    def forward(self, x_train):
        """Apply the input and hidden layers with ReLU, then the output layer."""
        activations = F.relu(self.input(x_train))

        for hidden_layer in self.hidden:
            activations = F.relu(hidden_layer(activations))

        # No activation is applied to the output layer.
        return self.output(activations)

In [None]:
# Build the network, a mean-squared-error loss and an Adam optimizer over the model's parameters.
model = HousePricesNet(net_config)
loss_method = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=net_config.learning_rate)

In [None]:
# Train the network on every scaled feature column.
experiment = HousePricesExperiment(net_config, model, loss_method, optimizer)

# `accuracy` is the R2 score computed on the training data itself.
losses, predictions, accuracy = experiment.train(scalled_train_df, y_train)

print(accuracy)

print(f'Training accuracy is {accuracy}')
# Plot the mean batch loss per epoch.
plt.plot(losses.detach(), markerfacecolor='w', linewidth=2)
plt.xlabel('Epoch'), plt.ylabel('Loss')
plt.show()