# Preprocessing

First take a look at the data

In [18]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
import sklearn
import sklearn.compose
import sklearn.preprocessing
import sklearn.model_selection
from pandas.api.types import is_object_dtype

In [19]:
csv_data = pd.read_csv('immo_dev_data.csv')

# display data transposed, easier to see all columns
csv_data.head().T

Unnamed: 0,0,1,2,3,4
Id,7135329,7170979,7172246,7172252,7172733
AreaLiving,140,143,160,351,400
AreaProperty,501,277,712,496,1800
BuiltYear,2016,2004,1945,2016,1975
FloorNumber,,,,,
...,...,...,...,...,...
gde_workers_sector3,358,2787,1138,17,701
gde_workers_total,537,5041,2999,39,732
location_has_street,0,1,0,0,0
location_is_complete,0,1,0,0,0


# 

# One Hot Encoding

In [37]:
class preprocessor:
    def __init__(self, df, y_var, cols_to_drop = []):
        self.y = df[y_var].copy()
        cols_to_drop.append(y_var)
        print('Columns dropped to create X: ', cols_to_drop)
        if len(cols_to_drop) == 1:
            self.X = df.drop(columns = [y_var])
        else:
            self.X = df.drop(columns = cols_to_drop)
        self.enc = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
        self.scaler = sklearn.preprocessing.StandardScaler(with_std=False)
    

    def __fillna(self, _df):
        df = _df.copy()
        for col in df:
            #get dtype for column
            dt = df[col].dtype
            #check if it is a number
            if is_object_dtype(dt):
                df[col] = df[col].fillna("No Entry")
            else:
                df[col] = df[col].fillna(-1)
        return df


    def __fit_df(self):
        """
        This function fits (one hot encoding) the categorical columns if they are of type object and
        function fits (standardscaler) the numerical columns if they are of numbers (based on select_dtypes).
        Takes:
        - _df: pandas.DataFrame
        - enc: sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore') object

        returns:
        - encoded df as numpy.array
        """
        # Fit for onehot-encoder
        df_obj = self.X_train_df.select_dtypes(include = [object])
        self.enc.fit(df_obj)
        
        # Fit for standard scaler:
        df_num = self.X_train_df.select_dtypes(include = 'number')
        self.std = df_num.std(axis=0).fillna(1)
        self.mean = df_num.mean(axis=0).fillna(0)

    def __encode_transform_df(self, _df):
        """
        This function transforms the new df with the fitted encoder (one hot encoding).

        Takes:
        - _df: pandas.DataFrame
        - enc: sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore') object

        returns:
        - transformed df as numpy.array
        """

        df = _df.copy()
        df_obj = df.select_dtypes(include = [object])
        df.drop(columns = df_obj.columns, inplace=True)
        return np.concatenate((df.to_numpy(), self.enc.transform(df_obj).toarray()), axis = 1)
        

    def __standardise_df(self, _df):
        """
        This function transforms the new df with the fitted encoder (one hot encoding).

        Takes:
        - _df: pandas.DataFrame
        - enc: sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore') object

        returns:
        - transformed df as numpy.array
        """
        df = _df.copy()
        df_num = df.select_dtypes(include = 'number')
        df.drop(columns = df_num.columns, inplace=True)
        df_num = ((df_num - self.mean) / (self.std)).copy()
        df = df.join(df_num)
        return df


    def split_X_y(self, test_frac = 0.2):
        self.X_test_df = self.X.sample(frac = test_frac)
        self.X_train_df = self.X.drop(index = self.X_test_df.index, axis = 0)
        self.y_train_df = self.y.iloc[self.X_train_df.index]
        self.y_test_df = self.y.iloc[self.X_test_df.index]

    def encode_sample(self, _sample):
        sample = _sample.copy()
        sample = self.__standardise_df(sample)
        sample = self.__fillna(sample)
        sample = self.__encode_transform_df(sample)
        return sample

    def preprocess(self):
        self.split_X_y()
        self.__fit_df()
        self.X_train_df = self.__fillna(self.X_train_df).copy()
        self.X_train_df = self.__standardise_df(self.X_train_df).copy()
        self.X_train = self.__encode_transform_df(self.X_train_df).copy()
        self.X_test = self.encode_sample(self.X_test_df)

In [38]:
preprocessor = preprocessor(csv_data.head(5000), y_var='PurchasePrice', cols_to_drop=['StreetAndNr', 'Id'])

Columns dropped to create X:  ['StreetAndNr', 'Id', 'PurchasePrice']


In [39]:
preprocessor.preprocess()

In [42]:
preprocessor.X_train

array([[True, -0.6373587340482573, -0.44325402110089634, ..., 0.0, 0.0,
        0.0],
       [True, 2.3782998827452326, -0.45177935830980215, ..., 0.0, 0.0,
        0.0],
       [True, 1.8494877082364214, 0.7025512997760459, ..., 0.0, 0.0, 0.0],
       ...,
       [True, -0.9946642573650215, 0.023934457947142596, ..., 1.0, 0.0,
        0.0],
       [True, 0.47743449870004695, 0.1603398532896357, ..., 0.0, 0.0,
        0.0],
       [True, 1.5064744058523278, -0.26592700715565526, ..., 0.0, 0.0,
        0.0]], dtype=object)

In [9]:
np.isnan(preprocessor.X_train_df.iloc[1, 9])

True

In [10]:
csv_data.iloc[:5].fillna(555)

Unnamed: 0,Id,AreaLiving,AreaProperty,BuiltYear,FloorNumber,ForestDensityL,ForestDensityM,ForestDensityS,GroupNameDe,HouseObject,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,location_has_street,location_is_complete,PurchasePrice
0,7135329,140.0,501.0,2016,555.0,0.418964,0.555985,0.730714,Haus,True,...,1358.0,3.660512,8.73,17.0,162.0,358.0,537.0,0,0,745000.0
1,7170979,143.0,277.0,2004,555.0,0.033259,0.074061,0.076468,Haus,True,...,3476.0,3.634717,6.13,0.0,2250.0,2787.0,5041.0,1,1,780000.0
2,7172246,160.0,712.0,1945,555.0,0.0,0.0,0.0,Haus,True,...,2806.0,2.512344,9.79,167.0,1694.0,1138.0,2999.0,0,0,570000.0
3,7172252,351.0,496.0,2016,555.0,0.037575,0.0,0.0,Haus,True,...,131.0,1.734104,9.15,12.0,10.0,17.0,39.0,0,0,920000.0
4,7172733,400.0,1800.0,1975,555.0,0.095162,0.097193,0.153314,Haus,True,...,1181.0,1.056052,2.97,0.0,27.0,701.0,732.0,0,0,3950000.0
