# Import Libraries

## setting path to project

In [21]:
import sys
# Absolute location of the project checkout on this machine.
project_path = "/mnt/data/study_path/kaggle/house_price_production"

# Prepend the project root to the import search path, but only once, so
# re-running this cell does not keep adding duplicate entries.
if sys.path.count(project_path) == 0:
    sys.path.insert(0, project_path)
    

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## importing libraries

In [48]:
from model.config import config
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin

from typing import Union


In [44]:
import numpy as np
# Display the installed NumPy version as the cell output.
np.__version__

'1.18.4'

# Load Data

## Functions

In [34]:
def load_dataset(*, file_name: str) -> pd.DataFrame:
    """Read a CSV file from the configured dataset directory.

    Args:
        file_name: Name of the CSV file (keyword-only). It is joined onto
            ``config.DATASET_PATH`` with the ``/`` operator.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the joined path.
    """
    csv_path = config.DATASET_PATH / file_name
    return pd.read_csv(f"{csv_path}")

## loading data

In [37]:
# Read the CSV files named by config.TRAIN_FILE and config.TEST_FILE into
# DataFrames (each call goes through load_dataset, i.e. pd.read_csv).
train_data = load_dataset(file_name = config.TRAIN_FILE)
test_data = load_dataset(file_name = config.TEST_FILE)

# Transform Data

## Functions

In [52]:
## Log-transform the selected variables (e.g. the target variable)
class LogTransformVar(BaseEstimator, TransformerMixin):
    """Replace the selected DataFrame columns with their natural logarithm.

    Args:
        variables: Column name, or list of column names, to transform. Any
            non-list value is wrapped in a one-element list.
        shift: What to do when a column's minimum is not strictly positive.
            If truthy, the column is shifted so that its minimum becomes 1
            before the logarithm is taken; otherwise ``transform`` raises
            ``ValueError``.
    """

    def __init__(self, variables: Union[list, str, None] = None, shift: bool = False) -> None:
        self.variables = variables if isinstance(variables, list) else [variables]
        self.shift = shift

    def fit(self, X: pd.DataFrame, y=None) -> "LogTransformVar":
        """Return ``self`` unchanged; this transformer learns nothing.

        ``y`` is accepted and ignored so that the estimator can be called as
        ``fit(X, y)``, which is how scikit-learn pipelines and
        ``fit_transform(X, y)`` invoke it.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with each selected column log-transformed.

        Args:
            X: Input DataFrame; it is copied, not modified.

        Returns:
            The copy, with ``np.log`` applied to every column in
            ``self.variables``.

        Raises:
            ValueError: If a selected column's minimum is not strictly
                positive and ``shift`` is falsy.
        """
        X = X.copy()

        # First pass: validate (and optionally shift) every column before any
        # logarithm is taken.
        for variable in self.variables:
            min_value = X[variable].min()
            if min_value > 0:
                continue
            if not self.shift:
                raise ValueError(
                    "Cannot take logarithm for Variable containing 0 or negative values; "
                    f"found non positive value in Variable: {variable}"
                )
            # min_value is <= 0 here, so it must be *subtracted* to move the
            # smallest value up to exactly 1 (adding it would push the column
            # further into the non-positive range).
            X[variable] = X[variable] - min_value + 1

        # Second pass: every selected column is now safe to log-transform.
        for variable in self.variables:
            X[variable] = np.log(X[variable])
        return X

In [72]:
class LabelEncodeCatVar(BaseEstimator, TransformerMixin):
    """Convert columns to ordered categoricals with a fixed set of categories.

    Args:
        variables: Column name, or list of column names, to convert. Any
            non-list value is wrapped in a one-element list.
        cat_dict: Optional mapping of column name to the categories to use
            for that column. The supplied dict is stored as-is (not copied)
            and ``fit`` adds entries to it for columns it does not yet
            contain. When omitted, each instance gets its own empty dict.
    """

    def __init__(self, variables: Union[list, str] = None,
                 cat_dict: Union[dict, None] = None) -> None:
        # A mutable ``{}`` default would be shared by every instance, so the
        # categories learned by one encoder would leak into all the others.
        self.cat_dict = {} if cat_dict is None else cat_dict
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X: pd.DataFrame, y=None) -> "LabelEncodeCatVar":
        """Record the categories of each selected column found in ``X``.

        Columns that already have an entry in ``cat_dict`` are left alone, so
        categories supplied at construction (or learned by an earlier fit)
        are kept. ``y`` is accepted and ignored so that the estimator can be
        called as ``fit(X, y)``, as scikit-learn pipelines do.
        """
        for variable in self.variables:
            if variable not in self.cat_dict:
                self.cat_dict[variable] = X[variable].astype("category").cat.categories
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the selected columns as ordered categoricals.

        Each selected column is cast to the categories stored in ``cat_dict``
        for it; per pandas' ``set_categories``, values outside that set
        become missing.

        Raises:
            KeyError: If a selected column has no entry in ``cat_dict``
                (for example when ``fit`` has not been called).
        """
        X = X.copy()
        for variable in self.variables:
            as_category = X[variable].astype("category")
            X[variable] = as_category.cat.set_categories(self.cat_dict[variable], ordered=True)
        return X

In [73]:
class Numericalize(BaseEstimator, TransformerMixin):
    """Replace categorical columns with their integer category codes plus one.

    Args:
        variables: Column name, or list of column names, to convert. Any
            non-list value is wrapped in a one-element list. The columns must
            already have a categorical dtype, because the ``.cat`` accessor
            is used on them.
    """

    def __init__(self, variables: Union[str, list] = None) -> None:
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X: pd.DataFrame, y=None) -> "Numericalize":
        """Return ``self`` unchanged; this transformer learns nothing.

        ``y`` is accepted and ignored so that the estimator can be called as
        ``fit(X, y)``, which is how scikit-learn pipelines and
        ``fit_transform(X, y)`` invoke it.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the selected columns as ``codes + 1``.

        pandas encodes a missing value as code ``-1``, so after the ``+ 1``
        missing values map to 0 and the categories map to 1, 2, ...
        """
        X = X.copy()
        for variable in self.variables:
            X[variable] = X[variable].cat.codes + 1
        return X

In [62]:
# Log transformer for the column(s) named by config.TARGET. shift=0 is falsy,
# so a non-positive minimum raises ValueError instead of being shifted.
ltv = LogTransformVar(shift = 0, variables= config.TARGET)

In [65]:
# Transform a copy of the training data (train_data itself is not modified)
# and display the resulting config.TARGET column.
ltv.transform(train_data)[config.TARGET]

0       12.247694
1       12.109011
2       12.317167
3       11.849398
4       12.429216
          ...    
1455    12.072541
1456    12.254863
1457    12.493130
1458    11.864462
1459    11.901583
Name: SalePrice, Length: 1460, dtype: float64