# Import Libraries

In [75]:
import pandas as pd
import numpy as np
import pathlib
import sklearn
from xgboost import XGBRegressor
import os

from typing import Union
from model import *
from model.config import config
from sklearn.model_selection import train_test_split
from model.processing import preprocessors as pp

from sklearn.pipeline import Pipeline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Define Variables

In [13]:
# Base directory of the model package; the locations below are derived from it.
PATH = '/mnt/data/study_path/kaggle/house_price_production/model/'
SOURCE_PATH = pathlib.Path(PATH)

# Sub-directories located directly under SOURCE_PATH.
DATASET_PATH = SOURCE_PATH.joinpath("datasets")
TRAINED_MODEL_PATH = SOURCE_PATH.joinpath("trained_models")

# File names of the two CSV inputs and the name of the target column.
TRAIN_FILE, TEST_FILE = "train.csv", "test.csv"
TARGET = "SalePrice"

# Data Management Functions

In [17]:
def load_dataset(*, file_name: str) -> pd.DataFrame:
    """Read a CSV file from the configured dataset directory.

    Args:
        file_name: Name of the file; it is joined onto ``config.DATASET_PATH``.

    Returns:
        The parsed file contents as a DataFrame.
    """
    csv_path = config.DATASET_PATH / file_name
    return pd.read_csv(str(csv_path))

# Processor Functions

## PreProcessor

In [63]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class LabelEncodeCatVar(BaseEstimator, TransformerMixin):
    '''
    Turn the selected columns into ordered categoricals with a fixed category set.

    ``fit`` records the categories seen for every variable that has no entry in
    ``cat_dict`` yet; ``transform`` re-applies the recorded categories, so the
    same category list is used on every frame that is transformed later.

    Parameters
    ----------
    variables : a column name or a list of column names to encode.
    cat_dict : optional mapping ``{column: categories}`` with categories that
        are already known. Entries found here are NOT re-learned by ``fit``.
        The mapping is stored as-is and filled in place by ``fit``.
    '''
    def __init__(self, variables:Union[list,str] = None, cat_dict:Union[dict,None] = None) -> None:
        # The default used to be a literal ``{}``. Python creates such a default
        # once and shares it between all instances, so categories learned by one
        # encoder silently suppressed fitting in every encoder created after it.
        self.cat_dict = {} if cat_dict is None else cat_dict
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> "LabelEncodeCatVar":
        '''
        Record the categories of each variable that is not in ``cat_dict`` yet.

        ``y`` is ignored; it is accepted because a pipeline that is fitted with
        a target passes it to every step.
        '''
        for variable in self.variables:
            if variable not in self.cat_dict:
                cat_var = X[variable].astype("category").cat.as_ordered()
                self.cat_dict[variable] = cat_var.cat.categories
        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        '''
        Return a copy of ``X`` whose variables use the recorded categories.
        '''
        X = X.copy()
        for variable in self.variables:
            cat_var = X[variable].astype("category").cat.as_ordered()
            # Re-using the stored categories keeps the encoding identical
            # between the frame seen in ``fit`` and any later frame.
            X[variable] = cat_var.cat.set_categories(self.cat_dict[variable], ordered = True)
        return X

class Numericalize(BaseEstimator, TransformerMixin):
    '''
    Replace categorical columns by their integer category codes, shifted by one.

    The columns must already have a categorical dtype, because the codes are
    read through the ``.cat`` accessor.
    '''
    def __init__(self, variables:Union[str, list] = None) -> None:
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X:pd.DataFrame, y:pd.Series = None) -> "Numericalize":
        '''
        Nothing is learned. ``y`` is ignored; it is accepted because a pipeline
        that is fitted with a target passes it to every step, and the other
        transformers in this file take it as well.
        '''
        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        '''
        Return a copy of ``X`` with each variable replaced by ``codes + 1``.
        '''
        X = X.copy()
        for variable in self.variables:
            # pandas reports a missing value as code -1, so after the shift
            # missing values are 0 and real categories start at 1.
            X[variable]  = X[variable].cat.codes + 1
        return X
    
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of the given columns with the label "Missing"."""

    def __init__(self, variables=None) -> None:
        # Always keep a list, so a single column name can be passed bare.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CategoricalImputer":
        """Learn nothing; the method exists to satisfy the sklearn pipeline API."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the gaps in the configured columns filled."""
        filled = X.copy()
        for column in self.variables:
            filled[column] = filled[column].fillna("Missing")
        return filled


class NumericalImputer(BaseEstimator, TransformerMixin):
    """Numerical missing value imputer.

    Missing values of each configured column are replaced by the value that
    ``fit`` found to be the most frequent one in that column.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Store, per column, the first entry of ``mode()`` as the fill value.

        ``y`` is ignored and only accepted for pipeline compatibility.
        """
        # persist mode in a dictionary
        self.imputer_dict_ = {}
        for feature in self.variables:
            self.imputer_dict_[feature] = X[feature].mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the stored fill values applied."""
        X = X.copy()
        for feature in self.variables:
            # Assign the result back instead of ``X[feature].fillna(...,
            # inplace=True)``: the in-place call works on a column selection,
            # which pandas warns about and, with Copy-on-Write enabled, no
            # longer propagates to ``X`` - the gaps would stay unfilled.
            X[feature] = X[feature].fillna(self.imputer_dict_[feature])
        return X

class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Rare label categorical encoder.

    Labels whose share of the rows seen in ``fit`` is below ``tol`` are
    replaced by the single label "Rare" in ``transform``.
    """

    def __init__(self, tol=0.05, variables=None):
        self.tol = tol
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Store, per column, the labels whose frequency is at least ``tol``.

        ``y`` is ignored and only accepted for pipeline compatibility.
        """
        # persist frequent labels in dictionary
        self.encoder_dict_ = {}

        for var in self.variables:
            # Share of ALL rows (missing ones included in the denominator)
            # taken by each label. ``np.float`` was only an alias of the
            # builtin ``float`` and has been removed from NumPy, so the
            # builtin is used directly.
            t = pd.Series(X[var].value_counts() / float(len(X)))
            # frequent labels:
            self.encoder_dict_[var] = list(t[t >= self.tol].index)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every non-frequent label set to "Rare"."""
        X = X.copy()
        for feature in self.variables:
            # Anything not in the stored list is replaced, including labels
            # that were never seen during ``fit``.
            X[feature] = np.where(
                X[feature].isin(self.encoder_dict_[feature]), X[feature], "Rare"
            )

        return X
    
class DropUnecessaryFeatures(BaseEstimator, TransformerMixin):
    """Remove a fixed set of columns from a dataframe."""

    def __init__(self, variables_to_drop=None):
        self.variables = variables_to_drop

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns."""
        trimmed = X.copy()
        return trimmed.drop(columns=self.variables)

## Feature Transformer

In [None]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Logarithm transformer.

    Replaces each configured column by its natural logarithm and refuses
    input in which any of those columns holds a value that is not > 0.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Nothing to learn; present to accommodate the pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``np.log`` applied to the variables.

        Raises:
            InvalidModelInputError: if any configured column contains a value
                that fails the ``> 0`` check.
        """
        X = X.copy()

        # check that the values are strictly positive for log transform
        if not (X[self.variables] > 0).all().all():
            # ``self.variables`` is a plain list and cannot be indexed with a
            # boolean Series (that raised TypeError and hid the real error),
            # so the offending columns are collected one by one. The same
            # ``> 0`` test as above is used, so the list is never empty here.
            vars_ = [var for var in self.variables if not (X[var] > 0).all()]
            # NOTE(review): InvalidModelInputError is not defined in the visible
            # cells; presumably it comes from ``from model import *`` - confirm.
            raise InvalidModelInputError(
                f"Variables contain zero or negative values, "
                f"can't apply log for vars: {vars_}"
            )

        for feature in self.variables:
            X[feature] = np.log(X[feature])

        return X

# Train Model

## Get Data

In [50]:
# Load the file named by config.TRAIN_FILE through load_dataset.
data = load_dataset(file_name= config.TRAIN_FILE)

In [59]:
# Columns selected from `data` as model inputs, taken from the project config.
FEATURES = config.FEATURES

In [60]:
# Split inputs and target together, holding out 10% of the rows as the test
# part; the random_state value comes from the project config.
X_train, X_test, y_train, y_test = train_test_split(
    data[FEATURES],
    data[config.TARGET],
    test_size=0.1,
    random_state=config.RANDOM_STATE,
)

In [61]:
# Put the target on a log scale; both splits get the same transform.
y_train, y_test = np.log(y_train), np.log(y_test)

## Define Pipeline

In [87]:
# pipeline = Pipeline(
#     [
#         ("categorical Imputer", pp.CategoricalImputer(variables= config.CATEGORICAL_VARS)),
#         ("numerical Imputer", pp.NumericalImputer(variables= config.NUMERICAL_VARS)),
#         ("rare_label_encoder Imputer", pp.RareLabelCategoricalEncoder(variables= config.CATEGORICAL_VARS)),
#         ("Label_Encoder", pp.LabelEncodeCatVar(variables=config.CATEGORICAL_VARS)),
#         ("Numericalize", pp.Numericalize(variables= config.CATEGORICAL_VARS)),
#         ("DropFeatures", pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES))
#     ]
# )



Pipeline(memory=None,
         steps=[('categorical Imputer',
                 CategoricalImputer(variables=['MSZoning', 'Street', 'Alley',
                                               'LotShape', 'LandContour',
                                               'Utilities', 'LotConfig',
                                               'LandSlope', 'Neighborhood',
                                               'Condition1', 'Condition2',
                                               'BldgType', 'HouseStyle',
                                               'RoofStyle', 'RoofMatl',
                                               'Exterior1st', 'Exterior2nd',
                                               'MasVnrType', 'ExterQual',
                                               'ExterCond', 'Foundation',
                                               'BsmtQual', 'Bsm...
                                          'Neighborhood', 'Condition1',
                                          'Condition2