## Load Packages

In [None]:
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

from sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer,  TransformedTargetRegressor

from env import *
from plotting.unique import *
from plotting.missing import *
from processing.data import Data

# Show every column when a DataFrame is displayed (None removes the column cap).
# Uses the public pd.set_option entry point rather than the incidental
# `pd.pandas` attribute, which is not part of the documented API.
pd.set_option('display.max_columns', None)

## Import Data

In [None]:

# Load the training and test splits the same way, indexed by the id column.
train, test = (
    Data.from_csv(filepath=csv_path, index_col=COL_ID)
    for csv_path in (DIR_DATA_TRAIN, DIR_DATA_TEST)
)

# print() shows str(train); the bare repr() expression is left last so a
# notebook renders it as the cell output.
print(train)
repr(train)

In [None]:
# Make the column names more human-readable: apply each (pattern, replacement)
# pair from REGEX_REPL_COLUMN in order, then lower-case the result.
for regex, repl in REGEX_REPL_COLUMN:
    train.columns = train.columns.map(
        lambda name: re.sub(regex, repl, name)
    )

train.columns = train.columns.map(str.lower)

# Register the target column on the Data wrapper.
train.col_target = COL_TARGET

print('Columns of training data:', train.columns.tolist())

## Variable Types

In [None]:
# Print the column-type overview provided by the project's Data wrapper
# (output format is defined in processing.data, not visible here).
train.print_column_types()

In [None]:
# pandas summary of the underlying frame: dtypes, non-null counts, memory use.
train.df.info()

## Unique Values

In [None]:
# Unique-value counts restricted to the numeric (float64 / int64) columns.
plot_count_unique(
    train.df,
    title='Count of unique values - numerical fields',
    dtype_include=['float64', 'int64'],
)

In [None]:
# Unique-value counts restricted to the object-dtype (categorical) columns.
plot_count_unique(
    train.df,
    title='Count of unique values - categorical fields',
    dtype_include=['O'],
)

In [None]:
# Drop constant columns via the Data wrapper.
# NOTE(review): presumably "constant" means a single distinct value; confirm
# how NaN is counted in processing.data.
train.remove_constant_columns()

### Aggregate low count categorical features to 'other' category

In [None]:

# TODO Add unknown category for never-seen-before categories
# TODO Aggregate low count categorical features to 'other' category

# NOTE(review): presumably the relative-frequency cut-off for the aggregation
# TODO above; it is not used anywhere in this cell yet.
CATEGORICAL_AGGREGATION_THRESHOLD = 0.01

col = 'exterior2nd'

# Share of rows per category: counts are computed once, then divided by their
# total. Left as the last expression so the notebook displays it.
category_counts = train.df[col].value_counts()
category_counts / category_counts.sum()

## Missing Values

In [None]:
# Boolean mask over columns: True where the column holds at least one null.
# Vectorised replacement for the per-column apply/lambda.
col_missing_bool = train.df.isnull().any()
# Names of the affected columns, in frame order.
col_missing_desc = train.df.columns[col_missing_bool].tolist()

print('Columns with missing values:')
print("\n".join(col_missing_desc))

In [None]:
sns.set_theme(style="ticks", rc=custom_params)

# Plot percentage of missing data, one chart per dtype family. The groups stay
# lists because the list itself is interpolated into the chart title.
dtype_groups = (['object'], ['int64', 'float64'])
for dtypes in dtype_groups:
    plot_perc_missing(
        train.df,
        title=f'% of Missing Values - {dtypes} Columns',
        dtype_include=dtypes,
    )

In [None]:
# Drop columns via the Data wrapper using MISSING_THRESHOLD_DROP.
# NOTE(review): presumably the threshold is a missing-value fraction; confirm
# in processing.data.
train.remove_missing_columns(threshold=MISSING_THRESHOLD_DROP)

# Recompute the mask and name list, since the frame's columns have changed.
# Vectorised replacement for the per-column apply/lambda.
col_missing_bool = train.df.isnull().any()
col_missing_desc = train.df.columns[col_missing_bool].tolist()

In [None]:

# TODO Histogram of target feature based on missingness

# One box plot per column that still contains nulls, comparing the target
# between rows where the value is missing and rows where it is present.
# The target is taken from COL_TARGET (as in the rest of the notebook) rather
# than a hard-coded column name.
for col in col_missing_desc:
    boxplot_target_missingness_relationship(
        df=train.df,
        col_var=col,
        col_target=COL_TARGET,
    )

In [None]:
# Add missing-value flags via the Data wrapper.
# NOTE(review): judging by the argument names, a flag is presumably added only
# when a t-test on the target (missing vs. present) is below the p-value
# threshold and enough samples are available; confirm in processing.data.
train.add_flag_missing_values(
    ttest_threshold=FLAG_MISSING_PVALUE_TRESHOLD,
    ttest_min_samples=FLAG_MISSING_MIN_SAMPLES
    )

## Create features from time data

In [None]:
# Most recent of the construction year and the remodel year, per row.
# DataFrame.max(axis=1) is the vectorised form of apply(np.max, axis=1) and
# avoids passing a NumPy callable to apply.
train.create_column(
    col_name='year_built_or_remod',
    values=train.df[['year_remod_add', 'year_built']].max(axis=1),
)

def x(a, b):
    """Return ``a - b`` floored at zero.

    Works element-wise, so ``a`` and ``b`` may be scalars, NumPy arrays or
    pandas Series (normal broadcasting rules apply).

    Args:
        a: Minuend (e.g. the year of sale).
        b: Subtrahend (e.g. the year of construction).

    Returns:
        ``a - b`` where that is positive, otherwise 0. NaN in either input
        propagates to the result.
    """
    # np.maximum is the element-wise counterpart of np.max([a - b, 0]).
    return np.maximum(a - b, 0)

# Calculate age when sold: sale year minus reference year, floored at zero so
# a reference year later than the sale year yields 0 instead of a negative age.
# Done column-wise (subtract, then clip) instead of a row-by-row apply; rows
# with a missing reference year stay NaN.
train.create_column(
    col_name='age_garage_when_sold',
    values=(train.df['year_sold'] - train.df['garage_year_built']).clip(lower=0),
)

train.create_column(
    col_name='age_house_when_sold',
    values=(train.df['year_sold'] - train.df['year_built_or_remod']).clip(lower=0),
)

# The raw year columns are no longer needed once the ages exist. The negative
# lookahead spares names where 'year' is followed by text ending in 'missing'.
# NOTE(review): whether the pattern is searched or anchored is decided inside
# drop_columns_regex; confirm there.
train.drop_columns_regex(regex='year(?!.+missing$)')

## Discrete Features

In [None]:
# ms_sub_class is a categorical feature, so cast it to 'object' to keep it
# out of the numeric columns analysed below.
# NOTE(review): assumes num_columns is derived from dtypes; confirm.
train.change_column_types({'ms_sub_class': 'object'})

In [None]:
# Re-print the column-type overview after the dtype change above.
train.print_column_types()

In [None]:
# Split the numeric columns in a single pass: fewer than 20 distinct values
# (NaN counted as a value) means discrete, anything else continuous.
discrete_cols, continuous_cols = [], []
for num_col in train.num_columns:
    n_distinct = train.df[num_col].nunique(dropna=False)
    if n_distinct < 20:
        discrete_cols.append(num_col)
    else:
        continuous_cols.append(num_col)

# Preview the discrete features.
train.df[discrete_cols].head(10)

In [None]:
    
# The theme and the target-axis label do not depend on the column being
# plotted, so they are set once instead of on every iteration.
sns.set_theme(
    style="ticks",
    palette=sns.color_palette("Reds"),
    rc=custom_params)
target_label = COL_TARGET.replace('_', ' ').upper()

for col in discrete_cols:

    # catplot opens a new figure with the box plot; stripplot then overlays
    # the individual observations on the current axes.
    sns.catplot(data=train.df, x=col, y=COL_TARGET, kind='box', height=4.5, aspect=1.7)
    sns.stripplot(data=train.df, x=col, y=COL_TARGET, jitter=0.1, alpha=0.1, color='k')
    plt.title(f"Distribution of '{COL_TARGET}' by '{col}'")
    plt.ylabel(target_label)
    plt.xlabel(col.replace('_', ' ').upper())
    plt.show()

## Continuous Numerical Features

In [None]:
# Preview the first rows (pandas default: 5) of the continuous features.
train.df[continuous_cols].head()

In [None]:
sns.set_theme(style="white", palette=sns.color_palette("Set2"), rc=custom_params)

# Histogram grid of the raw continuous features (one subplot per column).
train.df[continuous_cols].hist(bins=15, figsize=(15, 15))
plt.suptitle('No Transform')
plt.show()

In [None]:
sns.set_theme(style="white", palette=sns.color_palette("Set2"), rc=custom_params)

# Same histogram grid after log10(x + 1); the +1 shift keeps zeros finite.
log_scaled = np.log10(train.df[continuous_cols] + 1)
log_scaled.hist(bins=15, figsize=(15, 15))
plt.suptitle('log10 Transform')
plt.show()