# WEC - preprocessing pipeline

## Some useful libraries

In [None]:
import numpy as np
import pandas as pd

# VISUALIZATIONS
import seaborn as sns
import matplotlib.pyplot as plt

#SCIPY
from scipy import stats

# STATSMODELS
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

# SKLEARN
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    StandardScaler,
    PowerTransformer,
    RobustScaler,
    MinMaxScaler
)
from sklearn.feature_selection import (
    SequentialFeatureSelector,
    SelectKBest,
    f_classif
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
)
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    average_precision_score,
    RocCurveDisplay,
    PrecisionRecallDisplay,
    confusion_matrix,
    ConfusionMatrixDisplay
)

# GRADIENT BOOSTING
import xgboost as xgb
import catboost as ctb

# CATEGORY ENCODERS
from category_encoders import (
    CatBoostEncoder,
    WOEEncoder,
    OneHotEncoder,
    OrdinalEncoder
)

# HYPERPARAMETER OPTIMIZATION
import optuna as opt

# EXPLAINABLE AI WITH SHAPLEY VALUES
import shap

## Downcasting dataframe

In [None]:
def downcast_dataframe(data):
    """Return a copy of ``data`` with columns cast to smaller dtypes.

    Memory reports are printed before and after the conversion.

    * Integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's minimum and maximum (limits included).
    * Float columns are cast to float32 when their range fits, otherwise
      float64. Casting float64 to float32 reduces precision.
    * Object columns become ``category``, except a column named ``"date"``,
      which is parsed with the ``%Y-%m-%d`` format.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied and never modified.

    Returns
    -------
    pandas.DataFrame
        The downcast copy.
    """
    # Source: https://www.kaggle.com/anshuls235/time-series-forecasting-eda-fe-modelling
    df = data.copy()

    print("BEFORE downcast")
    # DataFrame.info() writes the report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    df.info(memory_usage="deep")
    print("=================")
    print("=================", "\n")

    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)

        # Integer
        # NOTE: unsigned dtypes ("uint8", ...) also contain "int" and are
        # therefore cast to signed types here.
        if "int" in dtype_name:
            col_min, col_max = df[col].min(), df[col].max()
            target = np.int64
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if col_min >= limits.min and col_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        # Float
        elif "float" in dtype_name:
            limits = np.finfo(np.float32)
            # An all-NaN or infinite range fails both comparisons and the
            # column is kept as float64.
            if df[col].min() >= limits.min and df[col].max() <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # Object
        elif dtype == object:
            if col == "date":
                df[col] = pd.to_datetime(df[col], format="%Y-%m-%d")
            else:
                df[col] = df[col].astype("category")

    print("AFTER downcast")
    df.info(memory_usage="deep")
    return df

## EDA

Useful pandas functions to remember:
* dataset.describe()
* dataset.info()

In [None]:
def check_for_missings(df):
    """Print the number of missing (NaN/None) values in each column of ``df``."""
    missing_per_column = df.isna().sum()
    print("\nMissing values in data:")
    print(missing_per_column)

## Functions for preprocessing

### Feature selection

#### Tests for significance of the relationship and correlation

* Nominal - Nominal

Null hypothesis: the two variables are independent (there is no association between them).

In [None]:
def chi2_test(df, feature1, feature2):
    """Print the chi-square contingency test result for two columns of ``df``.

    A contingency table of ``feature1`` (rows) against ``feature2`` (columns)
    is built and passed to ``scipy.stats.chi2_contingency``; the returned
    result object is printed as-is.
    """
    observed = pd.crosstab(df[feature1], df[feature2])
    print(f"\nChi Square Test result between {feature1} and {feature2} variables.")
    outcome = stats.chi2_contingency(observed)
    print(outcome)

* Continuous - binary

Null hypothesis: the correlation between the two variables is zero.

In [None]:
def point_biserial_corr(df, feature1, feature2):
    """Print the point-biserial correlation between two columns of ``df``.

    The columns are passed to ``scipy.stats.pointbiserialr`` in the order
    ``feature1``, ``feature2`` and the returned result object is printed.
    NOTE(review): scipy documents the first argument as the binary variable —
    confirm callers pass the binary column as ``feature1``.
    """
    header = f"\nPoint Biserial correlation between {feature1} and {feature2} variables:"
    print(header)
    first, second = df[feature1], df[feature2]
    print(stats.pointbiserialr(first, second))

* Continuous - continuous

In [None]:
def corr_heatmap(df, method='spearman'):
    """Plot an annotated heatmap of pairwise correlations between columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric and boolean columns are correlated.
    method : str, default 'spearman'
        Correlation method forwarded to ``DataFrame.corr``.
    """
    # Restrict to numeric/bool columns: DataFrame.corr raises on category or
    # object columns in pandas >= 2.0, where they are no longer dropped
    # silently.
    numeric = df.select_dtypes(include=["number", "bool"])
    corr = numeric.corr(method=method)

    plt.figure(figsize=(10, 10))
    # sns.set changes the global seaborn/matplotlib theme, not just this figure.
    sns.set(font_scale=1.25)
    sns.heatmap(
        corr, linewidths=1.5, annot=True, square=True, fmt=".2f", annot_kws={"size": 10}
    )
    plt.show()

### Variance analysis

* Low variance indicates non-informative features

In [None]:
def var_analysis(df, features_to_drop):
    """Display, per feature, the share of its most frequent value and its variance.

    ``features_to_drop`` is removed from ``df`` first. For every remaining
    column the table shows the percentage of rows taken by the most common
    value (rounded to two decimals) and the column variance. Rows are sorted
    by that percentage in descending order and shaded with a red gradient.
    """
    # source: https://www.kaggle.com/code/raphael2711/catboost-pipeline-nested-crossvalidation-optuna
    X = df.drop(columns=features_to_drop)

    rows = [
        (
            name,
            np.round(X[name].value_counts(normalize=True).iloc[0] * 100, 2),
            X[name].var(),
        )
        for name in X
    ]

    summary = pd.DataFrame(rows, columns=['Feature', 'Count%', 'Variance'])
    summary = summary.sort_values(by='Count%', ascending=False)
    styled = summary.style.background_gradient(cmap='Reds', axis=0, subset='Count%')
    display(styled)

### Multicollinearity analysis

* Variables that are highly correlated with other features (high VIF) are candidates for removal

In [None]:
def vif_analysis(dataset, features_to_drop):
    """Compute the variance inflation factor (VIF) of every remaining feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input data. Its values are passed to statsmodels as one array after
        ``features_to_drop`` is removed.
    features_to_drop : label or list of labels
        Column(s) excluded from the analysis (e.g. the target).

    Returns
    -------
    pandas.io.formats.style.Styler
        Table with ``Feature`` and ``VIF`` columns, sorted by VIF in
        descending order and shaded with a red gradient. Returning it lets a
        notebook render it; previously the styled table was built and thrown
        away.
    """
    # source: https://www.kaggle.com/code/raphael2711/catboost-pipeline-nested-crossvalidation-optuna
    # Pass the argument straight through so both a single label and a list of
    # labels work, matching var_analysis.
    predictors = dataset.drop(columns=features_to_drop)
    design = predictors.values

    vif_data = pd.DataFrame()
    vif_data["Feature"] = predictors.columns
    # calculating VIF for each feature (the API takes the column index)
    vif_data["VIF"] = [
        variance_inflation_factor(design, i) for i in range(design.shape[1])
    ]
    vif_data = vif_data.sort_values(by='VIF', ascending=False)
    return vif_data.style.background_gradient(cmap='Reds', axis=0)

### ANOVA

* SelectKBest(f_classif) for ANOVA feature selection
* Use it as a step in sklearn Pipeline

## Visualizations

In [None]:
def confusion_matrix(y_hat, preds, labels):
    """Compute and plot a confusion matrix.

    Parameters
    ----------
    y_hat : array-like
        Passed as the first positional argument of sklearn's
        ``confusion_matrix``, which sklearn documents as the true labels.
    preds : array-like
        Passed as the second positional argument (the predicted labels).
    labels : list
        Class labels; used both to order the matrix and as display labels.
    """
    # This function has the same name as sklearn's ``confusion_matrix`` and
    # shadows the module-level import, so the original call recursed into
    # itself. Import the sklearn function under an alias instead.
    from sklearn.metrics import confusion_matrix as sk_confusion_matrix

    conf_matrix = sk_confusion_matrix(y_hat, preds, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=labels)
    disp.plot()
    plt.show()

## Models

### Gradient boosting models

* XGBoost