# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **_Your Name_** | _Your Email_ | **_Project Name_** |

# II. Notebook Target Definition

_Insert Text Here_

# III. Notebook Setup

## III.A. Import Libraries

In [None]:
from optbinning import BinningProcess
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## III.B. Import Data

In [None]:
X_train = pd.read_pickle('../../data/processed/X_train.pkl')
X_test = pd.read_pickle('../../data/processed/X_test.pkl')
y_train = pd.read_pickle('../../data/processed/y_train.pkl')
y_test = pd.read_pickle('../../data/processed/y_test.pkl')

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train.head()

In [None]:
y_test.head()

# IV. Feature Engineering

## IV.A. Data Shape Inspection

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train.shape, y_test.shape

## IV.B. Data Information Inspection

In [None]:
X_train.info()

In [None]:
X_test.info()

In [None]:
y_train.info()

In [None]:
y_test.info()

## IV.C. Unused Feature Removal

In [None]:
def unused_feat_removal(df, feature_to_remove):
    """Drop the given column(s) from ``df`` in place and return ``df``.

    Args:
        df: DataFrame to modify. It is mutated, not copied.
        feature_to_remove: A column label or a list of column labels to drop.

    Returns:
        The same ``df`` object, now without the dropped columns.
    """
    df.drop(labels=feature_to_remove, axis="columns", inplace=True)
    return df

In [None]:
feature_to_remove = ["column_0", "column_1"]

In [None]:
unused_feat_removal(X_train, feature_to_remove)
unused_feat_removal(X_test, feature_to_remove)
X_train.shape, X_test.shape

In [None]:
X_train.head()

In [None]:
X_test.head()

## IV.D. Feature Scaling

In [None]:
# Min Max Scaler
def minmax_scaler(df_train, df_test, feature_to_scale):
    """Scale the selected columns with a ``MinMaxScaler`` fitted on train only.

    The scaler is fitted on ``df_train[feature_to_scale]`` and then applied to
    the same columns of both frames, so the test data never influences the
    fitted parameters. Both frames are modified in place.

    Args:
        df_train: Training DataFrame; its selected columns are overwritten.
        df_test: Test DataFrame; its selected columns are overwritten.
        feature_to_scale: List of column labels to scale.

    Returns:
        The tuple ``(df_train, df_test)`` (the same objects that were passed).
    """
    fitted_scaler = MinMaxScaler().fit(df_train[feature_to_scale])
    for frame in (df_train, df_test):
        frame[feature_to_scale] = fitted_scaler.transform(frame[feature_to_scale])
    return df_train, df_test


# Standard Scaler
def standard_scaler(df_train, df_test, feature_to_scale):
    """Scale the selected columns with a ``StandardScaler`` fitted on train only.

    The scaler is fitted on ``df_train[feature_to_scale]`` and then applied to
    the same columns of both frames, so the test data never influences the
    fitted parameters. Both frames are modified in place.

    Args:
        df_train: Training DataFrame; its selected columns are overwritten.
        df_test: Test DataFrame; its selected columns are overwritten.
        feature_to_scale: List of column labels to scale.

    Returns:
        The tuple ``(df_train, df_test)`` (the same objects that were passed).
    """
    fitted_scaler = StandardScaler().fit(df_train[feature_to_scale])
    for frame in (df_train, df_test):
        frame[feature_to_scale] = fitted_scaler.transform(frame[feature_to_scale])
    return df_train, df_test

In [None]:
feature_to_scale = ["column_0", "column_1"]

In [None]:
# NOTE(review): both scalers are applied one after the other to the same
# columns, so the standard scaler runs on values that were already min-max
# scaled. This looks like a template offering two alternatives; presumably
# only one of the two calls should be kept. Confirm before use.
# Min Max Scaler
X_train, X_test = minmax_scaler(X_train, X_test, feature_to_scale)

# Standard Scaler
X_train, X_test = standard_scaler(X_train, X_test, feature_to_scale)

# Display the resulting shapes as the cell output.
X_train.shape, X_test.shape

In [None]:
X_train.head()

In [None]:
X_test.head()

## IV.E. Specific Feature Engineering

## IV.F. Final Feature Inspection

In [None]:
X_train.shape, X_test.shape

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
X_train.info()

In [None]:
X_test.info()

# V. Feature Selection

## V.A. Feature Optimal Binning

In [None]:
feature = list(X_train.columns)
categorical_feature = ["column_0", "column_1"]
X = X_train[feature]
y = y_train["target_label"].values
selection_criteria = {
    "iv": {
        "min": 0.02,
        "max": 0.5,
        "strategy": "highest"
    }
}

In [None]:
optimal_binning_process = BinningProcess(
    feature, categorical_variables=categorical_feature, selection_criteria=selection_criteria)
optimal_binning_process.fit(X, y)

In [None]:
optimal_binning_process.information(print_level=2)

In [None]:
optimal_binning_process.summary()

In [None]:
feature_optimal_binning_summary = optimal_binning_process.summary()
feature_optimal_binning_summary.to_csv(
    '../../reports/feature_optimal_binning_summary.csv', index=False)

## V.B. Optimal Binning Details

### V.B.1. _Column Name_ Optimal Binning Details

In [None]:
optb = optimal_binning_process.get_binned_variable("feature_column_name")
optimal_binning_table_column_name = optb.binning_table.build()
optimal_binning_table_column_name.insert(0, "Variables", "feature_column_name")
optb.binning_table.build()

In [None]:
optb.binning_table.plot(metric="event_rate")

## V.C. Optimal Binning Analysis

In [None]:
optimal_binning_table_dict = [
    optimal_binning_table_column_0, optimal_binning_table_column_1]
optimal_binning_analysis = pd.concat(optimal_binning_table_dict, axis=0)
optimal_binning_analysis

In [None]:
# Relabel the event / non-event columns with Good / Bad terminology.
optimal_binning_analysis_dict = {
    "Non-event": "Good", "Event": "Bad", "Event rate": "Bad Rate"}
bin_to_remove = ["Special", "Missing"]
optimal_binning_analysis = optimal_binning_analysis.rename(
    columns=optimal_binning_analysis_dict)
# Keep only the rows whose "Bin" value is not listed in bin_to_remove.
# The mask is negated with ~ rather than compared with `== False`.
is_removed_bin = optimal_binning_analysis["Bin"].isin(bin_to_remove)
optimal_binning_analysis = optimal_binning_analysis[~is_removed_bin]
optimal_binning_analysis

In [None]:
optimal_binning_analysis.to_csv(
    '../../reports/optimal_binning_analysis.csv', index=False)

## V.D. Feature Optimal Binning Weight of Evidence Encoding

In [None]:
X_train_woe = optimal_binning_process.transform(X, metric="woe")
X_test_woe = optimal_binning_process.transform(X_test[feature], metric="woe")
X_train_woe.shape, X_test_woe.shape

In [None]:
X_train_woe.head()

In [None]:
X_test_woe.head()

## V.E. Feature Manual Binning

### V.E.1. Feature Weight of Evidence and Information Value Inspection

In [None]:
def woe_analysis(X, feature, y):
    """Build a per-category Weight of Evidence / Information Value table.

    ``X[feature]`` and ``y`` are placed side by side and the target is grouped
    by the feature's values. For each category the table holds the observation
    count, the mean of the target, and the columns derived from them below.

    Naming follows the code: ``n_good`` is ``mean(target) * count`` and
    ``n_bad`` is ``(1 - mean(target)) * count``, so a target value of 1 is
    counted as "good" here.
    NOTE(review): confirm this matches the target encoding used upstream.

    No smoothing is applied: a category with ``n_good`` or ``n_bad`` equal to
    zero produces an infinite WoE through the log of the ratio.

    Args:
        X: DataFrame containing the ``feature`` column.
        feature: Label of the column to analyse.
        y: Target values aligned with ``X`` by index (the first column that
            ``y`` contributes to the concatenation is used as the target).

    Returns:
        DataFrame sorted by ascending ``WoE`` with a fresh integer index.
        ``diff_*`` columns hold absolute differences between consecutive
        rows, and ``IV`` repeats the summed information value on every row.
    """
    paired = pd.concat([X[feature], y], axis=1)
    feature_col = paired.columns.values[0]
    target_col = paired.columns.values[1]

    # One grouping, two aggregations: observations per category and mean target.
    target_by_category = paired.groupby(feature_col, as_index=False)[target_col]
    counts = target_by_category.count()
    means = target_by_category.mean()

    summary = counts.iloc[:, [0, 1]].copy()
    summary.columns = [summary.columns[0], "n_observation"]
    summary["proportion_of_category"] = means.iloc[:, 1]

    n_obs = summary["n_observation"]
    rate = summary["proportion_of_category"]
    summary["proportion_of_observation"] = n_obs / n_obs.sum()
    summary["n_good"] = rate * n_obs
    summary["n_bad"] = (1 - rate) * n_obs
    summary["proportion_of_good"] = summary["n_good"] / summary["n_good"].sum()
    summary["proportion_of_bad"] = summary["n_bad"] / summary["n_bad"].sum()
    summary["WoE"] = np.log(
        summary["proportion_of_good"] / summary["proportion_of_bad"])

    summary = summary.sort_values(by="WoE", ignore_index=True)
    summary["diff_proportion_of_category"] = (
        summary["proportion_of_category"].diff().abs())
    summary["diff_WoE"] = summary["WoE"].diff().abs()

    # The total information value is broadcast to every row of the table.
    iv_terms = (summary["proportion_of_good"]
                - summary["proportion_of_bad"]) * summary["WoE"]
    summary["IV"] = iv_terms.sum()
    return summary


def plot_by_woe(woe_df, rotation_of_x_axis_labels=0):
    """Plot the ``WoE`` column of ``woe_df`` against its first column.

    The first column's values are converted to strings and used as x-axis
    positions; its column label is used for the x-axis label and the title.

    Args:
        woe_df: DataFrame whose first column holds the categories and which
            has a ``"WoE"`` column.
        rotation_of_x_axis_labels: Rotation passed to ``plt.xticks``.
    """
    feature_name = woe_df.columns[0]
    categories = np.array(woe_df.iloc[:, 0].astype(str))
    woe_values = woe_df["WoE"]

    plt.figure(figsize=(18, 6))
    plt.plot(categories, woe_values, marker='o', linestyle='--', color='k')
    plt.xlabel(feature_name)
    plt.ylabel("Weight of Evidence")
    plt.title("Weight of Evidence by " + feature_name)
    plt.xticks(rotation=rotation_of_x_axis_labels)

In [None]:
X_binning = X_train.copy()
X_binning.shape

In [None]:
X_binning.head()

#### V.E.1.A. _Column_0_

In [None]:
# Categorical Feature
column_0_woe = woe_analysis(X_binning, "column_0", y_train)
column_0_woe

In [None]:
plot_by_woe(column_0_woe)

#### V.E.1.B. _Column_1_

In [None]:
# Continuous Feature
# Fine Classing or Coarse Classing
# This is an iterative process
# Bin into a scratch frame so X_binning["column_1"] keeps its raw numeric
# values: this cell can then be re-run with a different bin count, and
# column_1_binning() can still cut the raw values later on.
column_1_classed = X_binning[["column_1"]].copy()
column_1_classed["column_1"] = pd.cut(column_1_classed["column_1"], 10)
column_1_woe = woe_analysis(column_1_classed, "column_1", y_train)
column_1_woe

In [None]:
plot_by_woe(column_1_woe)

### V.E.2. Feature Binning

#### V.E.2.A. _Column_0_

In [None]:
# This is just a binning example
column_0_bins = [["STR", "ATK", "VIT"],
                 ["DEF", "AGI", "DEX"],
                 ["INT", "LUCK"]]

In [None]:
def column_0_binning(df, bins):
    """Replace raw ``column_0`` categories with coarse bin labels, in place.

    Each group in ``bins`` is collapsed into a single label: the first group
    becomes ``"bin_a"``, the second ``"bin_b"``, and so on (``chr(97 + i)``).
    Values that are not listed in any group are left unchanged.

    Groups are matched with ``Series.isin``, which (unlike ``==``) also
    matches missing values if NaN is explicitly listed in a group.

    Args:
        df: DataFrame with a ``column_0`` column; it is modified in place.
        bins: Sequence of groups, each an iterable of category values to merge.

    Returns:
        The same ``df`` object with ``column_0`` relabelled.
    """
    for i, members in enumerate(bins):
        in_group = df["column_0"].isin(list(members))
        df.loc[in_group, "column_0"] = f'bin_{chr(97+i)}'
    return df

In [None]:
X_binning = column_0_binning(X_binning, column_0_bins)
X_binning.shape

In [None]:
X_binning.head()

#### V.E.2.B. _Column_1_

In [None]:
def column_1_binning(df):
    """Cut ``column_1`` into fixed ranges and store the range labels as strings.

    The column is overwritten in place on ``df``. Bin intervals use
    ``pd.cut``'s default right-closed edges.

    Args:
        df: DataFrame with a numeric ``column_1`` column.

    Returns:
        The same ``df`` object with ``column_1`` replaced by string labels.
    """
    # This is just a binning example
    edges = [-np.inf, 1978840.31, 8013587.75, 23240000.00, 40215236.00,
             54947816.00, 249441952.00, np.inf]
    range_labels = [
        "-inf_to_1978840.31",
        "1978840.31_to_8013587.75",
        "8013587.75_to_23240000.00",
        "23240000.00_to_40215236.00",
        "40215236.00_to_54947816.00",
        "54947816.00_to_249441952.00",
        "249441952.00_to_inf",
    ]
    binned = pd.cut(df["column_1"], bins=edges, labels=range_labels)
    df["column_1"] = binned.astype(str)
    return df

In [None]:
# column_1_binning() overwrites "column_1" in place and never creates a
# "column_1_binned" column, so there is no helper column to drop afterwards
# (dropping it raised a KeyError).
X_binning = column_1_binning(X_binning)
X_binning.shape

In [None]:
X_binning.head()

In [None]:
X_binning.info()

## V.F. Feature Manual Binning Weight of Evidence Encoding

In [None]:
def manual_binning_woe_encoding(X, feature, df_woe_analysis):
    """Return a copy of ``X`` with ``feature`` replaced by its WoE values.

    The lookup is built from ``df_woe_analysis`` by indexing its ``"WoE"``
    column with the ``feature`` column. Values of ``X[feature]`` that are
    absent from that index map to NaN. ``X`` itself is not modified.

    Args:
        X: DataFrame containing the ``feature`` column.
        feature: Label of the column to encode; it must also be a column of
            ``df_woe_analysis``.
        df_woe_analysis: Table with ``feature`` and ``"WoE"`` columns.

    Returns:
        A new DataFrame with the encoded column.
    """
    woe_lookup = df_woe_analysis.set_index(feature)["WoE"]
    encoded = X.copy()
    encoded[feature] = X[feature].map(woe_lookup)
    return encoded

In [None]:
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

### V.F.1. _Column_0_

In [None]:
X_train_woe = manual_binning_woe_encoding(
    X_train_copy, "column_0", column_0_woe)
X_test_woe = manual_binning_woe_encoding(X_test_copy, "column_0", column_0_woe)
X_train_woe.shape, X_test_woe.shape

In [None]:
X_train_woe.head()

In [None]:
X_test_woe.head()

### V.F.2. _Column_1_

In [None]:
# Write only the encoded column_1 into the existing X_*_woe frames.
# Rebuilding them from the raw copies would discard encodings applied in
# earlier cells; encoding from the raw copies keeps this cell safe to re-run.
X_train_woe["column_1"] = manual_binning_woe_encoding(
    X_train_copy, "column_1", column_1_woe)["column_1"]
X_test_woe["column_1"] = manual_binning_woe_encoding(
    X_test_copy, "column_1", column_1_woe)["column_1"]
X_train_woe.shape, X_test_woe.shape

In [None]:
X_train_woe.head()

In [None]:
X_test_woe.head()

## V.G. Feature Manual Binning One-Hot Encoding

In [None]:
X_train_binned = X_train.copy()
X_test_binned = X_test.copy()

In [None]:
def feature_binning(df):
    """Apply the manual ``column_0`` and ``column_1`` binning to ``df``.

    Both helpers overwrite their column on the frame they receive, so ``df``
    is modified in place.

    Args:
        df: DataFrame containing ``column_0`` and ``column_1``.

    Returns:
        The binned DataFrame (the same object that was passed in).
    """
    # column_0_binning() requires the bin groups as its second argument;
    # use the notebook-level column_0_bins definition.
    df = column_0_binning(df, column_0_bins)
    df = column_1_binning(df)
    return df

In [None]:
X_train_binned = feature_binning(X_train_binned)
X_test_binned = feature_binning(X_test_binned)
X_train_binned.shape, X_test_binned.shape

In [None]:
X_train_binned.head()

In [None]:
X_test_binned.head()

In [None]:
feature_to_encode = ["column_0", "column_1"]

In [None]:
def one_hot_encoding(df, feature_to_encode):
    """One-hot encode the listed columns and return the result as a new frame.

    Indicator columns are named ``"<feature>:<category>"`` and stored as
    integers (0/1). Columns that are not encoded keep their original values
    and dtypes.

    Args:
        df: DataFrame to encode; it is not modified.
        feature_to_encode: List of column labels to replace with indicators.

    Returns:
        A new DataFrame with the encoded columns replaced by indicators.
    """
    df_encoded = pd.get_dummies(df,
                                columns=feature_to_encode,
                                prefix=feature_to_encode,
                                prefix_sep=":")
    # Cast only the freshly created indicator columns. Casting the whole frame
    # would truncate float features (e.g. scaled values) to integers and fail
    # on any remaining non-numeric column.
    passthrough_columns = set(df.columns) - set(feature_to_encode)
    indicator_dtypes = {
        column: int
        for column in df_encoded.columns
        if column not in passthrough_columns
    }
    return df_encoded.astype(indicator_dtypes)

In [None]:
X_train_ohe = one_hot_encoding(X_train_binned, feature_to_encode)
X_test_ohe = one_hot_encoding(X_test_binned, feature_to_encode)
# The two splits are encoded independently, so their categories (and therefore
# their indicator columns) can differ. Align the test frame to the training
# columns: indicators absent from the test set are added as 0, and indicators
# not present in the training frame are dropped.
X_test_ohe = X_test_ohe.reindex(columns=X_train_ohe.columns, fill_value=0)
X_train_ohe.shape, X_test_ohe.shape

In [None]:
X_train_ohe.head()

In [None]:
X_test_ohe.head()

## V.H. Export Data

In [None]:
X_train_woe.to_pickle('../../data/processed/X_train_woe.pkl')
X_test_woe.to_pickle('../../data/processed/X_test_woe.pkl')

X_train_ohe.to_pickle('../../data/processed/X_train_ohe.pkl')
X_test_ohe.to_pickle('../../data/processed/X_test_ohe.pkl')