# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **Hardefa Rogonondo** | hardefarogonondo@gmail.com | **IBRD Credit Scorecard Predictive Engine** |

# II. Notebook Target Definition

_Insert Text Here_

# III. Notebook Setup

## III.A. Import Libraries

In [None]:
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, KMeansSMOTE, RandomOverSampler, SMOTE, SMOTEN, SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import chi2_contingency, ttest_ind
from sklearn.feature_selection import f_classif
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns

# Show every column and every row when a DataFrame is displayed (None removes the limit).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## III.B. Import Data

In [None]:
# Load the processed dataset that was saved as a pickle.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle('../../data/processed/df.pkl')

In [None]:
# Preview the first rows.
df.head()

# IV. Exploratory Data Analysis

## IV.A. Data Shape Inspection

In [None]:
# NOTE(review): X and y are not assigned in any cell above (only df is loaded);
# presumably they are the feature frame and the target split from df -- define them before this cell.
X.shape, y.shape

## IV.B. Data Information Inspection

In [None]:
# Print the info() summary of the features.
X.info()

In [None]:
# Same summary for the target.
y.info()

## IV.C. Missing Values Inspection

In [None]:
# Percentage of missing values per feature, sorted ascending.
missing_count = X.isnull().sum().sort_values()
missing_share = missing_count / len(X) * 100
X_missing = missing_share.rename_axis("variables").reset_index(name = "missing_percentage")
X_missing

In [None]:
# Bar chart of the missing-value percentage of every feature.
missing_ax = sns.barplot(x = "variables", y = "missing_percentage", data = X_missing, palette = 'Blues')
missing_ax.set_title("X Null Values Proportion")
# Feature names are long, so the tick labels are drawn vertically.
plt.xticks(rotation = 'vertical')
plt.show()

### IV.C.1. Missing Value Imputation for Numerical Columns

In [None]:
# Split the features into object-typed and numeric columns (independent copies of X).
# NOTE(review): columns of any other dtype (e.g. bool, category, datetime) land in neither
# frame and are therefore lost when X is rebuilt from these two frames further down.
X_categorical = X.select_dtypes(include = 'object').copy()
X_numerical = X.select_dtypes(include = 'number').copy()
X_categorical.shape, X_numerical.shape

In [None]:
X_numerical.columns

In [None]:
# Which numerical columns contain at least one missing value.
X_numerical.isnull().any()

In [None]:
# Median imputer for NaN entries in the numerical columns.
imputer = SimpleImputer(missing_values = np.nan, strategy = "median") # Other strategy is mean
# NOTE(review): the medians are learned from all rows, before the train/test split in
# section V.D -- confirm that this leakage into the test set is acceptable.
imputer.fit(X_numerical)

In [None]:
imputed_data = imputer.transform(X_numerical)
X_numerical = pd.DataFrame(imputed_data)
X_numerical.columns = X_numerical.columns
X_numerical.index = X_numerical.index

In [None]:
X_numerical.isnull().any()

### IV.C.2. Missing Value Imputation for Categorical Columns

In [None]:
X_categorical.columns

In [None]:
# Missing-value count per categorical column before filling.
X_categorical.isnull().sum()

In [None]:
# Missing categories become their own explicit level, the literal string "Null".
X_categorical = X_categorical.fillna(value = "Null")

In [None]:
# Missing-value count per categorical column after filling.
X_categorical.isnull().sum()

In [None]:
# Rebuild X side by side from the two imputed frames (numerical columns first).
# axis = 1 aligns rows on the index, so both frames must carry the same row index.
X = pd.concat([X_numerical, X_categorical], axis = 1)

In [None]:
X.head()

In [None]:
# Check that no column still contains missing values.
X.isnull().any()

## IV.D. Duplicated Values Inspection

In [None]:
# Rows that share a value in the chosen column; keep = False flags every member of a duplicate group.
# NOTE(review): "column_name" looks like a template placeholder -- replace it with a real column before running.
X_duplicated = X[X.duplicated(subset = "column_name", keep = False)]
X_duplicated.shape

In [None]:
X_duplicated

## IV.E. Data Visualization

### IV.E.1. Target Label Proportion

In [None]:
# Barplot
# Count of rows per target class, drawn first and titled afterwards.
y_proportion = sns.countplot(x = "target_label", data = y, palette = 'Blues')
y_proportion.set_title("Target Label Proportion")
# Annotate each bar with its count.
bars = y_proportion.containers[0]
y_proportion.bar_label(bars)
plt.show()

In [None]:
# Pie Chart
# Count each class once and reuse the result for both the wedge sizes and the labels, so
# the two are guaranteed to line up. Each label shows the class value together with its
# count -- the counts alone do not say which class a wedge belongs to.
target_counts = y["target_label"].value_counts()
plt.title("Target Label Proportion")
plt.pie(x = target_counts,
        labels = [f"{label} ({count})" for label, count in target_counts.items()],
        colors = sns.color_palette('Set3'),
        autopct = '%1.1f%%')
plt.show()

### IV.E.2. _Column Name_

In [None]:
# Count of rows per level of one feature.
# NOTE(review): "column_name" looks like a template placeholder -- replace it with a real column.
plt.title("Column Name Proportion")
# The qualitative palette is called 'Set1'; 'Set_1' is not a known palette name and raises.
column_name_proportion = sns.countplot(data = X,
                                       x = "column_name",
                                       palette = 'Set1')
# Annotate each bar with its count.
column_name_proportion.bar_label(column_name_proportion.containers[0])
plt.show()

## IV.F. Statistical Analysis

### IV.F.1. Statistical Description

In [None]:
# Summary statistics; on a mixed frame describe() reports the numeric columns only by default.
X.describe()

### IV.F.2. Skewness Analysis

In [None]:
# X holds object (categorical) columns as well. Skewness is only defined for numbers, and
# pandas >= 2.0 raises instead of silently dropping non-numeric columns, so restrict the
# computation to numeric columns explicitly.
X_skewness = X.skew(numeric_only = True)
# Long format (one row per variable) for plotting.
X_skewness = pd.DataFrame({"variables": X_skewness.index, "skewness": X_skewness.values})

In [None]:
# Bar chart of the skewness of each variable.
skew_fig, skew_ax = plt.subplots()
skew_ax.bar(X_skewness["variables"], X_skewness["skewness"])
skew_ax.set_title("Skewness Analysis")
skew_ax.set_xlabel("Variables")
skew_ax.set_ylabel("Skewness")
# Tilt the variable names so they do not overlap.
plt.xticks(rotation = 45)
plt.show()

### IV.F.3. Chi-Squared Analysis

Test whether each categorical feature is independent of the target label, using a chi-squared test of independence on the contingency table of the two. A small p-value indicates that the feature and the target are associated.

In [None]:
# Re-derive the object-typed and numeric views from the current X for the statistical tests.
X_categorical = X.select_dtypes(include = 'object').copy()
X_numerical = X.select_dtypes(include = 'number').copy()
X_categorical.shape, X_numerical.shape

In [None]:
X_categorical.columns

In [None]:
X_numerical.columns

In [None]:
# Chi-squared test of independence between the target and every categorical feature.
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
# NOTE(review): pd.crosstab needs a 1-D y here, while the plotting cells index
# y["target_label"] -- confirm whether y is a Series or a single-column DataFrame.
chi2_rows = []

for column in X_categorical.columns:
    cross_tab = pd.crosstab(y, X_categorical[column])
    # Only the p-value is reported; the statistic, degrees of freedom and expected table are discarded.
    _, p_value, _, _ = chi2_contingency(cross_tab)
    chi2_rows.append({"variables": column, "p-value": round(p_value, 10)})

# Passing columns keeps the two headers even when there are no categorical features.
chi2_result = pd.DataFrame(chi2_rows, columns = ["variables", "p-value"])
# Smallest p-value (strongest evidence of association) first.
chi2_result.sort_values(by = "p-value", ascending = True, inplace = True, ignore_index = True)
chi2_result

### IV.F.4. T-Statistics Analysis

Assess whether the mean of each numerical feature differs significantly between the two target classes (label 0 versus label 1), using an independent two-sample t-test per feature.

In [None]:
# Replace any remaining NaN with the mean of its column before testing.
X_numerical = X_numerical.fillna(value = X_numerical.mean())

In [None]:
# Two-sample t-test of every numerical feature between the y == 0 rows and the y == 1 rows.
class_0_rows = y == 0
class_1_rows = y == 1

t_test_results = []
for feature in X_numerical.columns:
    statistic, probability = ttest_ind(X_numerical.loc[class_0_rows, feature],
                                       X_numerical.loc[class_1_rows, feature])
    t_test_results.append({"variables": feature, "t-statistic": statistic, "p-value": probability})

# One row per feature, largest t-statistic first.
t_test_table = pd.DataFrame(t_test_results).sort_values(by = "t-statistic", ascending = False, ignore_index = True)
t_test_table

### IV.F.5. ANOVA F Analysis

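Compare the mean of each numerical feature across the target classes with a one-way ANOVA F-test. Unlike the t-test, it also applies when the target has more than two classes. A higher F-score indicates that the feature separates the classes more strongly.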

In [None]:
# f_classif cannot handle NaN, so fill any remaining gap with the column mean.
X_numerical = X_numerical.fillna(value = X_numerical.mean())

In [None]:
# ANOVA F-test of every numerical feature against the target.
f_statistic, p_values = f_classif(X_numerical, y)

# One row per feature, highest F-score first; p-values rounded to 10 decimals.
anova_f_table = (
    pd.DataFrame({
        "variables": X_numerical.columns,
        "f-score": f_statistic,
        "p-values": np.round(p_values, 10),
    })
    .sort_values(by = "f-score", ascending = False, ignore_index = True)
)
anova_f_table

## IV.G. Correlation Matrix

In [None]:
X.corr()

In [None]:
sns.heatmap(data = X.corr())

# V. Preprocessing

## V.A. Columns Reorder

In [None]:
# Desired column order for X.
# NOTE(review): "column_0".."column_2" look like template placeholders -- confirm the real names.
custom_order = ["column_0", "column_1", "column_2"]

In [None]:
# reindex keeps only the listed columns: anything not in custom_order is dropped,
# and a listed name that is missing from X becomes an all-NaN column.
X = X.reindex(columns = custom_order)
X.shape

In [None]:
X.head()

## V.B. Specific Preprocessing

## V.C. Imbalance Data Preprocessing

### V.C.1. Random Undersampling

In [None]:
# Resample with RandomUnderSampler (fixed seed) and show the resulting class counts.
# NOTE(review): the resampled pair is not used by the split/export cells in V.D-V.E,
# which work on the original X and y -- confirm which dataset is meant to be exported.
rus = RandomUnderSampler(random_state = 777)
X_undersampled, y_undersampled = rus.fit_resample(X, y)
y_undersampled.value_counts()

### V.C.2. Random Oversampling

In [None]:
# Resample with RandomOverSampler (fixed seed) and show the resulting class counts.
ros = RandomOverSampler(random_state = 777)
X_oversampled, y_oversampled = ros.fit_resample(X, y)
y_oversampled.value_counts()

### V.C.3. Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
# Resample with SMOTE (fixed seed) and show the resulting class counts.
# NOTE(review): SMOTE interpolates numeric feature vectors; X still holds object columns
# unless section V.B encodes them -- confirm before running.
smote = SMOTE(random_state = 777)
X_smote, y_smote = smote.fit_resample(X, y)
y_smote.value_counts()

### V.C.4. Synthetic Minority Oversampling Technique for Nominal (SMOTEN)

In [None]:
# Resample with SMOTEN (fixed seed) and show the resulting class counts.
# NOTE(review): SMOTEN is meant for nominal features -- confirm it suits the dtypes in X.
smoten = SMOTEN(random_state = 777)
X_smoten, y_smoten = smoten.fit_resample(X, y)
y_smoten.value_counts()

### V.C.5. Adaptive Synthetic Sampling (ADASYN)

In [None]:
# Resample with ADASYN (fixed seed) and show the resulting class counts.
adasyn = ADASYN(random_state = 777)
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
y_adasyn.value_counts()

### V.C.6. KMeans Clustering + Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
# Resample with KMeansSMOTE (fixed seed) and show the resulting class counts.
kmeanssmote = KMeansSMOTE(random_state = 777)
X_kmeanssmote, y_kmeanssmote = kmeanssmote.fit_resample(X, y)
y_kmeanssmote.value_counts()

### V.C.7. Support Vector Machine (SVM) + Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
# Resample with SVMSMOTE (fixed seed) and show the resulting class counts.
svmsmote = SVMSMOTE(random_state = 777)
X_svmsmote, y_svmsmote = svmsmote.fit_resample(X, y)
y_svmsmote.value_counts()

### V.C.8. Synthetic Minority Oversampling Technique (SMOTE) + Edited Nearest Neighbour (ENN)

In [None]:
# Resample with SMOTEENN (fixed seed) and show the resulting class counts.
smoteenn = SMOTEENN(random_state = 777)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X, y)
y_smoteenn.value_counts()

### V.C.9. Synthetic Minority Oversampling Technique (SMOTE) + Tomek Links

In [None]:
# Resample with SMOTETomek (fixed seed) and show the resulting class counts.
smotetomek = SMOTETomek(random_state = 777)
X_smotetomek, y_smotetomek = smotetomek.fit_resample(X, y)
y_smotetomek.value_counts()

## V.D. Data Splitting

In [None]:
# Hold out 30% of the rows as the test set, with a fixed seed.
# NOTE(review): no stratify argument is passed, so class proportions may differ between
# the train and test sets -- consider stratify = y given the imbalance handling above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 777)

In [None]:
X_train.shape, X_test.shape

In [None]:
y_train.shape, y_test.shape

## V.E. Export Data

In [None]:
# Persist the four splits as pickles, each under its own file name.
for split_frame, split_path in (
    (X_train, '../../data/processed/X_train.pkl'),
    (X_test, '../../data/processed/X_test.pkl'),
    (y_train, '../../data/processed/y_train.pkl'),
    (y_test, '../../data/processed/y_test.pkl'),
):
    split_frame.to_pickle(split_path)