# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Data Preprocessing
---

# 1. Setting up the notebook
---

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

### Import dataset and tidy columns

In [2]:
# Load the raw loan dataset and discard its "Id" column in one step
df = pd.read_csv("./dataset/loanprediction.csv").drop(columns=["Id"])

In [3]:
# Standardise column naming: the snake_case names below are assigned positionally,
# so their order must match the column order of the loaded file
column_names = [
    "income",
    "age",
    "experience",
    "marital_status",
    "house_ownership",
    "car_ownership",
    "profession",
    "city",
    "state",
    "current_job_years",
    "current_house_years",
    "risk_flag",
]
df.columns = column_names

In [4]:
# Retrieve numerical and categorical feature columns; the target "risk_flag" belongs to neither list.
# Numeric columns are selected by dtype and the target is excluded by name, so the result does not
# depend on "risk_flag" being the last numeric column and no summary statistics have to be computed.
numerical = [col for col in df.select_dtypes(include="number").columns if col != "risk_flag"]
categorical = [col for col in df.columns if col not in numerical and col != "risk_flag"]

### Splitting the data into train and test splits

In [5]:
# Separate the target column from the feature columns
target_col = "risk_flag"
y = df[target_col]
x = df.drop(columns=[target_col])

In [6]:
# 80/20 hold-out split, stratified on the target, with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)

# 2. Feature Engineering
---

## 2.1 Label and One Hot Encoding

In [8]:
# Check how the marital_status categories are distributed in the training split before encoding
print( x_train['marital_status'].value_counts() )
# (disabled) the same check for car_ownership:
# print( x_train['car_ownership'].value_counts() )

single     180928
married     20672
Name: marital_status, dtype: int64


In [None]:
def feature_engineering(df_x):
    """
        Encode the low-cardinality categorical columns of a feature frame.

        - marital_status, car_ownership: label encoded (one integer code per category)
        - house_ownership: one-hot encoded into the columns norent_noown / owned / rented,
          after which the original column is dropped

        Parameters
        ----------
        df_x : pd.DataFrame
            Feature frame containing marital_status, car_ownership and house_ownership.
            It is not modified; a copy is encoded.

        Returns
        -------
        pd.DataFrame
            Encoded copy of df_x with a fresh 0..n-1 index (the original index is discarded).

        Raises
        ------
        ValueError
            If house_ownership does not contain exactly as many distinct categories
            as there are hard-coded one-hot column labels.

        Note: the encoders are fitted on whichever frame is passed in, so every frame
        given to this function must contain the same set of categories for the
        encodings to be consistent with each other.
    """
    x = df_x.copy()

    # Label/Binary Encoding — categorical columns with binary categories
    label_enc = LabelEncoder()
    for binary_col in ("marital_status", "car_ownership"):
        x[binary_col] = label_enc.fit_transform(x[binary_col])

    # One Hot Encoding — house_ownership column
    # The `sparse=False` keyword is deliberately not used: newer scikit-learn releases renamed it
    # to `sparse_output` and later removed it. The default (sparse) output is densified instead,
    # which behaves the same on every version.
    onehot_encoder = OneHotEncoder(handle_unknown='ignore')
    house_ownership_values = onehot_encoder.fit_transform(x[['house_ownership']]).toarray()

    # NOTE(review): the labels are assumed to line up with the encoder's category order — confirm
    house_ownership_labels = ["norent_noown", "owned", "rented"]
    n_categories = house_ownership_values.shape[1]
    if n_categories != len(house_ownership_labels):
        raise ValueError(
            f"house_ownership has {n_categories} distinct categories, "
            f"expected {len(house_ownership_labels)}: {house_ownership_labels}"
        )
    house_ownership_df = pd.DataFrame(house_ownership_values, columns=house_ownership_labels)

    # Both frames need the same 0..n-1 index so the column-wise concat pairs rows by position
    x = x.reset_index(drop=True)
    x = pd.concat([x, house_ownership_df], axis=1)

    return x.drop(columns=["house_ownership"])

In [None]:
# Encode both splits with the same routine (each call fits its own encoders on the frame it receives)
x_train, x_test = (feature_engineering(split) for split in (x_train, x_test))

In [None]:
# Show the value distribution of the two label-encoded columns after encoding
for encoded_col in ("marital_status", "car_ownership"):
    print(x_train[encoded_col].value_counts())

# 3. Feature Selection
---

To reduce the dimensionality of the dataset, we apply several feature selection techniques to identify which features to keep and which to drop.

1. Filter Methods
    - Chi-square
    - ANOVA
2. Embedded Methods
    - Random Forest
    - XGBoost
    - Lasso Regression  

In [None]:
def retrieve_feature_importance(importance, names):
    """
        Build a ranking table of features by their score.

        Returns a DataFrame with the columns feature_name / feature_importance,
        ordered from the highest to the lowest score, with a fresh 0..n-1 index.
    """
    ranking = pd.DataFrame({
        'feature_name': np.array(names),
        'feature_importance': np.array(importance),
    })

    ranking = ranking.sort_values(by=['feature_importance'], ascending=False)
    return ranking.reset_index(drop=True)

## 3.1 Filter Methods
---
### Chi-square

- H0: 2 variables are independent (feature and target)
- H1: 2 variables are not independent
- 5% significance level, using p-value
- Reject H0 if p-value < 0.05

#### Conclusion: Drop none

In [None]:
# Column groups used by the filter methods (note: "risk_flag" is listed with the categoricals here)
numerical = ["income", "age", "experience", "current_job_years", "current_house_years"]
categorical = ["risk_flag", "marital_status", "house_ownership", "car_ownership", "profession", "city", "state"]

# Integer-code every categorical column of the full dataset, target included
label_enc = LabelEncoder()
df_cat = pd.DataFrame(
    {col: label_enc.fit_transform(df[col]) for col in categorical},
    index=df.index,
)

y = df_cat["risk_flag"]
x = df_cat.drop(columns=["risk_flag"])

# chi2 returns (statistics, p-values); only the p-values are reported below
chi_scores = chi2(x, y)
chi_p_values = chi_scores[1]

# show results — dataframe
results_df = retrieve_feature_importance(chi_p_values, x.columns).rename(
    columns={"feature_importance": "p-value"}
)
display(results_df)

# show results — graph
p_values = pd.Series(chi_p_values, index=x.columns).sort_values(ascending=False)
p_values.plot.bar()

### ANOVA

- H0: The numerical column cannot clearly discriminate between the 2 classes
- H1: The numerical column can clearly discriminate between the 2 classes
- 5% significance level, using F-Score
- F-value at 0.05 significance = 3.179
- Reject H0 if f-score > 3.179

#### Conclusion: Drop `income`

In [None]:
# Score every numerical column against the target with SelectKBest's default scoring function.
# `y` is not defined in this cell: it is whatever an earlier cell left bound to that name.
x = df[numerical].copy()
fscore = SelectKBest(k="all")
x_fs = fscore.fit_transform(x, y)

# show results — dataframe
results_df = retrieve_feature_importance(fscore.scores_, x.columns).rename(
    columns={"feature_importance": "f-score"}
)
display(results_df)

# show results — graph
plt.bar(numerical, fscore.scores_)
plt.xticks(rotation=90)
plt.show()

## 3.2 Embedded Methods
---

Note: Encoding is required for profession, city and state for the purpose of Feature Selection. However, the actual encoding will only be done during GridSearch Cross-Validation

In [None]:
def target_encoding(df_x, df_y):
    """
        Target-encode the profession, city and state columns for feature selection.

        NOTE(review): this notebook called target_encoding without defining it anywhere,
        which raises NameError on a fresh kernel. This definition is a reconstruction based on
        the TargetEncoder import and the markdown note preceding this cell — confirm it
        matches the helper the team originally used.

        Parameters
        ----------
        df_x : pd.DataFrame
            Feature frame containing the profession, city and state columns.
        df_y : pd.Series
            Target values, in the same row order as df_x.

        Returns
        -------
        pd.DataFrame
            Copy of df_x (with a fresh 0..n-1 index) in which the three columns
            are replaced by their target encodings.
    """
    encoder = TargetEncoder(cols=["profession", "city", "state"])

    # Pair features and target by position: feature_engineering re-indexed the features
    # but the target still carries its original index labels
    features = df_x.reset_index(drop=True)
    target = df_y.reset_index(drop=True)
    return encoder.fit_transform(features, target)


x_train_embedded = target_encoding(x_train, y_train)

### 3.2.1 Random Forest Classifier

In [None]:
# Fit a 100-tree random forest on the target-encoded training features
rf_clf = RandomForestClassifier(random_state=2021, n_estimators=100)
rf_clf.fit(x_train_embedded, y_train.to_numpy().ravel())

In [None]:
# Rank the features by the fitted forest's feature_importances_
feat_impt_df = retrieve_feature_importance(
    importance=rf_clf.feature_importances_,
    names=x_train_embedded.columns,
)
feat_impt_df

### 3.2.2 XGBoost

In [None]:
# Fit an XGBoost classifier on the same encoded features, using the same seed as the forest
xgb = XGBClassifier(random_state=2021, use_label_encoder=False)
xgb.fit(x_train_embedded, y_train)

In [None]:
# Rank the features by the fitted XGBoost model's feature_importances_
feat_impt_df = retrieve_feature_importance(
    importance=xgb.feature_importances_,
    names=x_train_embedded.columns,
)
feat_impt_df

### 3.2.3 Lasso Regression

In [None]:
# Fit a Lasso model per regularisation strength and print the features ranked by |coefficient|
alpha_param = [1, 0.1, 0.01, 0.001, 0.0001]

for alpha in alpha_param:
    lasso = Lasso(alpha=alpha).fit(x_train_embedded, y_train)
    importance = np.abs(lasso.coef_)

    feat_impt_df = retrieve_feature_importance(importance, x_train_embedded.columns)
    # The index name doubles as a header identifying the alpha in the printed table
    feat_impt_df.index.name = f"alpha={alpha}"
    print(feat_impt_df)
    print()

### Conclusion:
Results from the Feature Selection process are consolidated into this Google Sheet: https://docs.google.com/spreadsheets/d/1QSPAkr9M8qJNOBBwmAk2dnTFTrOGiVrXPHorGIVx2KM/edit?usp=sharing

In [None]:
# Remove the features rejected by the feature selection step from both splits
dropped_features = ['current_house_years', 'current_job_years', 'norent_noown', 'owned']
for split in (x_train, x_test):
    split.drop(columns=dropped_features, inplace=True)

# 4. Exporting preprocessed Train Test sets to csv files
---

In [None]:
# Give features and target of each split the same 0..n-1 index so that the
# column-wise concat pairs them row by row instead of aligning on old index labels
x_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
train = pd.concat([x_train, y_train], axis=1)

x_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
test = pd.concat([x_test, y_test], axis=1)

try:
    train.to_csv("./dataset/train.csv", index=False)
    test.to_csv("./dataset/test.csv", index=False)
    print('Succesfully exported')
except OSError as err:
    # Only file-system failures (missing directory, permissions, full disk, ...) are reported here;
    # any other exception, including a keyboard interrupt, propagates with its traceback
    print(f'Error: {err}')