In [1]:
import numpy as np
import pandas as pd
import altair as alt

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# import lightgbm as lgb

from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, PrecisionRecallDisplay, roc_curve, RocCurveDisplay
from sklearn.svm import SVC

# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline

In [2]:
# Load train and test dataset
train_df = pd.read_csv('../data/preprocessed/train.csv')
test_df = pd.read_csv('../data/preprocessed/test.csv')

In [3]:
X_train = train_df.drop(columns=['TARGET', 'random'])
y_train = train_df['TARGET']

X_test = test_df.drop(columns=['TARGET', 'random'])
y_test = test_df['TARGET']

In [4]:
y_train.value_counts(normalize=True)

0    0.875625
1    0.124375
Name: TARGET, dtype: float64

#### Scoring Metrics

Since this is an classification problem, the standard scoring metrics accuracy and apart from that, as we know from EDA that there is some class imbalance involved in the data, so it makes sense to look at scoring metrics like Precision, Recall and F1 score. To evaluate the performance of the model, i will also be looking at the Precision-Recall curve.

In [5]:
scoring_metrics = {
    "accuracy",
    "f1",
    "precision",
    "recall"
}

In [6]:
# Adapted from Lecture notes
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

#### Baseline Model 

In [7]:
model = DummyClassifier()
model.fit(X_train, y_train)


mean_std_cross_val_scores(model, X_train, y_train, scoring=scoring_metrics)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


fit_time          0.014 (+/- 0.004)
score_time        0.017 (+/- 0.004)
test_recall       0.000 (+/- 0.000)
test_f1           0.000 (+/- 0.000)
test_accuracy     0.876 (+/- 0.000)
test_precision    0.000 (+/- 0.000)
dtype: object

As expected, the result of baseline model is giving very high accuracy but precision, recall and f1 is 0

#### Preprocessing columns

In [8]:
numeric_cols = ['BALANCE',
                 'SCR',
                 'HOLDING_PERIOD',
                 'LEN_OF_RLTN_IN_MNTH',
                 'NO_OF_L_CR_TXNS',
                 'NO_OF_L_DR_TXNS',
                 'TOT_NO_OF_L_TXNS',
                 'NO_OF_BR_CSH_WDL_DR_TXNS',
                 'NO_OF_ATM_DR_TXNS',
                 'NO_OF_NET_DR_TXNS',
                 'NO_OF_MOB_DR_TXNS',
                 'NO_OF_CHQ_DR_TXNS',
                 'AMT_ATM_DR',
                 'AMT_BR_CSH_WDL_DR',
                 'AMT_CHQ_DR',
                 'AMT_NET_DR',
                 'AMT_MOB_DR',
                 'AMT_L_DR',
                 'AMT_OTH_BK_ATM_USG_CHGS',
                 'AMT_MIN_BAL_NMC_CHGS',
                 'AVG_AMT_PER_ATM_TXN',
                 'AVG_AMT_PER_CSH_WDL_TXN',
                 'AVG_AMT_PER_CHQ_TXN',
                 'AVG_AMT_PER_NET_TXN',
                 'AVG_AMT_PER_MOB_TXN'
                ]


numeric_cols_remove_collinear = ['BALANCE',
                                 'SCR',
                                 'HOLDING_PERIOD',
                                 'LEN_OF_RLTN_IN_MNTH',
                                 'NO_OF_L_CR_TXNS',
                                 'NO_OF_L_DR_TXNS',
                                 'AMT_OTH_BK_ATM_USG_CHGS',
                                 'AMT_MIN_BAL_NMC_CHGS',
                                 'AVG_AMT_PER_ATM_TXN',
                                 'AVG_AMT_PER_CSH_WDL_TXN',
                                 'AVG_AMT_PER_CHQ_TXN',
                                 'AVG_AMT_PER_NET_TXN',
                                 'AVG_AMT_PER_MOB_TXN'
                                ]

categorical_cols = ['NO_OF_IW_CHQ_BNC_TXNS', 'NO_OF_OW_CHQ_BNC_TXNS', 'AGE_BKT',
                    'GENDER', 'OCCUPATION', 'ACC_TYPE', 'FLG_HAS_CC',
                    'FLG_HAS_ANY_CHGS', 'FLG_HAS_NOMINEE', 'FLG_HAS_OLD_LOAN']

In [9]:
preprocessor = ColumnTransformer([
            ("scaling", StandardScaler(), numeric_cols),
            ("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_cols)
])

In [10]:
preprocessor_remove_multicolinear = ColumnTransformer([
            ("scaling", StandardScaler(), numeric_cols_remove_collinear),
            ("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_cols)
])

In [11]:
X_train_transformed = preprocessor_remove_multicolinear.fit_transform(X_train)
X_test_transformed = preprocessor_remove_multicolinear.transform(X_test)

onehotencode_feature_names = preprocessor_remove_multicolinear.named_transformers_['onehot'].get_feature_names_out().tolist()
feature_names = numeric_cols_remove_collinear + onehotencode_feature_names

X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)

# Rename columns
X_train_transformed = X_train_transformed.rename(columns={'AGE_BKT_>50': 'AGE_BKT_GT_50',
                                                          'AGE_BKT_<25': 'AGE_BKT_LT_25'})

X_test_transformed = X_test_transformed.rename(columns={'AGE_BKT_>50': 'AGE_BKT_GT_50',
                                                        'AGE_BKT_<25': 'AGE_BKT_LT_25'})

##### Three problems to address in the dataset

1. **Multicollinearity** - To handle multicollinearity, I looked up into the input features in eda correlation plot and have extracted selected which have very low correlation 

    _Refer correlation plot of selected features below_

2.  **Class Imbalance** - To handle class imbalance, I have using class_weight = 'balanced' and SMOTE and see what works better.

3. **Large number of features** - There are total 38 features per example. Though this dataset is not considered high dimensional, we need to know if we need all 38 features.

# Detect multicollinearity in features

### Method - 1 (VIF- Variance Inflation factor)

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [13]:
# VIF dataframe
vif_data = pd.DataFrame()
X = train_df[numeric_cols_remove_collinear]
vif_data["feature"] = X.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

In [14]:
vif_data

Unnamed: 0,feature,VIF
0,BALANCE,1.460184
1,SCR,3.391247
2,HOLDING_PERIOD,3.668257
3,LEN_OF_RLTN_IN_MNTH,4.54522
4,NO_OF_L_CR_TXNS,3.372208
5,NO_OF_L_DR_TXNS,6.241194
6,AMT_OTH_BK_ATM_USG_CHGS,1.892585
7,AMT_MIN_BAL_NMC_CHGS,1.011957
8,AVG_AMT_PER_ATM_TXN,2.473066
9,AVG_AMT_PER_CSH_WDL_TXN,1.762525


In [15]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=20)
pca_model.fit(X_train_transformed)

PCA(n_components=20)

In [16]:
sum(pca_model.explained_variance_ratio_)

0.9207161273117026

In [17]:
X = pca_model.transform(X_train_transformed)

In [18]:
X.shape

(16000, 20)

In [19]:
scoring_metrics = {
    "accuracy",
    "f1",
    "precision",
    "recall"
}

In [20]:
model_RF = RandomForestClassifier()
model_RF.fit(X, y_train)

mean_std_cross_val_scores(model_RF,
                       X,
                       y_train,
                       return_train_score=True,
                       scoring=scoring_metrics)

fit_time           14.924 (+/- 0.334)
score_time          0.214 (+/- 0.007)
test_recall         0.703 (+/- 0.048)
train_recall        1.000 (+/- 0.000)
test_f1             0.821 (+/- 0.034)
train_f1            1.000 (+/- 0.000)
test_accuracy       0.962 (+/- 0.006)
train_accuracy      1.000 (+/- 0.000)
test_precision      0.989 (+/- 0.006)
train_precision     1.000 (+/- 0.000)
dtype: object

In [21]:
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [22]:
model_xgb = XGBClassifier(use_label_encoder =False, 
                          verbosity=0)
model_xgb.fit(X, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=0)

In [23]:
mean_std_cross_val_scores(model_xgb,
                           X,
                           y_train,
                           return_train_score=True,
                           scoring=scoring_metrics)

fit_time           4.007 (+/- 0.128)
score_time         0.042 (+/- 0.019)
test_recall        0.663 (+/- 0.031)
train_recall       0.957 (+/- 0.011)
test_f1            0.769 (+/- 0.023)
train_f1           0.978 (+/- 0.006)
test_accuracy      0.951 (+/- 0.004)
train_accuracy     0.995 (+/- 0.001)
test_precision     0.918 (+/- 0.011)
train_precision    1.000 (+/- 0.000)
dtype: object