In [1]:
# packages
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import os

# rebalancing
from imblearn.over_sampling import SMOTENC

# import feature data types
import json
# data_types is indexed by type-group name further down (e.g. 'ordinal');
# presumably each value is a list of feature/column names — verify against the JSON
with open('python_scripts/data_types.json') as types_file:
    data_types = json.load(types_file)

### This notebook deals with resampling the data to handle extreme class imbalance in the dataset

In [7]:
# Load the training features and labels that will be resampled below
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ('data/X_train.csv', 'data/y_train.csv')
)

Currently there is extreme class imbalance in the dataset, with approximately 100:1 negative:positive instances of fraud.
- Resampling strategy:
    - to reduce the risk of overfitting to the specific fraud positive instances, we will oversample positive instances by varying degrees:
        - 10:1 negative:positive ratio
        - 4:1 negative:positive ratio 
        - 2:1 negative:positive ratio

In [8]:
# Handling class imbalance
# Split the feature columns into numerical vs. nominal groups so SMOTE-NC
# knows which columns to treat as categorical.
numerical_type_keys = (
    'numerical_continuous_bounded',
    'numerical_continuous_unbounded',
    'numerical_discrete',
    'ordinal',
)
to_drop = ['prev_address_months_count', 'bank_months_count', 'month'] # variables not used anymore

# Every feature declared under a numerical/ordinal type group, minus dropped ones
numericals = [
    col
    for type_key in numerical_type_keys
    for col in data_types[type_key]
    if col not in to_drop
]
# Anything in X_train that is not numerical is treated as nominal
nominals = [col for col in X_train.columns if col not in numericals]

# Positional indices of the nominal columns (SMOTENC's categorical_features input)
categorical_columns_idx = [X_train.columns.get_loc(col) for col in nominals]

Tooling: apply SMOTE-NC (SMOTE for Nominal and Continuous features), which can oversample datasets that mix categorical and continuous variables


In [9]:
# Oversample fraud cases to a 10:1 negative:positive ratio
# (sampling_strategy = minority/majority ratio after resampling)
smote_nc = SMOTENC(
    categorical_features=categorical_columns_idx,
    sampling_strategy=0.1,
    random_state=0,
)
X_train_resampled_10, y_train_resampled_10 = smote_nc.fit_resample(X_train, y_train)
# Display the resulting class counts as a sanity check
y_train_resampled_10.value_counts()

fraud_bool
0             786838
1              78683
Name: count, dtype: int64

In [10]:
# Oversample fraud cases to a 4:1 negative:positive ratio
# (sampling_strategy = minority/majority ratio after resampling)
smote_nc = SMOTENC(
    categorical_features=categorical_columns_idx,
    sampling_strategy=0.25,
    random_state=0,
)
X_train_resampled_25, y_train_resampled_25 = smote_nc.fit_resample(X_train, y_train)
# Display the resulting class counts as a sanity check
y_train_resampled_25.value_counts()

fraud_bool
0             786838
1             196709
Name: count, dtype: int64

In [11]:
# Oversample fraud cases to a 2:1 negative:positive ratio
# (sampling_strategy = minority/majority ratio after resampling)
smote_nc = SMOTENC(
    categorical_features=categorical_columns_idx,
    sampling_strategy=0.5,
    random_state=0,
)
X_train_resampled_50, y_train_resampled_50 = smote_nc.fit_resample(X_train, y_train)
# Display the resulting class counts as a sanity check
y_train_resampled_50.value_counts()

fraud_bool
0             786838
1             393419
Name: count, dtype: int64

Save resampled training sets to .csv

In [13]:
# Destination folder for the resampled training sets
data_folder = os.path.join(os.getcwd(), 'data/resampled')
# to_csv does not create missing directories, so create the folder up front
# (no-op when it already exists) — otherwise a fresh checkout fails here.
os.makedirs(data_folder, exist_ok=True)

# File suffix -> (features, labels) for each negative:positive ratio
resampled_sets = {
    '10': (X_train_resampled_10, y_train_resampled_10),  # 10:1
    '25': (X_train_resampled_25, y_train_resampled_25),  # 4:1
    '50': (X_train_resampled_50, y_train_resampled_50),  # 2:1
}

for suffix, (X_resampled, y_resampled) in resampled_sets.items():
    # NOTE: file_path_test holds the path of the *label* file (y_train_*), not a
    # test set; the original variable names are kept for compatibility.
    file_path_train = os.path.join(data_folder, f'X_train_{suffix}.csv')
    file_path_test = os.path.join(data_folder, f'y_train_{suffix}.csv')
    X_resampled.to_csv(file_path_train, index=False)
    y_resampled.to_csv(file_path_test, index=False)