In [47]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
import smote
import imblearn
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [48]:
# Show which imbalanced-learn release is installed in this kernel.
print(imblearn.__version__)

0.13.0


In [3]:
# Load the raw extract. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-type inference per column.
source_csv = "/workspaces/workspace/data_for_synthetic_test.csv"
bnplData = pd.read_csv(source_csv, low_memory=False)
bnplData.shape

(33123, 195)

In [4]:
# List every column header of the loaded frame.
column_names = list(bnplData.columns)
print(column_names)

['TXN_UID', 'OUTCOME', 'BAD', 'AGE', 'BALANCE', 'BALANCE_A', 'BALANCE_AGE', 'BALANCE_C', 'CCAAMC060030A', 'CCAASC060001A', 'CCAATC060030A', 'CCAATC060030R', 'CCAATD060030S', 'CCACSD060001R', 'CCAUTD060030R', 'CCHACT730001X', 'CCHAID021000A', 'CCHAMC007000A', 'CCHAMC021000A', 'CCHAMC182001A', 'CCHAMC182007A', 'CCHAMC365001A', 'CCHAMC365182R', 'CCHAMC730001A', 'CCHAMC730014A', 'CCHAMC730365R', 'CCHAMD090001R', 'CCHAMD090001X', 'CCHAMD182001R', 'CCHAMD182001X', 'CCHAMD730001X', 'CCHAMT730001X', 'CCHAST730001X', 'CCHAZC365001A', 'CCHAZT730001X', 'CCHCMC730000A', 'CCHDMD021000N', 'CCHDMD730001N', 'CCHDMT021000N', 'CCHDMT730001N', 'CCHUMC730001A', 'CCHXID006000B', 'CCHXID090001B', 'CCHXMC021000A', 'CCHXMC730001A', 'CCHXMD009000A', 'CCHXMD180999A', 'CCHXMS021000C', 'CCHXZC730001X', 'CCIATC365001A', 'CCIATT365001X', 'CCIDTT365001R', 'CCIUTD365001R', 'CCIXTC365001A', 'CCMAMS030000C', 'CCMDMC021000A', 'CCMXAC003000A', 'CCMXAC021000A', 'CCMXMS000000X', 'CCMXMS004000X', 'CCMXMS007000X', 'CCMXMS030

In [16]:
# Split the frame into the target and the predictors. 'OUTCOME' is the target;
# both 'BAD' and 'OUTCOME' are removed from the predictors.
# NOTE(review): 'TXN_UID' stays in X and is handled as an ordinary numeric
# feature downstream - confirm that is intended for what looks like an identifier.
y = bnplData['OUTCOME']
X = bnplData.drop(columns=['BAD', 'OUTCOME'])

In [17]:
# Class balance of the target before any resampling.
y.value_counts()

OUTCOME
NO ATTEMPTS    23863
DECLINED        5783
GOOD            3262
LOSS             215
Name: count, dtype: int64

Missing values must be imputed first to avoid errors in the steps that follow.

In [19]:
# Keep only the columns stored as int64 or float64; the original column order
# is preserved.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols

['TXN_UID',
 'AGE',
 'BALANCE',
 'BALANCE_A',
 'BALANCE_AGE',
 'BALANCE_C',
 'CCAAMC060030A',
 'CCAASC060001A',
 'CCAATC060030A',
 'CCAATC060030R',
 'CCAATD060030S',
 'CCACSD060001R',
 'CCAUTD060030R',
 'CCHACT730001X',
 'CCHAID021000A',
 'CCHAMC007000A',
 'CCHAMC021000A',
 'CCHAMC182001A',
 'CCHAMC182007A',
 'CCHAMC365001A',
 'CCHAMC365182R',
 'CCHAMC730001A',
 'CCHAMC730014A',
 'CCHAMC730365R',
 'CCHAMD090001R',
 'CCHAMD090001X',
 'CCHAMD182001R',
 'CCHAMD182001X',
 'CCHAMD730001X',
 'CCHAMT730001X',
 'CCHAST730001X',
 'CCHAZC365001A',
 'CCHAZT730001X',
 'CCHCMC730000A',
 'CCHDMD021000N',
 'CCHDMD730001N',
 'CCHDMT021000N',
 'CCHDMT730001N',
 'CCHUMC730001A',
 'CCHXID006000B',
 'CCHXID090001B',
 'CCHXMC021000A',
 'CCHXMC730001A',
 'CCHXMD009000A',
 'CCHXMD180999A',
 'CCHXMS021000C',
 'CCHXZC730001X',
 'CCIATC365001A',
 'CCIATT365001X',
 'CCIDTT365001R',
 'CCIUTD365001R',
 'CCIXTC365001A',
 'CCMAMS030000C',
 'CCMDMC021000A',
 'CCMXAC003000A',
 'CCMXAC021000A',
 'CCMXMS000000X',
 'CCMX

In [21]:
# Impute numeric columns: every missing value is replaced by the sentinel -999.
# keep_empty_features=True pins the current behaviour of the constant strategy
# (columns that are entirely missing are filled, not dropped). Recent
# scikit-learn releases announce a change of that default through a
# FutureWarning, which this notebook silences, and a dropped column would
# break the relabelling below.
my_numeric_imputer = SimpleImputer(
    strategy='constant', fill_value=-999, keep_empty_features=True
)
num_X = X[numerical_cols]

# fit_transform returns a bare array: restore the column names and the row
# index so the result stays aligned with X and y.
imputed_num_X = pd.DataFrame(
    my_numeric_imputer.fit_transform(num_X),
    columns=num_X.columns,
    index=num_X.index,
)
imputed_num_X.head()

Unnamed: 0,TXN_UID,AGE,BALANCE,BALANCE_A,BALANCE_AGE,BALANCE_C,CCAAMC060030A,CCAASC060001A,CCAATC060030A,CCAATC060030R,...,CHAIN_NO,CONS_HOME_PHONE,DAY_OF_WEEK,DELIVERY_CODE,ID_CODE,LOCAL_HOUR,MRCH_PRODUCT_CODE,SPECTRUM_ALERT_CODE4,SPECTRUM_ALERT_CODE5,ZIP3
0,1244170000000.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,1244170000000.0,37.0,-9999.98,-9999.98,-1.0,-9999.98,18528.0,0.0,1.0,5.4e-05,...,130000.0,9376032000.0,5.0,0.0,-1.0,7.0,-1.0,-999.0,-999.0,945.0
2,1244170000000.0,24.0,1158.64,1153.64,-1.0,1158.64,653.0,0.0,542.0,0.830015,...,160224.0,4057616000.0,5.0,0.0,-1.0,10.0,-1.0,-999.0,-999.0,731.0
3,1244170000000.0,36.0,35.89,106.02,-1.0,35.89,2787.0,0.0,2415.0,0.866523,...,168411.0,2818985000.0,5.0,0.0,-1.0,10.0,-1.0,-999.0,-999.0,775.0
4,1244170000000.0,27.0,-9999.98,-9999.98,-1.0,-9999.98,302.0,0.0,5.0,0.016556,...,139767.0,5417015000.0,5.0,0.0,-1.0,9.0,-1.0,-999.0,-999.0,978.0


In [24]:
# Impute the low-cardinality text columns.
# A column counts as categorical when it is stored as object and has fewer
# than 21 distinct non-null values. The dtype test runs first so nunique() is
# only computed for object columns.
categorical_cols = [
    cname for cname in X.columns
    if X[cname].dtype == "object" and X[cname].nunique() < 21
]

# Missing categories become the placeholder level 'Z'.
# keep_empty_features=True keeps entirely-missing columns (filled with 'Z')
# so the column count always matches categorical_cols.
my_categorical_imputer = SimpleImputer(
    strategy='constant', fill_value='Z', keep_empty_features=True
)
char_X = X[categorical_cols]

# fit_transform returns a bare array: restore the column names and the row
# index so the result stays aligned with X and y.
imputed_char_X = pd.DataFrame(
    my_categorical_imputer.fit_transform(char_X),
    columns=char_X.columns,
    index=char_X.index,
)

imputed_char_X.head(10)

Unnamed: 0,CCHXMT730001C,DL_SWIPE_INDICATOR,EMLAGE_BILL_RISK_COUNTRY,EMLAGE_CITY_MATCH,EMLAGE_DIS_DESCR,EMLAGE_DOMAIN_CNTRY_MATCH,EMLAGE_DOMAIN_CORP,EMLAGE_DOMAIN_COUNTRY,EMLAGE_DOMAIN_EXISTS,EMLAGE_DOMAIN_RISK_CNTRY,...,EMLAGE_STATUS,FM_SCAN_CODE,ID_STN_STATE_MATCH_IND,NYCE_AVAIL_FUNDS,NYCE_RESP_CODE,POSITIVE_GRADE,SPECTRUM_ALERT_CODE1,SPECTRUM_ALERT_CODE2,SPECTRUM_ALERT_CODE3,SPECTRUM_RSS2302_REASON_CODE5
0,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z,...,Z,Z,Z,Z,Z,Z,Z,Z,Z,Z
1,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,Z,Z,Z,Z
2,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,Z,Z,Z,Z
3,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Certified,M,NULL_ID,U,U,-1,Z,Z,Z,Z
4,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,300A,Z,Z,Z
5,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,Z,Z,Z,Z
6,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,Z,Z,Z,Z
7,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,300A,Z,Z,Z
8,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,Z,Z,Z,Z
9,Z,M,No,-1,High Confidence,Yes,No,US,Yes,No,...,Verified,M,NULL_ID,U,U,-1,Z,Z,Z,Z


In [25]:
# Put the two imputed frames back together side by side, numeric columns first.
# Columns that are neither int64/float64 nor object with fewer than 21 distinct
# values are in neither frame, so they are absent from imputed_X.
imputed_X = pd.concat(objs=[imputed_num_X, imputed_char_X], axis="columns")

In [33]:
# Run SMOTENC on the imputed features, telling it by name which columns are
# categorical. random_state is fixed so the synthetic rows are reproducible.
# The sampler is called smote_nc so that it does not shadow the `smote`
# module imported at the top of the notebook.
from imblearn.over_sampling import SMOTENC
smote_nc = SMOTENC(random_state=42, categorical_features=categorical_cols)
X_resampled, y_resampled = smote_nc.fit_resample(imputed_X, y)

In [34]:
# Rebuild both results as DataFrames: the features take the column layout of
# imputed_X and the labels become a single 'OUTCOME' column.
feature_names = imputed_X.columns
X_resampled = pd.DataFrame(data=X_resampled, columns=feature_names)
y_resampled = pd.DataFrame(data=y_resampled, columns=['OUTCOME'])

In [36]:
# Features and target side by side in one frame.
df_resampled = pd.concat(objs=[X_resampled, y_resampled], axis="columns")

In [37]:
# Class balance of the target after resampling.
df_resampled.loc[:, 'OUTCOME'].value_counts()

OUTCOME
NO ATTEMPTS    23863
GOOD           23863
LOSS           23863
DECLINED       23863
Name: count, dtype: int64

In [39]:
# Write the resampled data set to disk without the row index.
result_csv = "/workspaces/workspace/synthetic_test_result.csv"
df_resampled.to_csv(result_csv, index=False)

In [40]:
# Object columns with 21 or more distinct non-null values, i.e. the text
# columns that are left out of categorical_cols.
# The list has its own name so that categorical_cols, which the SMOTENC cell
# reads, is not overwritten: with the name reused, re-running that cell would
# pass column names that do not exist in imputed_X.
high_cardinality_cols = [
    cname for cname in X.columns
    if X[cname].dtype == "object" and X[cname].nunique() >= 21
]
high_cardinality_cols

['CCHXMC730001X',
 'EMLAGE_DOMAIN_CAT',
 'EMLAGE_DOMAIN_CO',
 'EMLAGE_DOMAIN_NAME',
 'SPECTRUM_RSS2302_REASON_CODE1',
 'SPECTRUM_RSS2302_REASON_CODE2',
 'SPECTRUM_RSS2302_REASON_CODE3',
 'SPECTRUM_RSS2302_REASON_CODE4',
 'STATION_STATE']