In [57]:
import pandas as pd

import os, pickle

In [58]:
# Collect the paths of every file in the cleaned-data directory.
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# listing is sorted to keep df_paths[0] / df_paths[1] stable across machines
# and kernel restarts.
clean_data_dir = 'Data/pickles/clean_data'
df_paths = [
    os.path.join(clean_data_dir, name)
    for name in sorted(os.listdir(clean_data_dir))
]

In [59]:
# Load the first two cleaned pickles and join them side by side (column-wise).
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# this assumes the pickles were produced locally by a trusted step — confirm.
df = pd.concat(
    [pd.read_pickle(df_paths[0]), pd.read_pickle(df_paths[1])],
    axis=1,
)

# True when the CVV entered at checkout differs from the CVV on the card.
# The comparison must be `!=`: with `==` the flag was True for *matching* CVVs,
# the opposite of what the column name says.
df['mismatchingCVV'] = df['cardCVV'] != df['enteredCVV']

# Remove the columns that are not kept for modeling, including the two raw CVV
# columns now summarized by `mismatchingCVV`.
columns_to_drop = [
    'echoBuffer',
    'cardLast4Digits',
    'accountOpenDate',
    'transactionDateTime',
    'currentExpDate',
    'customerId',
    'dateOfLastAddressChange',
    'accountNumber',
    'enteredCVV',
    'cardCVV',
]
df = df.drop(columns=columns_to_drop)

In [60]:
# Columns retained for predictive modeling, with the number of missing
# values in each one
df.isna().sum()

expirationDateKeyInMatch       0
isFraud                        0
merchantCategoryCode           0
merchantCountryCode          724
merchantName                   0
posConditionCode             409
posEntryMode                4054
transactionAmount              0
transactionType              698
acqCountry                  4562
availableMoney                 0
cardPresent                    0
creditLimit                    0
currentBalance                 0
mismatchingCVV                 0
dtype: int64

In [61]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.under_sampling import RandomUnderSampler

In [62]:
# Encode the fraud label as integers (True -> 1, False -> 0).
# The result is assigned back to the column rather than calling an in-place
# method on the `df['isFraud']` selection: in-place calls on a selected column
# are chained assignment, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
df['isFraud'] = df['isFraud'].astype(int)

In [63]:
# Balance the classes by randomly under-sampling.
# A fixed random_state makes the sampled subset identical on every run.
ros = RandomUnderSampler(random_state=42)

# Split the label off from the features. pop() removes the column from `df`
# in place, so `df` holds only features afterwards. The guard lets this cell
# be re-run without a KeyError once the label has already been removed
# (`y` then keeps its value from the earlier run).
if 'isFraud' in df.columns:
    y = df.pop('isFraud')

new_x, new_y = ros.fit_resample(df, y)
print(f'Before Random Under Sampling: {df.shape}')
print(f'After Random Under Sampling: {new_x.shape}')

Before Random Under Sampling: (786363, 14)
After Random Under Sampling: (24834, 14)


In [64]:
# Hold out a test set from the under-sampled data. The fixed random_state keeps
# the split (and therefore the saved arrays) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(new_x, new_y, random_state=42)

# Columns treated as categorical: missing values are filled with the most
# frequent value, then the column is one-hot encoded. Categories that appear
# only at transform time are ignored (encoded as all zeros) instead of raising.
categorical_features = [
    'merchantCountryCode',
    'posConditionCode',
    'posEntryMode',
    'transactionType',
    'acqCountry',
    'cardPresent',
    'mismatchingCVV',
    'expirationDateKeyInMatch',
]

# Every column not listed above is passed through unchanged.
pipeline = ColumnTransformer([
    ('cat_pipe', Pipeline([
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_features),
], remainder='passthrough')

# Fit on the training split only, then apply the fitted transformer to the
# test split, so no statistics from the test rows leak into preprocessing.
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)

print(x_train.shape)

# Persist the preprocessed splits for the modeling step.
data = {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}
output_path = 'Data/pickles/preprocessed_data/preproc_data'
# Create the target directory if it is missing so the dump cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as file:
    pickle.dump(data, file, protocol=4)

(18625, 31)


* The number of columns grew from 14 to 31 because of one-hot encoding, but this is not a significant increase, since most of the encoded features have only a few unique values.