In [1]:
import pandas as pd

import os, pickle

In [2]:
# Load the pickled dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle('Data/pickles/clean_data')

# Flag rows where the CVV that was entered equals the CVV stored for the card.
df['matchingCVV'] = df['cardCVV'] == df['enteredCVV']

# Encode the boolean flags as 0/1 integers with an explicit cast.
# `replace({False: 0, True: 1})` leans on pandas' implicit dtype downcasting,
# which recent pandas releases deprecate (it can leave the column as `object`).
for col in ['cardPresent', 'matchingCVV', 'expirationDateKeyInMatch', 'isFraud']:
    df[col] = df[col].astype('int64')

# Remove the columns that are not kept for modeling; the two raw CVV columns
# are dropped now that `matchingCVV` has been derived from them.
df = df.drop(columns=[
    'echoBuffer',
    'cardLast4Digits',
    'merchantName',
    'accountOpenDate',
    'transactionDateTime',
    'currentExpDate',
    'customerId',
    'dateOfLastAddressChange',
    'accountNumber',
    'enteredCVV',
    'cardCVV',
])

In [3]:
# Missing-value count for every column of the frame used for predictive modeling
df.isna().sum()

creditLimit                    0
availableMoney                 0
transactionAmount              0
acqCountry                  4562
merchantCountryCode          724
posEntryMode                4054
posConditionCode             409
merchantCategoryCode           0
transactionType              698
currentBalance                 0
cardPresent                    0
expirationDateKeyInMatch       0
isFraud                        0
matchingCVV                    0
dtype: int64

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.under_sampling import RandomUnderSampler

In [5]:
# Separate the target from the features. The guard keeps this cell re-runnable:
# once 'isFraud' has been removed from `df`, a second run reuses the existing `y`
# instead of raising a KeyError.
if 'isFraud' in df.columns:
    y = df['isFraud']
    df = df.drop(columns='isFraud')

# `ros` holds a RandomUnderSampler (the name is kept as-is in case other cells use it).
# A fixed seed makes the sampled subset reproducible across runs.
ros = RandomUnderSampler(random_state=42)
new_x, new_y = ros.fit_resample(df, y)
print(f'Before Random Under Sampling: {df.shape}')
print(f'After Random Under Sampling: {new_x.shape}')

Before Random Under Sampling: (786363, 13)
After Random Under Sampling: (24834, 13)


In [None]:
# Columns that are imputed and one-hot encoded; every other column passes through unchanged.
categorical_columns = [
    'merchantCountryCode',
    'merchantCategoryCode',
    'posConditionCode',
    'posEntryMode',
    'transactionType',
    'acqCountry',
]


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature frame.

    Categorical columns are imputed with their most frequent value and then
    one-hot encoded (unknown categories are ignored at transform time); the
    remaining columns are passed through after the encoded ones.
    """
    return ColumnTransformer([
        ('cat_pipe', Pipeline([
            ('cat_imputer', SimpleImputer(strategy='most_frequent')),
            ('one_hot', OneHotEncoder(handle_unknown='ignore')),
        ]), categorical_columns),
    ], remainder='passthrough')


def to_dense(matrix):
    """Return `matrix` as a dense array, converting only when it is sparse.

    ColumnTransformer may return either a sparse matrix or a dense array
    depending on output density, so `.todense()` cannot be called blindly.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Seeded, stratified split so the saved train/test sets are reproducible and
# keep the class balance produced by the under-sampling step.
x_train, x_test, y_train, y_test = train_test_split(
    new_x, new_y, random_state=42, stratify=new_y)

# Fit the encoder on the training split only and reuse it for the test split.
pipeline = make_preprocessor()
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)

# The full (not under-sampled) data gets its own transformer, so `pipeline`
# stays consistent with the x_train / x_test matrices saved below.
clustering_pipeline = make_preprocessor()
clustered_preproc_data = to_dense(clustering_pipeline.fit_transform(df))

# `get_feature_names` was removed from newer scikit-learn releases; prefer
# `get_feature_names_out` and fall back for older installations.
one_hot = clustering_pipeline.named_transformers_['cat_pipe']['one_hot']
if hasattr(one_hot, 'get_feature_names_out'):
    one_hot_columns = list(one_hot.get_feature_names_out())
else:
    one_hot_columns = list(one_hot.get_feature_names())

# Encoded columns come first, followed by the passthrough columns in their original order.
remaining_columns = one_hot_columns + [
    col for col in df.columns if col not in categorical_columns
]

clustering_data = pd.DataFrame(clustered_preproc_data, columns=remaining_columns)
# Assign by position: the new frame has a fresh 0..n-1 index, so assigning the
# Series directly would align on `y`'s index and could misplace or drop labels.
clustering_data['isFraud'] = y.to_numpy()

clustering_data.to_pickle('Data/pickles/clustering_data', protocol=4)

print(x_train.shape)

# Persist the encoded train/test split.
data = {'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}
with open('Data/pickles/preproc_data', 'wb') as file:
    pickle.dump(data, file, protocol=4)

* The number of columns grew because of one-hot encoding, but the increase is not significant, since most categorical features have only a few unique values.