# Exploring the Dataset

In [1]:
import pandas as pd
import seaborn as sns
import plotly.graph_objs as go

import os

In [2]:
df_paths = ['Data/pickles/full_data/{}'.format(file) for file in os.listdir('Data/pickles/full_data')]

sorted(df_paths)

['Data/pickles/full_data/df_1',
 'Data/pickles/full_data/df_2',
 'Data/pickles/full_data/df_3',
 'Data/pickles/full_data/df_4']

In [3]:
df = pd.concat([
    pd.read_pickle(df_paths[0]),
    pd.read_pickle(df_paths[1]),
    pd.read_pickle(df_paths[2]),
    pd.read_pickle(df_paths[3])],
axis=0)

df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
589770,882815134,882815134,50000.0,18749.67,31250.3,2016-10-14T08:21:06,118.38,Rodeway Inn #477327,US,US,...,6580,PURCHASE,,,,,,,False,False
589771,882815134,882815134,50000.0,18631.29,31368.7,2016-10-14T09:06:13,191.61,Hilton Hotels #123236,US,US,...,6580,PURCHASE,,,,,,,False,False
589772,882815134,882815134,50000.0,18439.68,31560.3,2016-10-14T11:32:41,98.24,Rodeway Inn #436394,US,US,...,8502,PURCHASE,,,,,,,False,False
589773,882815134,882815134,50000.0,18341.44,31658.6,2016-10-14T13:13:21,92.93,Renaissance Hotel #809953,US,US,...,6580,PURCHASE,,,,,,,False,False
589774,882815134,882815134,50000.0,18248.51,31751.5,2016-10-14T14:31:24,126.89,Rodeway Inn #00167,US,US,...,8502,PURCHASE,,,,,,,False,False


In [4]:
"Dataset Dimensions: {}x{}".format(df.shape[0], df.shape[1])

'Dataset Dimensions: 786363x29'

## Checking for nans

In [5]:
df.isnull().sum()

accountNumber                    0
customerId                       0
creditLimit                      0
availableMoney                   0
transactionDateTime              0
transactionAmount                0
merchantName                     0
acqCountry                       0
merchantCountryCode              0
posEntryMode                     0
posConditionCode                 0
merchantCategoryCode             0
currentExpDate                   0
accountOpenDate                  0
dateOfLastAddressChange          0
cardCVV                          0
enteredCVV                       1
cardLast4Digits                 64
transactionType               1705
echoBuffer                   43174
currentBalance              404969
merchantCity                767096
merchantState               786199
merchantZip                 786024
cardPresent                 362160
posOnPremises               777410
recurringAuthInd            785679
expirationDateKeyInMatch     14117
isFraud             

One can only hope for clean data...

Considering our shape dimensions, I can pretty muh remove merchantCity, merchantState, merchantZip, posOnPremises, and recurringAuthInd right off the bat since they are too many missing values to do anything else while remaining accurate.
I will also remove any ID related columns because they have no use.

I want to keep cardCVV and enteredCVV just in case at the moment as I'm curious to see how many of them differ from each other.

In [6]:
df.drop(['merchantCity', 'merchantState', 'merchantZip', 'posOnPremises', 'recurringAuthInd', 'accountNumber', 'customerId'], axis=1, inplace=True)


df.isnull().sum()


creditLimit                      0
availableMoney                   0
transactionDateTime              0
transactionAmount                0
merchantName                     0
acqCountry                       0
merchantCountryCode              0
posEntryMode                     0
posConditionCode                 0
merchantCategoryCode             0
currentExpDate                   0
accountOpenDate                  0
dateOfLastAddressChange          0
cardCVV                          0
enteredCVV                       1
cardLast4Digits                 64
transactionType               1705
echoBuffer                   43174
currentBalance              404969
cardPresent                 362160
expirationDateKeyInMatch     14117
isFraud                         27
dtype: int64

In [7]:
# Since I'll be constantly modifying data as a explore the values,
# I'll keep a separate pickle used to represent the current state of the data

df_half = df[df.columns[:df.shape[1] // 2]]
df_other_half = df[df.columns[df.shape[1] // 2:]]

# Splitting it up so I can use the pickles and commit to git

pd.to_pickle(df_half, 'Data/pickles/clean_data/clean_data_half_1')
pd.to_pickle(df_other_half, 'Data/pickles/clean_data/clean_data_half_2')