In [82]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time, timedelta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
#importing over- and undersampling algorithms from imblearn (you will have to install it manually in your environment with: pip install imbalanced-learn)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder


Read in Data

In [83]:
# Load the training and test CSV files into separate frames.
data, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/training.csv', './data/test.csv')
)
# Keep the test-set TransactionId column aside: the id column is dropped during
# preprocessing and re-attached to the processed test frame before it is saved.
test_id = data_test['TransactionId']

Add Variables

In [84]:
# Derive the new variables with one helper so that the training and the test
# frame are guaranteed to receive exactly the same transformation.
def add_transaction_features(frame):
    """Add ``Hour``, ``Day`` and ``DirectionOfMoney`` columns to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``TransactionStartTime`` column (strings formatted as
        ``%Y-%m-%dT%H:%M:%SZ``, or already-parsed datetimes) and an ``Amount``
        column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenient chaining.
    """
    # Convert time to datetime format
    frame['TransactionStartTime'] = pd.to_datetime(
        frame['TransactionStartTime'], format='%Y-%m-%dT%H:%M:%SZ'
    )
    # Hour of the day of the transaction
    frame['Hour'] = frame['TransactionStartTime'].dt.hour
    # Day of the week (pandas numbering: Monday=0 ... Sunday=6)
    frame['Day'] = frame['TransactionStartTime'].dt.dayofweek
    # 1.0 when Amount is negative, 0.0 otherwise. A single vectorised comparison
    # (instead of two masked .loc assignments) never leaves NaN in the column,
    # and the float dtype matches what the masked assignments produced.
    frame['DirectionOfMoney'] = (frame['Amount'] < 0).astype(float)
    return frame


for _frame in (data, data_test):
    add_transaction_features(_frame)

In [85]:
# Number of training rows; used later to split the concatenated train+test
# frame back into its two parts.
len_train = data.shape[0]

Make Dummies

In [86]:
# Categorical variables that are label-encoded and then one-hot encoded.
cat_var = ['ProviderId', 'ProductId', 'ChannelId', 'ProductCategory', 'PricingStrategy', 'Hour', 'Day']

columns = list(cat_var)
# Stack the training rows on top of the test rows so both parts share the same
# label encoding and end up with an identical set of dummy columns.
new_df = pd.concat([data, data_test], sort=False)

le = LabelEncoder()
# original category value -> integer code, per encoded column. Kept so the
# encoding can be inspected or reversed instead of being lost on each refit.
label_mappings = {}
for each in columns:
    new_df[each] = le.fit_transform(new_df[each])
    label_mappings[each] = dict(zip(le.classes_, range(len(le.classes_))))

# Remove the target, identifier columns and columns not used as features.
new_df = new_df.drop(
    ['FraudResult', 'Amount', 'TransactionId', 'BatchId', 'AccountId',
     'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode',
     'TransactionStartTime'],
    axis=1,
)
# Pin the dummy dtype: pandas >= 2.0 defaults to bool, which would turn the 0/1
# indicator columns shown in this notebook (and written to CSV) into True/False.
new_df = pd.get_dummies(new_df, columns=cat_var, dtype=np.uint8)


In [87]:

# Undo the earlier concatenation by position: the first `len_train` rows are
# the training part, everything after them is the test part.
df = new_df.iloc[:len_train]
test_part = new_df.iloc[len_train:]
data_test = test_part.reset_index(drop=True)

Mapping (original category value → label-encoded integer, listed in the order of `cat_var`)

{'ProviderId_1': 0, 'ProviderId_2': 1, 'ProviderId_3': 2, 'ProviderId_4': 3, 'ProviderId_5': 4, 'ProviderId_6': 5}

{'ProductId_1': 0, 'ProductId_10': 1, 'ProductId_11': 2, 'ProductId_12': 3, 'ProductId_13': 4, 'ProductId_14': 5, 'ProductId_15': 6, 'ProductId_16': 7, 'ProductId_17': 8, 'ProductId_18': 9, 'ProductId_19': 10, 'ProductId_2': 11, 'ProductId_20': 12, 'ProductId_21': 13, 'ProductId_22': 14, 'ProductId_23': 15, 'ProductId_24': 16, 'ProductId_25': 17, 'ProductId_26': 18, 'ProductId_27': 19, 'ProductId_3': 20, 'ProductId_4': 21, 'ProductId_5': 22, 'ProductId_6': 23, 'ProductId_7': 24, 'ProductId_8': 25, 'ProductId_9': 26}


{'ChannelId_1': 0, 'ChannelId_2': 1, 'ChannelId_3': 2, 'ChannelId_4': 3, 'ChannelId_5': 4}

{'airtime': 0, 'data_bundles': 1, 'financial_services': 2, 'movies': 3, 'other': 4, 'retail': 5, 'ticket': 6, 'transport': 7, 'tv': 8, 'utility_bill': 9}

{0: 0, 1: 1, 2: 2, 4: 3}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23}

In [88]:
# Inspect the column names of the training frame after dummy encoding.
df.columns

Index(['Value', 'DirectionOfMoney', 'ProviderId_0', 'ProviderId_1',
       'ProviderId_2', 'ProviderId_3', 'ProviderId_4', 'ProviderId_5',
       'ProductId_0', 'ProductId_1', 'ProductId_2', 'ProductId_3',
       'ProductId_4', 'ProductId_5', 'ProductId_6', 'ProductId_7',
       'ProductId_8', 'ProductId_9', 'ProductId_10', 'ProductId_11',
       'ProductId_12', 'ProductId_13', 'ProductId_14', 'ProductId_15',
       'ProductId_16', 'ProductId_17', 'ProductId_18', 'ProductId_19',
       'ProductId_20', 'ProductId_21', 'ProductId_22', 'ProductId_23',
       'ProductId_24', 'ProductId_25', 'ProductId_26', 'ChannelId_0',
       'ChannelId_1', 'ChannelId_2', 'ChannelId_3', 'ChannelId_4',
       'ProductCategory_0', 'ProductCategory_1', 'ProductCategory_2',
       'ProductCategory_3', 'ProductCategory_4', 'ProductCategory_5',
       'ProductCategory_6', 'ProductCategory_7', 'ProductCategory_8',
       'ProductCategory_9', 'PricingStrategy_0', 'PricingStrategy_1',
       'PricingStrategy_2', 

In [89]:
#defining X and y
X = df
y = data['FraudResult']


In [90]:
# Display the feature matrix (the notebook truncates the rendering of large frames).
X

Unnamed: 0,Value,DirectionOfMoney,ProviderId_0,ProviderId_1,ProviderId_2,ProviderId_3,ProviderId_4,ProviderId_5,ProductId_0,ProductId_1,...,Hour_21,Hour_22,Hour_23,Day_0,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6
0,1000,0.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
1,20,1.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,500,0.0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
3,21800,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,644,1.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,1000,1.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
95658,1000,0.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
95659,20,1.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
95660,3000,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [91]:
#defining X and y
X = df
y = data['FraudResult']
X.to_csv("data/X.csv")  
y.to_csv("data/y.csv")  
data_test['TransactionId'] = test_id
data_test.to_csv('data/X_test.csv')