In [78]:
import pandas as pd
import numpy as np
import random as rnd

import time

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

from sklearn import model_selection

In [79]:
train_df = pd.read_json('../input/transactions_data.txt',lines=True)
train_df.head()

Unnamed: 0,cardCVV,cardCompany,cardDisplayNumber,cardExpDate,cardName,cardNetwork,cardNumber,cardType,customer,demo,...,products,storePOS,storeWeb,transAmount,transChannel,transCurrency,transDate,transDescription,transId,webDetails
0,992,Runolfsson-Runolfsson,5262119058090111,06/20,Sid Metz,visa,8972768699995084,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],,,2019.212078,MOBILE,pound,1519108951539,Aut velit est quis e,8eef7316-0bfc-45cc-9d30-1cb943082e71,
1,527,Fahey LLC,9647953517911559,06/21,Mertie Considine,visa,1502854323418887,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],,,0.287541,MOBILE,yen,1519108951690,Qui voluptatem quod,42971644-1b00-4f53-a8e9-9d76662c37cb,
2,430,Hamill-Hamill,4444700311329356,05/22,Chasity Kozey,americanexpress,1315266292682552,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],,,2569.155177,MOBILE,pound,1519108951792,Qui aliquam vel ut.,0f92b181-76b2-4d55-a968-cb4e1bd88925,
3,385,Feil-Feil,9569169426519242,02/19,Montana Aufderhar Jr.,visa,5393441870846185,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,1102.480583,POS,dollar,1519108951894,Et eveniet facilis v,969550ac-dd81-4374-a5c7-69a51811678f,
4,542,Bechtelar-Bechtelar,3248504556664997,07/18,Jovan Murazik Jr.,americanexpress,293874280627531,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,3589.21948,POS,rupee,1519108952027,Similique iure archi,c1472cda-858e-4d39-8fe2-f0e4ba061581,


In [80]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74988 entries, 0 to 74987
Data columns (total 30 columns):
cardCVV                74988 non-null int64
cardCompany            74988 non-null object
cardDisplayNumber      74988 non-null int64
cardExpDate            74988 non-null object
cardName               74988 non-null object
cardNetwork            74988 non-null object
cardNumber             74988 non-null int64
cardType               74988 non-null object
customer               74988 non-null object
demo                   74988 non-null bool
deviceIP               50168 non-null object
fraud                  74988 non-null bool
geoIp                  74988 non-null object
isAccountFlagged       74988 non-null bool
mobileDetails          25083 non-null object
paymentCard            74988 non-null object
posDetails             24820 non-null object
processingEndTime      74988 non-null int64
processingStartTime    74988 non-null int64
productIdList          74988 non-null object
pr

In [81]:
train_df.iloc[0,14]#mobile details

{'appName': 'Redhold',
 'deviceIP': '072.169.009.234',
 'deviceId': 23,
 'deviceName': 'Gavin',
 'deviceNetwork': 'airtel',
 'deviceOS': 'ios',
 'userLat': -36.919486,
 'userLong': 175.2689}

In [82]:
train_df.iloc[3,16]#posDetails

{'POSCode': 2832683732227822980,
 'geoCity': 'FortMill',
 'geoCountry': 'US',
 'geoPoBox': 29707,
 'geoPostalCode': 29707,
 'geoPostalCodeType': 'Supermarket',
 'geoState': 'US-SC',
 'geoStreet1': 'Ewald Falls',
 'geoStreet2': '753',
 'lat': 35.00737,
 'lon': -80.945076}

In [83]:
train_df.iloc[5,29]#webDetails

{'browser': 'ub',
 'deviceIP': '006.079.221.173',
 'deviceMacAddr': '00:b3:92:e2:f7:8d',
 'deviceOS': 'MacOS',
 'hostURL': 'www.judson-pagac.net',
 'network': None}

In [84]:
import re

def companyNames(name):
    p1 = re.compile('[a-zA-Z]+-[a-zA-Z]+')
    p2 = re.compile('[a-zA-Z]+, [a-zA-Z]+ and [a-zA-Z]+')
    p3 = re.compile('[a-zA-Z]+ Inc')
    if p1.match(name):
        return name.split('-')[0]    
    elif p2.match(name):
        return name.split(',')[0]
    elif p3.match(name):
        return name.split(' ')[0]
    else:
        return name
    
def fraudNames(name):
    if name==True:
        return 1
    else:
        return 0

def getCardExpYear(date):
    return date.split('/')[1]

from googletrans import Translator
translator = Translator()

def transDescriptionNames(name):
    try:
        return translator.translate(name, dest='en').text
    except ValueError:
        return "NA"

In [85]:
train_df['cardCompany'] = train_df['cardCompany'].apply(companyNames)
train_df['fraud'] = train_df['fraud'].apply(fraudNames)
train_df['cardExpYear'] = train_df['cardExpDate'].apply(getCardExpYear)

In [86]:
train_df.loc[train_df.query('fraud == 0').sample(frac=.1).index,'fraud'] = 1
train_df.query('fraud == 1')

Unnamed: 0,cardCVV,cardCompany,cardDisplayNumber,cardExpDate,cardName,cardNetwork,cardNumber,cardType,customer,demo,...,storePOS,storeWeb,transAmount,transChannel,transCurrency,transDate,transDescription,transId,webDetails,cardExpYear
1,527,Fahey LLC,9647953517911559,06/21,Mertie Considine,visa,1502854323418887,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,0.287541,MOBILE,yen,1519108951690,Qui voluptatem quod,42971644-1b00-4f53-a8e9-9d76662c37cb,,21
17,411,McGlynn,681227962284246,09/22,Mariela Bashirian,mastercard,9034392871924690,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,"{'storeIP': None, 'storeId': None, 'storeRiskS...",1452.285210,WEB,dollar,1519108953479,Ullam est aperiam vo,80c15450-9fba-41d7-883c-569141921caf,"{'deviceMacAddr': 'b6:47:bc:ea:c2:75', 'device...",22
26,552,Harber,4141047807362103,01/22,Maribel Aufderhar MD,mastercard,13426068806272,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,2762.670152,MOBILE,pound,1519108954408,Voluptas omnis paria,59acb504-91ec-4264-96f2-6591b14ff1cc,,22
34,209,Emmerich,8876965250324956,11/19,Sidney Ryan,mastercard,7313720273520988,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,"{'storeIP': None, 'storeId': None, 'storeRiskS...",0.506684,WEB,yen,1519108955225,Doloribus at fugit.,7c9d1b02-61d6-4823-9cc5-042eb8947da1,"{'deviceMacAddr': '99:89:a6:cf:9b:e8', 'device...",19
38,836,Gutkowski Group,9106732771650231,01/20,Chasity Conroy,americanexpress,6080144594875828,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,1379.243494,POS,pound,1519108955632,Nam repellat nulla.,7132b961-8acd-4bb2-a977-f990eb2c4aac,,20
45,170,Medhurst,3937972729531636,10/22,Miss Mable Bogisich,americanexpress,4096810085046393,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,"{'storeIP': None, 'storeId': None, 'storeRiskS...",0.386575,WEB,pound,1519108956345,Est non nulla odit.,8801a50b-029f-4708-a21a-ae806a19f1fb,"{'deviceMacAddr': '6e:4c:77:14:15:eb', 'device...",22
46,915,Wuckert and Sons,331282950210579,10/21,Jaquelin Schoen II,visa,4942360745599322,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,3950.522048,POS,dollar,1519108956446,Sunt cupiditate non,608e99d5-ca9a-451e-9c90-1c2b4c7273f0,,21
51,580,Kovacek,6120109767312604,02/23,Arely Rohan,visa,3256273502227565,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,1544.697307,POS,rupee,1519108956956,Occaecati ut et veli,56def23e-b349-4200-a9c2-b328a4b90958,,23
64,246,Parisian,1217088711396281,06/20,Alayna Mertz,visa,9360057456910297,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,2483.282959,MOBILE,rupee,1519108958273,Iste quia tenetur ut,87fc0588-fb44-4fc7-9ed8-2c4860865d7d,,20
67,350,Romaguera,7527134772847768,01/23,Isaac Vandervort Sr.,visa,4415401985104041,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,2678.659173,MOBILE,yen,1519108958578,Dolorem velit dolore,11ec5990-a453-48d6-8f1b-98d00508b29d,,23


In [87]:
train_df.columns

Index(['cardCVV', 'cardCompany', 'cardDisplayNumber', 'cardExpDate',
       'cardName', 'cardNetwork', 'cardNumber', 'cardType', 'customer', 'demo',
       'deviceIP', 'fraud', 'geoIp', 'isAccountFlagged', 'mobileDetails',
       'paymentCard', 'posDetails', 'processingEndTime', 'processingStartTime',
       'productIdList', 'products', 'storePOS', 'storeWeb', 'transAmount',
       'transChannel', 'transCurrency', 'transDate', 'transDescription',
       'transId', 'webDetails', 'cardExpYear'],
      dtype='object')

In [88]:
train_df = train_df.drop(['cardCVV','paymentCard','transDate','isAccountFlagged','cardDisplayNumber','cardExpDate','cardName','cardNumber','customer','demo','deviceIP','geoIp','mobileDetails','posDetails','processingEndTime','processingStartTime','productIdList','products','storePOS','storeWeb','transDescription','transId','webDetails'], axis=1)
train_df.head(10)

Unnamed: 0,cardCompany,cardNetwork,cardType,fraud,transAmount,transChannel,transCurrency,cardExpYear
0,Runolfsson,visa,credit,0,2019.212078,MOBILE,pound,20
1,Fahey LLC,visa,prepaid,1,0.287541,MOBILE,yen,21
2,Hamill,americanexpress,credit,0,2569.155177,MOBILE,pound,22
3,Feil,visa,credit,0,1102.480583,POS,dollar,19
4,Bechtelar,americanexpress,debit,0,3589.21948,POS,rupee,18
5,Parker,visa,credit,0,3360.068744,WEB,yen,21
6,Larson,visa,debit,0,1017.393342,POS,rupee,22
7,Ritchie,visa,debit,0,3822.47509,MOBILE,rupee,20
8,Zemlak Group,americanexpress,prepaid,0,0.499758,POS,rupee,22
9,Funk and Sons,americanexpress,prepaid,0,1000.983918,POS,pound,18


In [89]:
train_df['amountBand'] = pd.qcut(train_df['transAmount'], 4)
train_df[['amountBand', 'fraud']].groupby(['amountBand'], as_index=False).mean().sort_values(by='fraud', ascending=True)

Unnamed: 0,amountBand,fraud
0,"[1.08e-05, 0.796]",0.099323
2,"(1812.738, 2925.726]",0.099483
3,"(2925.726, 4994.472]",0.100123
1,"(0.796, 1812.738]",0.101083


In [90]:
train_df.loc[ train_df['transAmount'] <= 0.796, 'transAmount'] = 0
train_df.loc[(train_df['transAmount'] > 0.796) & (train_df['transAmount'] <= 1812.738), 'transAmount'] = 1
train_df.loc[(train_df['transAmount'] > 1812.738) & (train_df['transAmount'] <= 2925.726), 'transAmount']   = 2
train_df.loc[ train_df['transAmount'] > 2925.726, 'transAmount'] = 3
train_df['transAmount'] = train_df['transAmount'].astype(int)
train_df = train_df.drop(['amountBand'], axis=1)
train_df.head(5)

Unnamed: 0,cardCompany,cardNetwork,cardType,fraud,transAmount,transChannel,transCurrency,cardExpYear
0,Runolfsson,visa,credit,0,2,MOBILE,pound,20
1,Fahey LLC,visa,prepaid,1,0,MOBILE,yen,21
2,Hamill,americanexpress,credit,0,2,MOBILE,pound,22
3,Feil,visa,credit,0,1,POS,dollar,19
4,Bechtelar,americanexpress,debit,0,3,POS,rupee,18


In [91]:
from sklearn.preprocessing import LabelEncoder
train_df = train_df.apply(LabelEncoder().fit_transform)
train_df.head(5)

Unnamed: 0,cardCompany,cardNetwork,cardType,fraud,transAmount,transChannel,transCurrency,cardExpYear
0,727,2,0,0,2,0,1,2
1,210,2,2,1,0,0,3,3
2,290,0,0,0,2,0,1,4
3,217,2,0,0,1,1,0,1
4,51,0,1,0,3,1,2,0


In [92]:
x_full = train_df.drop("fraud", axis=1)
y_full = train_df["fraud"]

x_train, x_test, y_train, y_test = model_selection.train_test_split(x_full, y_full, random_state = 0)

gbrt = tree.DecisionTreeClassifier(max_depth=7)
gbrt.fit(x_train,y_train)   
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
predictions = gbrt.predict(x_test)
print ("Train Accuracy :: ", accuracy_score(y_train, gbrt.predict(x_train)))
print ("Test Accuracy  :: ", accuracy_score(y_test, predictions))
print ("Confusion matrix ", confusion_matrix(y_test, predictions))

Train Accuracy ::  0.8999662168169129
Test Accuracy  ::  0.9003040486477837
Confusion matrix  [[16877     4]
 [ 1865     1]]


In [94]:
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16877,4,16881
1,1865,1,1866
All,18742,5,18747
