In [1]:
import pandas as pd
import numpy as np
import random as rnd

import time

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

from sklearn import model_selection

In [2]:
# Load the raw transactions; lines=True makes pandas parse one JSON record per line.
# The path is relative to the notebook's working directory.
train_df = pd.read_json('../input/transactions_data.txt',lines=True)
# Preview the first rows to check that the file parsed into the expected columns.
train_df.head()

Unnamed: 0,cardCVV,cardCompany,cardDisplayNumber,cardExpDate,cardName,cardNetwork,cardNumber,cardType,customer,demo,...,products,storePOS,storeWeb,transAmount,transChannel,transCurrency,transDate,transDescription,transId,webDetails
0,992,Runolfsson-Runolfsson,5262119058090111,06/20,Sid Metz,visa,8972768699995084,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],,,2019.212078,MOBILE,pound,1519108951539,Aut velit est quis e,8eef7316-0bfc-45cc-9d30-1cb943082e71,
1,527,Fahey LLC,9647953517911559,06/21,Mertie Considine,visa,1502854323418887,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],,,0.287541,MOBILE,yen,1519108951690,Qui voluptatem quod,42971644-1b00-4f53-a8e9-9d76662c37cb,
2,430,Hamill-Hamill,4444700311329356,05/22,Chasity Kozey,americanexpress,1315266292682552,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],,,2569.155177,MOBILE,pound,1519108951792,Qui aliquam vel ut.,0f92b181-76b2-4d55-a968-cb4e1bd88925,
3,385,Feil-Feil,9569169426519242,02/19,Montana Aufderhar Jr.,visa,5393441870846185,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,1102.480583,POS,dollar,1519108951894,Et eveniet facilis v,969550ac-dd81-4374-a5c7-69a51811678f,
4,542,Bechtelar-Bechtelar,3248504556664997,07/18,Jovan Murazik Jr.,americanexpress,293874280627531,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,[],"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,3589.21948,POS,rupee,1519108952027,Similique iure archi,c1472cda-858e-4d39-8fe2-f0e4ba061581,


In [3]:
# Print per-column dtypes and non-null counts to see which columns are only partly populated.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74988 entries, 0 to 74987
Data columns (total 30 columns):
cardCVV                74988 non-null int64
cardCompany            74988 non-null object
cardDisplayNumber      74988 non-null int64
cardExpDate            74988 non-null object
cardName               74988 non-null object
cardNetwork            74988 non-null object
cardNumber             74988 non-null int64
cardType               74988 non-null object
customer               74988 non-null object
demo                   74988 non-null bool
deviceIP               50168 non-null object
fraud                  74988 non-null bool
geoIp                  74988 non-null object
isAccountFlagged       74988 non-null bool
mobileDetails          25083 non-null object
paymentCard            74988 non-null object
posDetails             24820 non-null object
processingEndTime      74988 non-null int64
processingStartTime    74988 non-null int64
productIdList          74988 non-null object
pr

In [4]:
# Inspect the nested mobile payload of the first row. The column is selected by
# name rather than by position, so the lookup keeps working if the column order
# of the parsed JSON changes.
train_df['mobileDetails'].iloc[0]

{'appName': 'Redhold',
 'deviceIP': '072.169.009.234',
 'deviceId': 23,
 'deviceName': 'Gavin',
 'deviceNetwork': 'airtel',
 'deviceOS': 'ios',
 'userLat': -36.919486,
 'userLong': 175.2689}

In [5]:
# Inspect the nested point-of-sale payload of the fourth row. The column is
# selected by name rather than by position, so the lookup keeps working if the
# column order of the parsed JSON changes.
train_df['posDetails'].iloc[3]

{'POSCode': 2832683732227822980,
 'geoCity': 'FortMill',
 'geoCountry': 'US',
 'geoPoBox': 29707,
 'geoPostalCode': 29707,
 'geoPostalCodeType': 'Supermarket',
 'geoState': 'US-SC',
 'geoStreet1': 'Ewald Falls',
 'geoStreet2': '753',
 'lat': 35.00737,
 'lon': -80.945076}

In [6]:
# Inspect the nested web payload of the sixth row. The column is selected by
# name rather than by position, so the lookup keeps working if the column order
# of the parsed JSON changes.
train_df['webDetails'].iloc[5]

{'browser': 'ub',
 'deviceIP': '006.079.221.173',
 'deviceMacAddr': '00:b3:92:e2:f7:8d',
 'deviceOS': 'MacOS',
 'hostURL': 'www.judson-pagac.net',
 'network': None}

In [7]:
import re

# Ordered (pattern, separator) rules used by companyNames. Each pattern is tested
# against the START of the name (re.match semantics); the first rule that matches
# wins and the name is cut at the first occurrence of its separator.
_COMPANY_NAME_RULES = (
    (re.compile('[a-zA-Z]+-[a-zA-Z]+'), '-'),                      # "Hamill-Hamill"
    (re.compile('[a-zA-Z]+, [a-zA-Z]+ and [a-zA-Z]+'), ','),       # "Kuhn, Kuhn and Kuhn"
    (re.compile('[a-zA-Z]+ (?:Inc|Group|LLC|and Sons)'), ' '),     # "Fahey LLC", "Ward and Sons"
)


def companyNames(name):
    """Reduce a company name to its leading word.

    Recognised shapes are ``A-B``, ``A, B and C``, ``A Inc``, ``A Group``,
    ``A LLC`` and ``A and Sons``; for each of them ``A`` is returned.

    Args:
        name: The raw company name.

    Returns:
        The leading word for a recognised shape, otherwise ``name`` unchanged.
        Non-string values (for example ``None`` or ``NaN`` from a missing cell)
        are returned as-is instead of raising ``TypeError``, so the function is
        safe to use with ``Series.apply`` on columns that contain gaps.
    """
    if not isinstance(name, str):
        return name
    for pattern, separator in _COMPANY_NAME_RULES:
        if pattern.match(name):
            return name.split(separator)[0]
    return name
    
def fraudNames(name):
    """Convert a fraud flag to an integer label.

    Returns 1 when ``name`` compares equal to ``True`` and 0 for anything else.
    """
    # Equality (not identity) is deliberate: values such as 1 also count as True.
    return 1 if name == True else 0

def getCardExpYear(date):
    """Return the second '/'-separated field of ``date`` (the year part of 'MM/YY').

    The value is returned as a string. An ``IndexError`` is raised when
    ``date`` contains no '/'.
    """
    fields = date.split('/')
    return fields[1]

In [8]:
# Derive the cleaned columns from the raw ones.
# Each entry is (column to write, column to read, converter applied per value).
for target_col, source_col, converter in (
    ('cardCompany', 'cardCompany', companyNames),
    ('fraud', 'fraud', fraudNames),
    ('cardExpYear', 'cardExpDate', getCardExpYear),
):
    train_df[target_col] = train_df[source_col].apply(converter)

In [9]:
# NOTE(review): this relabels a random 10% of the non-fraud rows as fraud, so the
# labels written here are unrelated to any feature — confirm this is intentional
# before interpreting the model results further down.
# random_state pins the sample so that Restart & Run All reproduces the same labels.
# The cell is still not idempotent: running it again relabels a further 10% of
# the rows that remain non-fraud.
relabelled_index = train_df.query('fraud == 0').sample(frac=.1, random_state=0).index
train_df.loc[relabelled_index, 'fraud'] = 1
# Show only a preview of the fraud rows instead of the whole subset.
train_df.query('fraud == 1').head(10)

Unnamed: 0,cardCVV,cardCompany,cardDisplayNumber,cardExpDate,cardName,cardNetwork,cardNumber,cardType,customer,demo,...,storePOS,storeWeb,transAmount,transChannel,transCurrency,transDate,transDescription,transId,webDetails,cardExpYear
1,527,Fahey,9647953517911559,06/21,Mertie Considine,visa,1502854323418887,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,0.287541,MOBILE,yen,1519108951690,Qui voluptatem quod,42971644-1b00-4f53-a8e9-9d76662c37cb,,21
10,915,Lockman,1753892011274536,02/19,Ray Conn II,americanexpress,1843684557020911,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,0.699533,MOBILE,dollar,1519108952762,Quis vel unde. Delen,50b88265-d1e0-489b-b588-5d293020f6c1,,19
16,216,Ward,4962152404796528,12/21,Emanuel Mann,visa,1215602316509785,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,"{'storeIP': None, 'storeId': None, 'storeRiskS...",0.667463,WEB,rupee,1519108953375,Ratione et nihil aut,629c8a70-9c4d-4ab1-9f0f-9ba19bb6768c,"{'deviceMacAddr': '10:03:b3:9c:9e:ff', 'device...",21
21,430,Hamill,4444700311329356,05/22,Chasity Kozey,americanexpress,1315266292682552,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,2229.321605,POS,yen,1519108953895,Sed provident aut eu,05363e19-0421-42ab-95c3-8710e8853278,,22
22,307,Deckow,1047248981367687,07/21,Alexie Haley,mastercard,5912676715959929,prepaid,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,0.715935,POS,pound,1519108953998,Laborum voluptatem e,35200c5d-ec16-434b-ad61-b3175f1ba0fd,,21
32,306,McGlynn,4335229471235399,12/20,Ethel Volkman,mastercard,9077091661911000,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,3598.445526,MOBILE,rupee,1519108955022,Iste aut dolorem. Ma,66b1dfaf-2b75-4cf5-98e6-4a7d4ba0e19c,,20
43,980,Gottlieb,373164810120218,07/19,Lafayette Conn,mastercard,5211002170882098,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,1347.842547,MOBILE,dollar,1519108956141,Excepturi commodi sa,c8e93a67-9f92-47fb-8ead-58dbab2aa42a,,19
62,925,Mante,7603251372124743,02/19,Maxwell Morar,mastercard,24844727470759,credit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,,,3567.089584,MOBILE,pound,1519108958070,Reprehenderit aspern,6df88eaa-65ae-43cc-8166-08dbf9ed2647,,19
65,378,Koepp,8522949315174117,09/18,Brooks Runolfsdottir,visa,9521647383503233,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,3875.204355,POS,rupee,1519108958375,Libero sunt beatae p,0816bd39-aa2e-41fc-9365-0789a32350e9,,18
68,190,Torp,433301608512584,04/18,Miller Hettinger,mastercard,4353734315413108,debit,"{'cardNumber': 0, 'customerType': None, 'userI...",False,...,"{'POSCode': 0, 'storeName': None, 'storeRiskSc...",,2628.193892,POS,pound,1519108958680,Ut consequatur cumqu,025a9db6-cddb-4c2e-81f7-1b122cf1dec3,,18


In [10]:
# List the current column names before choosing which ones to drop.
train_df.columns

Index(['cardCVV', 'cardCompany', 'cardDisplayNumber', 'cardExpDate',
       'cardName', 'cardNetwork', 'cardNumber', 'cardType', 'customer', 'demo',
       'deviceIP', 'fraud', 'geoIp', 'isAccountFlagged', 'mobileDetails',
       'paymentCard', 'posDetails', 'processingEndTime', 'processingStartTime',
       'productIdList', 'products', 'storePOS', 'storeWeb', 'transAmount',
       'transChannel', 'transCurrency', 'transDate', 'transDescription',
       'transId', 'webDetails', 'cardExpYear'],
      dtype='object')

In [11]:
# Columns removed before modelling; every column not listed here is kept.
dropped_columns = [
    'cardCVV', 'paymentCard', 'transDate', 'isAccountFlagged',
    'cardDisplayNumber', 'cardExpDate', 'cardName', 'cardNumber',
    'customer', 'demo', 'deviceIP', 'geoIp',
    'mobileDetails', 'posDetails', 'processingEndTime', 'processingStartTime',
    'productIdList', 'products', 'storePOS', 'storeWeb',
    'transDescription', 'transId', 'webDetails',
]
train_df = train_df.drop(dropped_columns, axis=1)
train_df.head(10)

Unnamed: 0,cardCompany,cardNetwork,cardType,fraud,transAmount,transChannel,transCurrency,cardExpYear
0,Runolfsson,visa,credit,0,2019.212078,MOBILE,pound,20
1,Fahey,visa,prepaid,1,0.287541,MOBILE,yen,21
2,Hamill,americanexpress,credit,0,2569.155177,MOBILE,pound,22
3,Feil,visa,credit,0,1102.480583,POS,dollar,19
4,Bechtelar,americanexpress,debit,0,3589.21948,POS,rupee,18
5,Parker,visa,credit,0,3360.068744,WEB,yen,21
6,Larson,visa,debit,0,1017.393342,POS,rupee,22
7,Ritchie,visa,debit,0,3822.47509,MOBILE,rupee,20
8,Zemlak,americanexpress,prepaid,0,0.499758,POS,rupee,22
9,Funk,americanexpress,prepaid,0,1000.983918,POS,pound,18


In [12]:
# Split the transaction amounts into four equal-frequency bands, then show the
# mean of the fraud label per band, lowest first.
train_df['amountBand'] = pd.qcut(train_df['transAmount'], 4)
fraud_rate_by_band = train_df[['amountBand', 'fraud']].groupby(['amountBand'], as_index=False).mean()
fraud_rate_by_band.sort_values(by='fraud', ascending=True)

Unnamed: 0,amountBand,fraud
0,"[1.08e-05, 0.796]",0.098576
3,"(2925.726, 4994.472]",0.099163
2,"(1812.738, 2925.726]",0.101029
1,"(0.796, 1812.738]",0.101243


In [13]:
# Replace each amount with the index (0-3) of its quartile band.
# The codes are taken directly from qcut instead of from hand-copied bin edges:
# edges copied from the rendered table are rounded for display and go stale as
# soon as the input data changes.
train_df['transAmount'] = pd.qcut(train_df['transAmount'], 4, labels=False).astype(int)
# The interval-valued helper column is no longer needed once the codes exist.
train_df = train_df.drop(['amountBand'], axis=1)
train_df.head(5)

Unnamed: 0,cardCompany,cardNetwork,cardType,fraud,transAmount,transChannel,transCurrency,cardExpYear
0,Runolfsson,visa,credit,0,2,MOBILE,pound,20
1,Fahey,visa,prepaid,1,0,MOBILE,yen,21
2,Hamill,americanexpress,credit,0,2,MOBILE,pound,22
3,Feil,visa,credit,0,1,POS,dollar,19
4,Bechtelar,americanexpress,debit,0,3,POS,rupee,18


In [14]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every column of the frame (the target column included).
# The encoder is fitted afresh on each column and is not kept afterwards.
train_df = train_df.apply(lambda column: LabelEncoder().fit_transform(column))
train_df.head(5)

Unnamed: 0,cardCompany,cardNetwork,cardType,fraud,transAmount,transChannel,transCurrency,cardExpYear
0,377,2,0,0,2,0,1,2
1,109,2,2,1,0,0,3,3
2,152,0,0,0,2,0,1,4
3,113,2,0,0,1,1,0,1
4,24,0,1,0,3,1,2,0


In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Features are every column except the target.
x_full = train_df.drop("fraud", axis=1)
y_full = train_df["fraud"]

# Hold out a test split (train_test_split's default size); the seed fixes the split.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_full, y_full, random_state = 0)

# NOTE: despite its name, `gbrt` is a single decision tree, not gradient boosting;
# the name is kept so that any later cell referring to it keeps working.
# random_state makes the fitted tree reproducible between runs.
gbrt = tree.DecisionTreeClassifier(max_depth=7, random_state=0)
gbrt.fit(x_train,y_train)
predictions = gbrt.predict(x_test)

# NOTE(review): accuracy alone is misleading when one class dominates the labels;
# read it together with the confusion matrix printed below.
print ("Train Accuracy :: ", accuracy_score(y_train, gbrt.predict(x_train)))
print ("Test Accuracy  :: ", accuracy_score(y_test, predictions))
print ("Confusion matrix ", confusion_matrix(y_test, predictions))

Train Accuracy ::  0.9000906811756547
Test Accuracy  ::  0.8999839974395903
Confusion matrix  [[16871     3]
 [ 1872     1]]


In [16]:
# Cross-tabulate actual against predicted labels; margins=True adds the "All" totals.
pd.crosstab(
    index=y_test,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16871,3,16874
1,1872,1,1873
All,18743,4,18747
