In [296]:
import pandas as pd
import numpy as np
from sklearn.model_selection import  train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
import pickle


In [297]:
df_transactions = pd.read_csv('transactional-sample.csv')

In [298]:
df_transactions['bin'] = df_transactions['card_number'].astype(str).str[0:6]


In [299]:
df_transactions['last_digits'] = df_transactions['card_number'].astype(str).str[12:]

In [300]:
df_transactions.head()

Unnamed: 0,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk,bin,last_digits
0,21320398,29744,97051,434505******9116,2019-12-01T23:16:32.812632,374.56,285475.0,False,434505,9116
1,21320399,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0,True,444456,4210
2,21320400,47759,14777,425850******7024,2019-12-01T22:22:43.021495,760.36,,False,425850,7024
3,21320401,68657,69758,464296******3991,2019-12-01T21:59:19.797129,2556.13,,True,464296,3991
4,21320402,54075,64367,650487******6116,2019-12-01T21:30:53.347051,55.36,860232.0,False,650487,6116


In [301]:
df_transactions['transaction_amount'] = df_transactions['transaction_amount'].fillna(0)
df_transactions['device_id'] = df_transactions['device_id'].fillna('0')
# df_transactions['merchant_id'] = df_transactions['merchant_id'].fillna('0')
# df_transactions['user_id'] = df_transactions['user_id'].fillna('0')

In [302]:
df_transactions['user_id'] = df_transactions['user_id'].astype(str)
df_transactions['merchant_id'] = df_transactions['merchant_id'].astype(str)
df_transactions['transaction_id'] = df_transactions['transaction_id'].astype(str)
df_transactions['device_id'] = df_transactions['device_id'].astype(str)
df_transactions['transaction_date'] = pd.to_datetime(df_transactions['transaction_date'])

In [303]:
df_transactions['day'] = df_transactions['transaction_date'].apply(lambda r:r.day)
df_transactions['hour'] = df_transactions['transaction_date'].apply(lambda r:r.hour)
df_transactions['minute'] = df_transactions['transaction_date'].apply(lambda r:r.minute)

In [304]:
df_fraud = df_transactions[df_transactions.has_cbk == 1]
df_not_fraud = df_transactions[df_transactions.has_cbk == 0]

df_not_fraud_sample = df_not_fraud.sample(391)
df_fraud_sample = df_fraud

df_final = pd.concat([df_not_fraud_sample, df_fraud_sample])

In [305]:
df_final = df_final.reset_index()

In [306]:
x = df_final.drop(['transaction_id', 'card_number', 'has_cbk', 'transaction_date', 'index'], axis = 1)

In [307]:
y = df_final['has_cbk'].astype(int)

In [308]:
x.head()

Unnamed: 0,merchant_id,user_id,transaction_amount,device_id,bin,last_digits,day,hour,minute
0,15227,86620,1834.23,47609.0,444456,8170,26,17,46
1,62052,28771,1295.76,48458.0,498442,5470,18,14,35
2,25671,72132,1073.22,0.0,516376,4270,4,17,45
3,22918,63646,137.47,804022.0,544731,1611,24,21,57
4,6112,82081,420.21,0.0,516292,5463,8,12,39


In [309]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [310]:
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

LogisticRegression()

In [311]:
# recall, accuracy, 

In [312]:
pred = logisticRegr.predict(x_test)

In [313]:
acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)

In [315]:
pickle.dump(logisticRegr, open('ml.sav', 'wb'))                             

In [318]:
predict_proba = logisticRegr.predict_proba(x_test)

In [319]:
predict_proba

array([[0.54968525, 0.45031475],
       [0.12266636, 0.87733364],
       [0.52673269, 0.47326731],
       [0.60033378, 0.39966622],
       [0.61483202, 0.38516798],
       [0.56181247, 0.43818753],
       [0.54961454, 0.45038546],
       [0.35816364, 0.64183636],
       [0.66011658, 0.33988342],
       [0.20202205, 0.79797795],
       [0.66157025, 0.33842975],
       [0.67235348, 0.32764652],
       [0.11238325, 0.88761675],
       [0.08738919, 0.91261081],
       [0.46330148, 0.53669852],
       [0.47806437, 0.52193563],
       [0.68692257, 0.31307743],
       [0.59898547, 0.40101453],
       [0.64016566, 0.35983434],
       [0.69002659, 0.30997341],
       [0.6152986 , 0.3847014 ],
       [0.40262878, 0.59737122],
       [0.67932252, 0.32067748],
       [0.42157122, 0.57842878],
       [0.1560571 , 0.8439429 ],
       [0.37308751, 0.62691249],
       [0.34832222, 0.65167778],
       [0.53605928, 0.46394072],
       [0.11683626, 0.88316374],
       [0.66607829, 0.33392171],
       [0.

In [320]:
x_train.dtypes

merchant_id            object
user_id                object
transaction_amount    float64
device_id              object
bin                    object
last_digits            object
day                     int64
hour                    int64
minute                  int64
dtype: object