## Credit Card Fraud Detection Dataset 2023

id: Unique identifier for each transaction

V1-V28: Anonymized features representing various transaction attributes (e.g., time and location)

Amount: The transaction amount

Class: Binary label indicating whether the transaction is fraudulent (1) or not (0)

In [260]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [261]:
!ls

Pipfile             creditcard_2023.csv notebook.ipynb      predict.py
Pipfile.lock        model_C=1.0.bin     predict-test.py     train.py


In [262]:
# Read the dataset from the CSV file in the current working directory.
df=pd.read_csv("creditcard_2023.csv")

In [263]:
df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [264]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568630 entries, 0 to 568629
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      568630 non-null  int64  
 1   V1      568630 non-null  float64
 2   V2      568630 non-null  float64
 3   V3      568630 non-null  float64
 4   V4      568630 non-null  float64
 5   V5      568630 non-null  float64
 6   V6      568630 non-null  float64
 7   V7      568630 non-null  float64
 8   V8      568630 non-null  float64
 9   V9      568630 non-null  float64
 10  V10     568630 non-null  float64
 11  V11     568630 non-null  float64
 12  V12     568630 non-null  float64
 13  V13     568630 non-null  float64
 14  V14     568630 non-null  float64
 15  V15     568630 non-null  float64
 16  V16     568630 non-null  float64
 17  V17     568630 non-null  float64
 18  V18     568630 non-null  float64
 19  V19     568630 non-null  float64
 20  V20     568630 non-null  float64
 21  V21     56

In [265]:
df.describe()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,...,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0,568630.0
mean,284314.5,-1.109271e-14,-3.429498e-14,-1.209242e-14,3.825991e-15,6.288281e-15,-2.751174e-14,1.240002e-14,8.208047e-15,-1.00298e-14,...,2.210679e-15,-8.767441e-16,4.376179e-16,6.825608e-16,2.545689e-15,1.781906e-15,2.817586e-15,2.891419e-15,12041.957635,0.5
std,164149.486121,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,6919.644449,0.5
min,0.0,-3.495584,-49.96657,-3.18376,-4.951222,-9.952786,-21.11111,-4.351839,-10.75634,-3.751919,...,-19.38252,-7.734798,-30.29545,-4.067968,-13.61263,-8.226969,-10.49863,-39.03524,50.01,0.0
25%,142157.25,-0.5652859,-0.4866777,-0.6492987,-0.6560203,-0.2934955,-0.4458712,-0.2835329,-0.1922572,-0.5687446,...,-0.1664408,-0.4904892,-0.2376289,-0.6515801,-0.5541485,-0.6318948,-0.3049607,-0.2318783,6054.8925,0.0
50%,284314.5,-0.09363846,-0.1358939,0.0003528579,-0.07376152,0.08108788,0.07871758,0.2333659,-0.1145242,0.09252647,...,-0.03743065,-0.02732881,-0.05968903,0.01590123,-0.008193162,-0.01189208,-0.1729111,-0.01392973,12030.15,0.5
75%,426471.75,0.8326582,0.3435552,0.628538,0.7070047,0.4397368,0.4977881,0.5259548,0.04729905,0.5592621,...,0.1479787,0.4638817,0.1557153,0.7007374,0.5500147,0.6728879,0.334023,0.4095903,18036.33,1.0
max,568629.0,2.229046,4.361865,14.12583,3.201536,42.71689,26.1684,217.873,5.95804,20.27006,...,8.08708,12.63251,31.70763,12.96564,14.62151,5.623285,113.2311,77.25594,24039.93,1.0


In [266]:
df.isnull().sum()

id        0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [267]:
# Normalise the column labels: lower-case them first, then turn any spaces
# into underscores.
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.replace(' ', '_')

In [268]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,0.0,1.0,2.0,3.0,4.0
v1,-0.260648,0.9851,-0.260272,-0.152152,-0.20682
v2,-0.469648,-0.356045,-0.949385,-0.508959,-0.16528
v3,2.496266,0.558056,1.728538,1.74684,1.527053
v4,-0.083724,-0.429654,-0.457986,-1.090178,-0.448293
v5,0.129681,0.27714,0.074062,0.249486,0.106125
v6,0.732898,0.428605,1.419481,1.143312,0.530549
v7,0.519014,0.406466,0.743511,0.518269,0.658849
v8,-0.130006,-0.133118,-0.095576,-0.06513,-0.21266
v9,0.727159,0.347452,-0.261297,-0.205698,1.049921


In [213]:
# Feature columns: every column except the row identifier, the transaction
# amount and the target label. Selecting by name rather than by position
# ([1:-2]) keeps the list correct if the column order changes or a column is
# added; the comparison is case-insensitive so the result does not depend on
# whether the labels have already been lower-cased.
non_feature_columns = {"id", "amount", "class"}
columns = [name for name in df.columns if name.lower() not in non_feature_columns]

In [214]:
# Hold out 20% of the rows as the test split; random_state=1 fixes the shuffle so the split is reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [101]:
# Model inputs: the 28 anonymised features, named 'v1' through 'v28'.
columns = [f'v{i}' for i in range(1, 29)]

In [215]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a LogisticRegression on the training rows.

    Parameters
    ----------
    df_train : DataFrame
        Must contain the feature columns named in the notebook-level
        ``columns`` list; only those columns are used.
    y_train : array-like
        Target labels, one per row of ``df_train``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted vectorizer and the fitted classifier.
    """
    feature_records = df_train[columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=1000)

    # Fit the vectorizer and the classifier on the same records.
    model.fit(dv.fit_transform(feature_records), y_train)

    return dv, model

In [216]:
def predict(df, dv, model):
    """Score every row of ``df`` with an already-fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame
        Must contain the feature columns named in the notebook-level
        ``columns`` list; only those columns are used.
    dv : fitted vectorizer exposing ``transform``.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    array
        The second column of ``model.predict_proba`` (index 1), one value
        per row of ``df``.
    """
    feature_records = df[columns].to_dict(orient='records')
    class_probabilities = model.predict_proba(dv.transform(feature_records))
    return class_probabilities[:, 1]

In [217]:
# Cross-validation settings used by the cells that follow.
n_splits = 5  # number of folds handed to KFold
C = 1.0       # handed to train(), which forwards it to LogisticRegression

In [218]:
# K-fold cross-validation over the full-train split: for each fold, fit on the
# training part, score ROC AUC on the held-out part, then report the mean and
# standard deviation of the fold scores.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train["class"].values, df_val["class"].values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

mean_auc, std_auc = np.mean(scores), np.std(scores)
print('C=%s %.3f +- %.3f' % (C, mean_auc, std_auc))

C=1.0 0.994 +- 0.000


In [219]:
# Final model: refit on the whole full-train split, then score once on the
# held-out test split.
# Pass the notebook-level C instead of a literal 1.0 so the fitted model always
# matches the value that the pickle file name (model_C=<C>.bin) is built from.
dv, model = train(df_full_train, df_full_train["class"].values, C=C)
y_pred = predict(df_test, dv, model)

# ROC AUC on the test split; left as the last expression so the cell shows it.
auc = roc_auc_score(df_test["class"].values, y_pred)
auc

0.9934320586635081

In [269]:
import pickle

In [270]:
# File name for the pickled artefacts; it embeds the current value of C.
output_file = f'model_C={C}.bin'

In [271]:
output_file

'model_C=1.0.bin'

In [272]:
# Persist the fitted vectorizer and model together as one pickled tuple.
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [273]:
!ls -lh *.bin


-rw-r--r--  1 muratkahraman  staff   1.3K Oct 26 00:18 model_C=1.0.bin


In [274]:
# Load back exactly the file written above. Reusing output_file (rather than
# repeating the literal 'model_C=1.0.bin') avoids reading a stale file when C
# is changed.
input_file = output_file


In [275]:
# Read the (vectorizer, model) tuple back from disk.
# NOTE: pickle.load can execute arbitrary code while loading, so only open
# files that were produced by a trusted source.
with open(input_file, 'rb') as f_in:
    loaded_artifacts = pickle.load(f_in)

dv, model = loaded_artifacts

In [276]:
model

LogisticRegression(max_iter=1000)

In [286]:
# Take the row at position 1000 as a sample transaction, converted to a plain {column: value} dict.
credit_card= df.iloc[1000].to_dict()

In [287]:
# Vectorise the single record (wrapped in a list because transform takes a sequence of dicts).
# NOTE(review): the dict holds every column of the row, not just the training features;
# DictVectorizer.transform is documented to ignore names it did not see during fit — confirm.
X = dv.transform([credit_card])

In [288]:
# Value from the second predict_proba column (index 1) for the only row in X.
y_pred = model.predict_proba(X)[0, 1]

In [289]:
# Show the sample record next to the probability computed for it.
print('input:', credit_card)
print('output:', y_pred)

input: {'id': 10000.0, 'v1': -0.6396696089088921, 'v2': -0.1085544482705494, 'v3': 0.7782100056976416, 'v4': -0.0387317303261419, 'v5': 0.0931250677366161, 'v6': 0.2300318007380571, 'v7': 0.2907969767219405, 'v8': 0.0209318362579932, 'v9': 1.9438512530541197, 'v10': 0.7255141701267849, 'v11': -0.4221014641688578, 'v12': -0.1395227140902505, 'v13': 1.6100506053085966, 'v14': 1.8758406405835548, 'v15': -0.7389747824146089, 'v16': 0.0076854011409323, 'v17': 1.135717344162971, 'v18': 0.0611976725748488, 'v19': -0.0474454796105257, 'v20': -0.9791604763973756, 'v21': -0.258350615161879, 'v22': 0.3999244836139014, 'v23': 0.7828074052151636, 'v24': 0.7837407009970755, 'v25': 0.2522588245765189, 'v26': -0.5592561594925328, 'v27': 0.4076862143246358, 'v28': 0.0167584875341781, 'amount': 17302.79, 'class': 0.0}
output: 0.03664171787547487


In [290]:
import requests


In [291]:
# Address the request cell below posts to: the /predict path on localhost, port 9696.
url = 'http://localhost:9696/predict'


In [313]:
# Sample request payload: the row at position 10 among the rows whose class
# is 0, converted to a plain {column: value} dict.
class_zero_rows = df[df["class"] == 0]
credit_card = class_zero_rows.iloc[10].to_dict()

In [314]:
# POST the transaction to the prediction service and decode the JSON reply.
# The timeout stops the cell from hanging indefinitely when the service is not
# reachable, and raise_for_status() surfaces HTTP error codes instead of
# decoding an error body as if it were a prediction.
http_response = requests.post(url, json=credit_card, timeout=10)
http_response.raise_for_status()
response = http_response.json()


In [315]:
response

{'fraud': False, 'fraud_probability': 0.0021294114392241066}

In [317]:
df.iloc[1000].to_dict()

{'id': 1000.0,
 'v1': -0.2639028698066069,
 'v2': -0.1525732764811688,
 'v3': 0.1827283788981841,
 'v4': -0.2534167407096296,
 'v5': 0.5818938643664141,
 'v6': -0.1041843872649159,
 'v7': 0.6308849984849844,
 'v8': -0.1389866725243075,
 'v9': 0.395914495624555,
 'v10': 0.9327688331096036,
 'v11': -1.659171142405512,
 'v12': 0.560241374664014,
 'v13': 0.4251851845057326,
 'v14': 1.0202572328560644,
 'v15': 0.9065666056755164,
 'v16': 0.3618075730347657,
 'v17': 0.3311917170669856,
 'v18': 0.4033939107572041,
 'v19': 0.4208006255758966,
 'v20': -0.5657466084016877,
 'v21': -0.1125122582376631,
 'v22': 0.4464000053015059,
 'v23': 0.5268629462331915,
 'v24': -0.7425215709105263,
 'v25': -0.7035976891581637,
 'v26': -0.8576888198591994,
 'v27': -0.0157900366683944,
 'v28': 0.3666313324560797,
 'amount': 8273.81,
 'class': 0.0}