In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
%matplotlib inline

In [2]:
# Load the dataset from a CSV file resolved relative to the working directory.
DATASET_CSV = "credit_card_dataset_wo_smote.csv"
df = pd.read_csv(DATASET_CSV)

In [3]:
# Quick sanity check: display the frame's dimensions as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(52525, 8)

In [4]:
from sklearn import preprocessing

In [5]:
# Single LabelEncoder shared by the encoding cells below. Each fit_transform call
# re-fits it, so it only remembers the classes of the last column it encoded.
label_encoder = preprocessing.LabelEncoder()

In [6]:
# Replace the 'merchant' column with the integer codes produced by re-fitting the
# shared encoder on it, then list the distinct codes now present.
merchant_codes = label_encoder.fit_transform(df['merchant'])
df['merchant'] = merchant_codes
df['merchant'].unique()

array([514, 241, 390, 360, 297, 607, 534, 107, 250, 563, 543, 304, 394,
       308,  42, 557, 379, 409, 331, 570,  29, 231, 324, 467, 387, 351,
       130,  82, 352, 454, 311, 627,  21,  17, 268, 239, 225, 229, 442,
        92, 265, 381, 329, 146, 548, 602, 443,  86, 468, 479, 272, 648,
       309, 298, 117, 194, 589, 271, 403, 436, 153, 270, 172, 237, 392,
       280, 217, 119, 610, 466, 502,   8, 480, 208,  99, 625, 678, 650,
       645,  47, 439,  77, 444, 472, 363, 108, 273, 147, 554, 549, 669,
       483, 162, 349, 175, 421, 300, 595, 198,   4, 210, 522, 681, 657,
       245, 494, 332, 585, 255, 149, 414, 498, 485, 370,  32, 510,  55,
       157, 359, 174, 463, 181, 508,  70, 399,  88, 186, 196, 475, 425,
       431, 327, 346, 328, 597, 491, 450, 378, 183,  67,  73, 691, 256,
        81,  48, 677,  83, 316, 588, 303, 173, 487, 291,  43, 103,   3,
       193, 417,  24,  94, 384, 411, 560, 484, 608, 659, 140, 395, 626,
       342, 523, 664,  12, 462, 295, 121, 118, 504, 635,  53, 63

In [7]:
# Replace the 'category' column with the integer codes produced by re-fitting the
# shared encoder on it, then list the distinct codes now present.
category_codes = label_encoder.fit_transform(df['category'])
df['category'] = category_codes
df['category'].unique()

array([ 8,  4,  0,  2,  9,  3, 11, 12,  1, 10,  5, 13,  7,  6])

In [8]:
# Replace the 'job' column with the integer codes produced by re-fitting the
# shared encoder on it, then list the distinct codes now present.
job_codes = label_encoder.fit_transform(df['job'])
df['job'] = job_codes
df['job'].unique()

array([357, 412, 295, 316, 111, 463,  27, 121, 362, 317, 230, 428, 159,
       427, 296, 373, 351, 145, 178, 307, 272, 340, 256, 106, 453,  56,
       332, 139, 326, 297, 193, 203,  34, 117, 309, 358, 259, 189, 399,
       378, 456, 225, 424,  26, 466, 109, 377, 163, 138, 324,  21, 141,
       441, 217,  20, 114, 314, 461, 167, 120,  83, 158, 471, 392, 186,
       411, 345, 465,  58, 204, 108, 194, 405, 151, 170, 133,  77,  82,
       457, 181, 262, 150, 382, 367, 136, 308, 242, 168,  15, 315, 187,
         8, 140, 188,  40, 394,   7, 212, 387, 440, 179,   3, 343, 470,
       286, 421, 451, 171,  49, 341, 155, 218, 432, 274, 213, 280, 446,
       439, 249, 166, 126, 443, 279,  50, 454, 419,  90, 113, 180, 462,
       275, 278,   5, 287, 191, 101, 353, 110, 288, 107,  78, 434,  59,
       319, 337, 142, 291,  17,  68,  10,  11,  65, 354, 402, 258, 143,
       233, 442, 102, 210, 104,  47, 248,  66, 303, 182, 197,  31, 243,
       152, 359, 247, 129, 327, 323, 220, 228,  93, 363,  29, 25

In [9]:
# Replace the 'gender' column with the integer codes produced by re-fitting the
# shared encoder on it, then list the distinct codes now present.
gender_codes = label_encoder.fit_transform(df['gender'])
df['gender'] = gender_codes
df['gender'].unique()

array([0, 1])

In [10]:
# Replace the 'city' column with the integer codes produced by re-fitting the
# shared encoder on it, then list the distinct codes now present.
city_codes = label_encoder.fit_transform(df['city'])
df['city'] = city_codes
df['city'].unique()

array([494, 569, 440,  81, 204, 210, 332, 222, 446, 140, 138, 696, 675,
       293, 327, 307, 396, 116, 304, 230, 473, 500, 603, 458, 792,  43,
       656, 259, 336, 774, 813, 752, 639,  50, 786,  68,  61, 254,  11,
       496, 237, 130, 747, 544, 407, 282, 160, 523, 735, 236, 658, 606,
       181, 363, 545,  35, 132, 233, 737,  25, 264,  82, 317, 679, 665,
       168, 633, 771, 766, 517, 745, 585, 129, 258, 239,  93, 700,  70,
       546, 334, 419, 331, 466, 228, 483, 202,  85, 338, 314, 379, 666,
        40, 621, 697, 436, 704, 150, 411, 449, 814, 527, 300,  10, 373,
       701, 335,  88, 588, 812, 751, 214, 167, 231, 268, 426, 225, 101,
       207,  44, 253, 810, 799, 410, 369, 105, 795, 574,  72, 184, 690,
       664, 364, 348, 330,   1, 111, 707, 599, 514, 467, 579, 366,  58,
       811, 731, 424,  26, 674, 271, 243, 248, 382, 720, 414, 375, 431,
       374, 471, 607, 427, 613, 128, 155, 221, 800, 460, 515, 367, 472,
        73, 301, 782, 587, 709, 147, 638,  47, 115, 384, 600, 53

In [11]:
# Feature matrix: every column except the target "is_fraud" and "dob".
# `columns=` selects the column axis on its own, so no `axis` argument is passed.
X = df.drop(columns=["is_fraud", "dob"])
# Target vector: the "is_fraud" column (classes 0 and 1 in the report below).
Y = df["is_fraud"]

In [12]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified. The positive class is under 1% of the
# test rows (146 of 15758 in the report below), so consider `stratify=Y` -- doing so
# changes the split and therefore every metric recorded below.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=0,
)

In [13]:
# Gaussian Naive Bayes with scikit-learn's default settings (no arguments passed).
classifier = GaussianNB()

In [14]:
# Fit the model on the training partition; the fitted estimator is the cell output.
classifier.fit(X=X_train, y=Y_train)

GaussianNB()

In [15]:
# Mean accuracy on the same rows the model was fitted on (not a held-out estimate).
train_accuracy = classifier.score(X_train, Y_train)
train_accuracy

0.9831370522479397

In [16]:
# Predicted class labels for the held-out test rows.
Y_predicted = classifier.predict(X=X_test)

In [17]:
# Confusion matrix on the test set: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_predicted)
cm

array([[15441,   171],
       [   74,    72]], dtype=int64)

In [19]:
# Per-class precision / recall / F1 on the test set, shown to three decimals.
report = classification_report(
    y_true=Y_test,
    y_pred=Y_predicted,
    digits=3,
)
print(report)

              precision    recall  f1-score   support

           0      0.995     0.989     0.992     15612
           1      0.296     0.493     0.370       146

    accuracy                          0.984     15758
   macro avg      0.646     0.741     0.681     15758
weighted avg      0.989     0.984     0.986     15758

