In [None]:
import pandas as pd
import numpy as np
import math
from tabulate import tabulate
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from a path
# under that mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_dataset(path='/content/drive/MyDrive/Datasets/fraud_detection_bank_dataset.csv'):
  """Read the dataset CSV into a pandas DataFrame.

  Args:
    path: Location of the CSV file. Defaults to the copy stored on the
      mounted Google Drive, so existing zero-argument calls are unchanged;
      pass another path to run the notebook outside Colab.

  Returns:
    pandas.DataFrame with the contents of the CSV, parsed with the
    ``pd.read_csv`` defaults.
  """
  return pd.read_csv(path)

In [None]:
def normalize_data(data, norm='std'):
  """Rescale ``data`` column by column (all statistics use ``axis=0``).

  Args:
    data: Numeric array-like; it is converted to a float ndarray first.
    norm: ``'min'`` for min-max scaling, ``(x - min) / (max - min)``;
      ``'std'`` for standardisation, ``(x - mean) / std`` using
      ``np.std`` (population standard deviation).

  Returns:
    numpy.ndarray of floats with the same shape as ``data``. NaN entries
    (for example the 0/0 produced by a constant column) are replaced by 0.

  Raises:
    ValueError: If ``norm`` is neither ``'min'`` nor ``'std'``.
  """
  data = np.asarray(data, dtype=float)
  # A constant column yields 0/0; the resulting NaNs are zeroed below, so the
  # divide/invalid warnings carry no information here.
  with np.errstate(divide='ignore', invalid='ignore'):
    if norm == 'min':
      col_min = data.min(axis=0)
      normalized = (data - col_min) / (data.max(axis=0) - col_min)
    elif norm == 'std':
      normalized = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
    else:
      raise ValueError("norm must be 'min' or 'std', got %r" % (norm,))
  # The fill value must be passed by keyword: the second positional parameter
  # of np.nan_to_num is `copy`, not `nan`.
  return np.nan_to_num(normalized, nan=0.0)

In [None]:
def preprocess_data(dataset):
  """Split ``dataset`` into a min-max scaled feature matrix and a target vector.

  Features are the columns at positions 1..111 (``iloc[:, 1:112]``); the
  target is the column at position 113. Both are converted to float arrays,
  and the features are passed through ``normalize_data(..., norm='min')``.

  NOTE(review): the column at position 112 ends up in neither the features
  nor the target. This may be an off-by-one in the slice -- confirm against
  the CSV layout.

  Returns:
    Tuple ``(features, target)`` of numpy arrays.
  """
  feature_frame = dataset.iloc[:, 1:112]
  target_series = dataset.iloc[:, 113]
  scaled_features = normalize_data(feature_frame.to_numpy(dtype='float'), norm='min')
  target = target_series.to_numpy(dtype='float')
  return scaled_features, target

In [None]:
def evaluate(y, predictions):
  """Compute the confusion matrix and summary rates for binary predictions.

  Args:
    y: True labels.
    predictions: Predicted labels, aligned with ``y``.

  Returns:
    Tuple ``(cm, acc, tpr, tnr)``:
      cm  -- the matrix returned by ``confusion_matrix(y, predictions)``;
      acc -- (tn + tp) / (tn + fp + fn + tp);
      tpr -- tp / (tp + fn);
      tnr -- tn / (tn + fp).

  Raises:
    ValueError: If the confusion matrix is not 2x2, e.g. when ``y`` and
      ``predictions`` together contain a single class or more than two.
  """
  cm = confusion_matrix(y, predictions)
  if cm.shape != (2, 2):
    raise ValueError(
        'evaluate() expects a binary problem; got a confusion matrix of shape %s'
        % (cm.shape,))
  # Reuse the matrix computed above instead of calling confusion_matrix again.
  tn, fp, fn, tp = cm.ravel()
  acc = (tn + tp) / (tn + fp + fn + tp)
  tpr = tp / (tp + fn)
  tnr = tn / (tn + fp)
  return cm, acc, tpr, tnr

In [None]:
# Load the CSV via get_dataset(); the bare name on the last line makes the
# notebook render the resulting frame as the cell output.
dataset = get_dataset()
dataset

In [None]:

X, y = preprocess_data(dataset)

# Fractions of explained variance that PCA must retain (a float n_components
# with svd_solver='full' selects the number of components accordingly).
variances = [0.75, 0.9, 0.99]
models = [
          ('Naive Bayes', GaussianNB()),
          ('SVM Linear', SVC(kernel='linear')),
          ('SVM RBF', SVC(kernel='rbf')),
          # Seeded so the tree is reproducible across runs.
          # NOTE(review): scikit-learn documents its trees as an optimised CART,
          # not C4.5 -- confirm that the label below is the intended one.
          ('C4.5', DecisionTreeClassifier(random_state=10))
          ]

# results[model name][variance] -> (n_components, cm, acc, tpr, tnr)
results = {name: {} for name, _ in models}

# The split depends neither on the PCA setting nor on the model, so it is done
# once. Splitting BEFORE fitting PCA keeps the test rows out of the projection
# (fitting PCA on the full matrix leaks test-set information into training).
# Note: the min-max scaling inside preprocess_data is still computed on the
# full dataset.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=10)

for variance in variances:
  # One PCA per variance level, shared by every model.
  pca = PCA(n_components=variance, svd_solver='full')
  X_train = pca.fit_transform(X_train_raw)
  X_test = pca.transform(X_test_raw)
  for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    cm, acc, tpr, tnr = evaluate(y_test, predictions)
    results[name][variance] = (pca.n_components_, cm, acc, tpr, tnr)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Build the column titles from the variance levels actually stored in
# `results`, so the header can never disagree with the experiment settings
# (the hand-written header said 95% while the run used 0.9).
stored_variances = list(next(iter(results.values()), {}))
headers = ['Modelo'] + ['Variância %.0f%%' % (variance * 100) for variance in stored_variances]
rows = []

# One row per model, one multi-line cell per variance level.
for model_name, values in results.items():
  row = [model_name]
  for n_componentes, cm, acc, tpr, tnr in values.values():
    row.append('Nº Comp.: %s\nACC: %.2f\nTPR: %.2f\nTNR: %.2f\nCM:\n%s' % (n_componentes, acc, tpr, tnr, cm))
  rows.append(row)

table = tabulate(rows, headers=headers, tablefmt='fancy_grid')
print(table)

╒═════════════╤═════════════════╤═════════════════╤═════════════════╕
│ Modelo      │ Variância 75%   │ Variância 95%   │ Variância 99%   │
╞═════════════╪═════════════════╪═════════════════╪═════════════════╡
│ Naive Bayes │ Nº Comp.: 11    │ Nº Comp.: 18    │ Nº Comp.: 34    │
│             │ ACC: 0.83       │ ACC: 0.81       │ ACC: 0.83       │
│             │ TPR: 0.52       │ TPR: 0.55       │ TPR: 0.57       │
│             │ TNR: 0.95       │ TNR: 0.91       │ TNR: 0.92       │
│             │ CM:             │ CM:             │ CM:             │
│             │ [[4663  270]    │ [[4467  466]    │ [[4554  379]    │
│             │  [ 881  941]]   │  [ 813 1009]]   │  [ 782 1040]]   │
├─────────────┼─────────────────┼─────────────────┼─────────────────┤
│ SVM Linear  │ Nº Comp.: 11    │ Nº Comp.: 18    │ Nº Comp.: 34    │
│             │ ACC: 0.83       │ ACC: 0.85       │ ACC: 0.85       │
│             │ TPR: 0.46       │ TPR: 0.48       │ TPR: 0.49       │
│             │ TNR: