# Naivný Bayes

## 1. Import knižníc

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Pôvodné dáta

### a) bez redukcie

In [2]:
# Load origin.csv; the first CSV column becomes the DataFrame index.
origin = pd.read_csv("../Priprava_dat/origin.csv", index_col=0)

# Target is the 'fraud' column; every remaining column is a feature.
y = origin['fraud']
X = origin.drop(columns='fraud')

# Hold out 20 % of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only; the test split keeps its original rows.
nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

# Fit Gaussian naive Bayes on the resampled data and predict the test split.
model = GaussianNB().fit(X_train_resampled, y_train_resampled)
predikovane = model.predict(X_test)

# Contingency table (actual vs. predicted) followed by the per-class report.
print("Kontingenčná tabuľka:", pd.crosstab(y_test, predikovane), sep="\n")
print("Report:", classification_report(y_test, predikovane, digits=4), sep="\n")

# Specificity = TN / (TN + FP), taken from the flattened 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0      0      1
fraud              
0      75332  19822
1         19   4827
Report:
              precision    recall  f1-score   support

           0     0.9997    0.7917    0.8836     95154
           1     0.1958    0.9961    0.3273      4846

    accuracy                         0.8016    100000
   macro avg     0.5978    0.8939    0.6055    100000
weighted avg     0.9608    0.8016    0.8567    100000

Špecificita: 0.7917


### b) s redukciou

In [3]:
# Load origin_red.csv; the first CSV column becomes the DataFrame index.
origin_red = pd.read_csv("../Priprava_dat/origin_red.csv", index_col=0)

# Target is the 'fraud' column; every remaining column is a feature.
y = origin_red['fraud']
X = origin_red.drop(columns='fraud')

# Hold out 20 % of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only; the test split keeps its original rows.
nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

# Fit Gaussian naive Bayes on the resampled data and predict the test split.
model = GaussianNB().fit(X_train_resampled, y_train_resampled)
predikovane = model.predict(X_test)

# Contingency table (actual vs. predicted) followed by the per-class report.
print("Kontingenčná tabuľka:", pd.crosstab(y_test, predikovane), sep="\n")
print("Report:", classification_report(y_test, predikovane, digits=4), sep="\n")

# Specificity = TN / (TN + FP), taken from the flattened 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0      0      1
fraud              
0      75329  19825
1         19   4827
Report:
              precision    recall  f1-score   support

           0     0.9997    0.7917    0.8836     95154
           1     0.1958    0.9961    0.3273      4846

    accuracy                         0.8016    100000
   macro avg     0.5978    0.8939    0.6054    100000
weighted avg     0.9608    0.8016    0.8567    100000

Špecificita: 0.7917


## 3. Normalizované dáta

### a) bez redukcie

In [4]:
# Load norm.csv; the first CSV column becomes the DataFrame index.
norm = pd.read_csv("../Priprava_dat/norm.csv", index_col=0)

# Features are selected explicitly by name here (not by dropping the target).
X = norm[[
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'quantityModifications',
    'scannedLineItemsPerSecond',
    'valuePerSecond',
    'lineItemVoidsPerPosition',
]]
y = norm['fraud']

# Hold out 20 % of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only; the test split keeps its original rows.
nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

# Fit Gaussian naive Bayes on the resampled data and predict the test split.
model = GaussianNB().fit(X_train_resampled, y_train_resampled)
predikovane = model.predict(X_test)

# Contingency table (actual vs. predicted) followed by the per-class report.
print("Kontingenčná tabuľka:", pd.crosstab(y_test, predikovane), sep="\n")
print("Report:", classification_report(y_test, predikovane, digits=4), sep="\n")

# Specificity = TN / (TN + FP), taken from the flattened 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0    1.0
fraud              
0.0    57719  10647
1.0       26   4279
Report:
              precision    recall  f1-score   support

         0.0     0.9995    0.8443    0.9154     68366
         1.0     0.2867    0.9940    0.4450      4305

    accuracy                         0.8531     72671
   macro avg     0.6431    0.9191    0.6802     72671
weighted avg     0.9573    0.8531    0.8875     72671

Špecificita: 0.8443


### b) s redukciou

In [5]:
# Load norm_red.csv; the first CSV column becomes the DataFrame index.
norm_red = pd.read_csv("../Priprava_dat/norm_red.csv", index_col=0)

# Target is the 'fraud' column; every remaining column is a feature.
y = norm_red['fraud']
X = norm_red.drop(columns='fraud')

# Hold out 20 % of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only; the test split keeps its original rows.
nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

# Fit Gaussian naive Bayes on the resampled data and predict the test split.
model = GaussianNB().fit(X_train_resampled, y_train_resampled)
predikovane = model.predict(X_test)

# Contingency table (actual vs. predicted) followed by the per-class report.
print("Kontingenčná tabuľka:", pd.crosstab(y_test, predikovane), sep="\n")
print("Report:", classification_report(y_test, predikovane, digits=4), sep="\n")

# Specificity = TN / (TN + FP), taken from the flattened 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))


Kontingenčná tabuľka:
col_0    0.0    1.0
fraud              
0.0    57672  10694
1.0       29   4276
Report:
              precision    recall  f1-score   support

         0.0     0.9995    0.8436    0.9149     68366
         1.0     0.2856    0.9933    0.4437      4305

    accuracy                         0.8524     72671
   macro avg     0.6426    0.9184    0.6793     72671
weighted avg     0.9572    0.8524    0.8870     72671

Špecificita: 0.8436


## 4. Diskretizované dáta

### a) bez redukcie

In [6]:
# Load dis.csv; the first CSV column becomes the DataFrame index.
dis = pd.read_csv("../Priprava_dat/dis.csv", index_col=0)

# Features are selected explicitly by name here (not by dropping the target).
X = dis[[
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'quantityModifications',
    'scannedLineItemsPerSecond',
    'valuePerSecond',
    'lineItemVoidsPerPosition',
]]
y = dis['fraud']

# Hold out 20 % of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only; the test split keeps its original rows.
nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

# Fit Gaussian naive Bayes on the resampled data and predict the test split.
model = GaussianNB().fit(X_train_resampled, y_train_resampled)
predikovane = model.predict(X_test)

# Contingency table (actual vs. predicted) followed by the per-class report.
print("Kontingenčná tabuľka:", pd.crosstab(y_test, predikovane), sep="\n")
print("Report:", classification_report(y_test, predikovane, digits=4), sep="\n")

# Specificity = TN / (TN + FP), taken from the flattened 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0    1.0
fraud              
0.0    57493  10873
1.0       32   4273
Report:
              precision    recall  f1-score   support

         0.0     0.9994    0.8410    0.9134     68366
         1.0     0.2821    0.9926    0.4394      4305

    accuracy                         0.8499     72671
   macro avg     0.6408    0.9168    0.6764     72671
weighted avg     0.9569    0.8499    0.8853     72671

Špecificita: 0.841


### b) s redukciou

In [7]:
# Load dis_red.csv; the first CSV column becomes the DataFrame index.
dis_red = pd.read_csv("../Priprava_dat/dis_red.csv", index_col=0)

# Target is the 'fraud' column; every remaining column is a feature.
y = dis_red['fraud']
X = dis_red.drop(columns='fraud')

# Hold out 20 % of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training split only; the test split keeps its original rows.
nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

# Fit Gaussian naive Bayes on the resampled data and predict the test split.
model = GaussianNB().fit(X_train_resampled, y_train_resampled)
predikovane = model.predict(X_test)

# Contingency table (actual vs. predicted) followed by the per-class report.
print("Kontingenčná tabuľka:", pd.crosstab(y_test, predikovane), sep="\n")
print("Report:", classification_report(y_test, predikovane, digits=4), sep="\n")

# Specificity = TN / (TN + FP), taken from the flattened 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0    1.0
fraud              
0.0    57451  10915
1.0       35   4270
Report:
              precision    recall  f1-score   support

         0.0     0.9994    0.8403    0.9130     68366
         1.0     0.2812    0.9919    0.4382      4305

    accuracy                         0.8493     72671
   macro avg     0.6403    0.9161    0.6756     72671
weighted avg     0.9568    0.8493    0.8849     72671

Špecificita: 0.8403
