# LightGBM

## 1. Import knižníc

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Pôvodné dáta

### a) bez redukcie

In [3]:
# Experiment 2a: original (untransformed) data, all features kept.
origin = pd.read_csv("../Priprava_dat/origin.csv", index_col=0)
X = origin.drop('fraud', axis=1)  # every column except the target is a feature
y = origin['fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Split first, then oversample only the training part, so the held-out test
# set is never touched by the resampler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LGBMClassifier(random_state=0)
model.fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

# For a binary problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
# Specificity (true-negative rate) is not part of classification_report.
speci = tn / (tn + fp)
# Fixed 4-decimal formatting keeps trailing zeros (round() would print 0.983
# for 0.9830), matching the digits=4 used in the report above.
print("Špecificita:", f"{speci:.4f}")


[LightGBM] [Info] Number of positive: 381015, number of negative: 381015
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1272
[LightGBM] [Info] Number of data points in the train set: 762030, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Kontingenčná tabuľka:
col_0      0     1
fraud             
0      93487  1667
1          1  4845
Report:
              precision    recall  f1-score   support

           0     1.0000    0.9825    0.9912     95154
           1     0.7440    0.9998    0.8531      4846

    accuracy                         0.9833    100000
   macro avg     0.8720    0.9911    0.9222    100000
weighted avg     0.9876    0.9833    0.9845    100000

Špecificita: 0.9825


### b) s redukciou

In [3]:
# Experiment 2b: original data after feature reduction.
origin_red = pd.read_csv("../Priprava_dat/origin_red.csv", index_col=0)
X = origin_red.drop('fraud', axis=1)  # every column except the target is a feature
y = origin_red['fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Split first, then oversample only the training part, so the held-out test
# set is never touched by the resampler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LGBMClassifier(random_state=0)
model.fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

# For a binary problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
# Specificity (true-negative rate) is not part of classification_report.
speci = tn / (tn + fp)
# Fixed 4-decimal formatting keeps trailing zeros (round() would print 0.983
# for 0.9830), matching the digits=4 used in the report above.
print("Špecificita:", f"{speci:.4f}")

[LightGBM] [Info] Number of positive: 381015, number of negative: 381015
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003788 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1011
[LightGBM] [Info] Number of data points in the train set: 762030, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Kontingenčná tabuľka:
col_0      0     1
fraud             
0      93492  1662
1          1  4845
Report:
              precision    recall  f1-score   support

           0     1.0000    0.9825    0.9912     95154
           1     0.7446    0.9998    0.8535      4846

    accuracy                         0.9834    100000
   macro avg     0.8723    0.9912    0.9224    100000
weighted avg     0.9876    0.9834    0.9845    100000

Špecificita: 0.9825


## 3. Normalizované dáta

### a) bez redukcie

In [4]:
# Experiment 3a: normalized data, all nine features.
norm = pd.read_csv("../Priprava_dat/norm.csv", index_col=0)
# Features are picked by name here instead of dropping the target.
# NOTE(review): presumably norm.csv carries extra columns that must not be
# used as features — confirm against the data-preparation notebook.
X = norm[[
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'quantityModifications',
    'scannedLineItemsPerSecond',
    'valuePerSecond',
    'lineItemVoidsPerPosition',
]]
y = norm['fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Split first, then oversample only the training part, so the held-out test
# set is never touched by the resampler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LGBMClassifier(random_state=0)
model.fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

# For a binary problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
# Specificity (true-negative rate) is not part of classification_report.
speci = tn / (tn + fp)
# Fixed 4-decimal formatting keeps trailing zeros (round() would print 0.983
# for 0.9830), matching the digits=4 used in the report above.
print("Špecificita:", f"{speci:.4f}")

[LightGBM] [Info] Number of positive: 273534, number of negative: 273534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1245
[LightGBM] [Info] Number of data points in the train set: 547068, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    67224  1142
1.0        1  4304
Report:
              precision    recall  f1-score   support

         0.0     1.0000    0.9833    0.9916     68366
         1.0     0.7903    0.9998    0.8828      4305

    accuracy                         0.9843     72671
   macro avg     0.8951    0.9915    0.9372     72671
weighted avg     0.9876    0.9843    0.9851     72671

Špecificita: 0.9833


### b) s redukciou

In [5]:
# Experiment 3b: normalized data after feature reduction.
norm_red = pd.read_csv("../Priprava_dat/norm_red.csv", index_col=0)
X = norm_red.drop('fraud', axis=1)  # every column except the target is a feature
y = norm_red['fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Split first, then oversample only the training part, so the held-out test
# set is never touched by the resampler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LGBMClassifier(random_state=0)
model.fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

# For a binary problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
# Specificity (true-negative rate) is not part of classification_report.
speci = tn / (tn + fp)
# Fixed 4-decimal formatting keeps trailing zeros: with round() this cell
# printed 0.983 instead of 0.9830, unlike the digits=4 report above.
print("Špecificita:", f"{speci:.4f}")

[LightGBM] [Info] Number of positive: 273534, number of negative: 273534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 984
[LightGBM] [Info] Number of data points in the train set: 547068, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    67206  1160
1.0        1  4304
Report:
              precision    recall  f1-score   support

         0.0     1.0000    0.9830    0.9914     68366
         1.0     0.7877    0.9998    0.8812      4305

    accuracy                         0.9840     72671
   macro avg     0.8938    0.9914    0.9363     72671
weighted avg     0.9874    0.9840    0.9849     72671

Špecificita: 0.983


## 4. Diskretizované dáta

### a) bez redukcie

In [6]:
# Experiment 4a: discretized data, all nine features.
dis = pd.read_csv("../Priprava_dat/dis.csv", index_col=0)
# Features are picked by name here instead of dropping the target.
# NOTE(review): presumably dis.csv carries extra columns that must not be
# used as features — confirm against the data-preparation notebook.
X = dis[[
    'trustLevel',
    'totalScanTimeInSeconds',
    'grandTotal',
    'lineItemVoids',
    'scansWithoutRegistration',
    'quantityModifications',
    'scannedLineItemsPerSecond',
    'valuePerSecond',
    'lineItemVoidsPerPosition',
]]
y = dis['fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Split first, then oversample only the training part, so the held-out test
# set is never touched by the resampler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LGBMClassifier(random_state=0)
model.fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

# For a binary problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
# Specificity (true-negative rate) is not part of classification_report.
speci = tn / (tn + fp)
# Fixed 4-decimal formatting keeps trailing zeros (round() would print 0.983
# for 0.9830), matching the digits=4 used in the report above.
print("Špecificita:", f"{speci:.4f}")

[LightGBM] [Info] Number of positive: 273534, number of negative: 273534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 101
[LightGBM] [Info] Number of data points in the train set: 547068, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    66159  2207
1.0       24  4281
Report:
              precision    recall  f1-score   support

         0.0     0.9996    0.9677    0.9834     68366
         1.0     0.6598    0.9944    0.7933      4305

    accuracy                         0.9693     72671
   macro avg     0.8297    0.9811    0.8884     72671
weighted avg     0.9795    0.9693    0.9722     72671

Špecificita: 0.9677


### b) s redukciou

In [7]:
# Experiment 4b: discretized data after feature reduction.
dis_red = pd.read_csv("../Priprava_dat/dis_red.csv", index_col=0)
X = dis_red.drop('fraud', axis=1)  # every column except the target is a feature
y = dis_red['fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Split first, then oversample only the training part, so the held-out test
# set is never touched by the resampler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LGBMClassifier(random_state=0)
model.fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

# For a binary problem the flattened confusion matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
# Specificity (true-negative rate) is not part of classification_report.
speci = tn / (tn + fp)
# Fixed 4-decimal formatting keeps trailing zeros (round() would print 0.983
# for 0.9830), matching the digits=4 used in the report above.
print("Špecificita:", f"{speci:.4f}")

[LightGBM] [Info] Number of positive: 273534, number of negative: 273534
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 547068, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    66184  2182
1.0       27  4278
Report:
              precision    recall  f1-score   support

         0.0     0.9996    0.9681    0.9836     68366
         1.0     0.6622    0.9937    0.7948      4305

    accuracy                         0.9696     72671
   macro avg     0.8309    0.9809    0.8892     72671
weighted avg     0.9796    0.9696    0.9724     72671

Špecificita: 0.9681
