# Logistická regresia

## 1. Import knižníc

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Pôvodné dáta

### a) bez redukcie

In [2]:
origin = pd.read_csv("../Priprava_dat/origin.csv", index_col=0)
X = origin.drop('fraud', axis=1)
y = origin[ 'fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LogisticRegression(random_state=0).fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0      0     1
fraud             
0      89528  5626
1         55  4791
Report:
              precision    recall  f1-score   support

           0     0.9994    0.9409    0.9692     95154
           1     0.4599    0.9887    0.6278      4846

    accuracy                         0.9432    100000
   macro avg     0.7297    0.9648    0.7985    100000
weighted avg     0.9732    0.9432    0.9527    100000

Špecificita: 0.9409


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### b) s redukciou

In [3]:
origin_red = pd.read_csv("../Priprava_dat/origin_red.csv", index_col=0)
X = origin_red.drop('fraud', axis=1)
y = origin_red[ 'fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LogisticRegression(random_state=0).fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0      0     1
fraud             
0      88694  6460
1         58  4788
Report:
              precision    recall  f1-score   support

           0     0.9993    0.9321    0.9646     95154
           1     0.4257    0.9880    0.5950      4846

    accuracy                         0.9348    100000
   macro avg     0.7125    0.9601    0.7798    100000
weighted avg     0.9715    0.9348    0.9466    100000

Špecificita: 0.9321


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 3. Normalizované dáta

### a) bez redukcie

In [4]:
norm = pd.read_csv("../Priprava_dat/norm.csv", index_col=0)
X = norm[['trustLevel', 'totalScanTimeInSeconds','grandTotal', 'lineItemVoids', 'scansWithoutRegistration','quantityModifications', 'scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition']]
y = norm[ 'fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LogisticRegression(random_state=0).fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    65717  2649
1.0       38  4267
Report:
              precision    recall  f1-score   support

         0.0     0.9994    0.9613    0.9800     68366
         1.0     0.6170    0.9912    0.7605      4305

    accuracy                         0.9630     72671
   macro avg     0.8082    0.9762    0.8703     72671
weighted avg     0.9768    0.9630    0.9670     72671

Špecificita: 0.9613


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### b) s redukciou

In [5]:
norm_red = pd.read_csv("../Priprava_dat/norm_red.csv", index_col=0)
X = norm_red.drop('fraud', axis=1)
y = norm_red[ 'fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LogisticRegression(random_state=0).fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    65621  2745
1.0       38  4267
Report:
              precision    recall  f1-score   support

         0.0     0.9994    0.9598    0.9792     68366
         1.0     0.6085    0.9912    0.7541      4305

    accuracy                         0.9617     72671
   macro avg     0.8040    0.9755    0.8667     72671
weighted avg     0.9763    0.9617    0.9659     72671

Špecificita: 0.9598


## 4. Diskretizované dáta

### a) bez redukcie

In [6]:
dis = pd.read_csv("../Priprava_dat/dis.csv", index_col=0)
X = dis[['trustLevel', 'totalScanTimeInSeconds','grandTotal', 'lineItemVoids', 'scansWithoutRegistration','quantityModifications', 'scannedLineItemsPerSecond', 'valuePerSecond', 'lineItemVoidsPerPosition']]
y = dis[ 'fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LogisticRegression(random_state=0).fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    64846  3520
1.0       80  4225
Report:
              precision    recall  f1-score   support

         0.0     0.9988    0.9485    0.9730     68366
         1.0     0.5455    0.9814    0.7012      4305

    accuracy                         0.9505     72671
   macro avg     0.7721    0.9650    0.8371     72671
weighted avg     0.9719    0.9505    0.9569     72671

Špecificita: 0.9485


### b) s redukciou

In [7]:
dis_red = pd.read_csv("../Priprava_dat/dis_red.csv", index_col=0)
X = dis_red.drop('fraud', axis=1)
y = dis_red[ 'fraud']

nadvzrk = RandomOverSampler(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train_resampled, y_train_resampled = nadvzrk.fit_resample(X_train, y_train)

model = LogisticRegression(random_state=0).fit(X_train_resampled, y_train_resampled)

predikovane = model.predict(X_test)

print("Kontingenčná tabuľka:")
print(pd.crosstab(y_test, predikovane))
print("Report:")
print(classification_report(y_test, predikovane, digits=4))

tn, fp, fn, tp = confusion_matrix(y_test, predikovane).ravel()
speci = tn / (tn + fp)
print("Špecificita:", round(speci, 4))

Kontingenčná tabuľka:
col_0    0.0   1.0
fraud             
0.0    64682  3684
1.0       82  4223
Report:
              precision    recall  f1-score   support

         0.0     0.9987    0.9461    0.9717     68366
         1.0     0.5341    0.9810    0.6916      4305

    accuracy                         0.9482     72671
   macro avg     0.7664    0.9635    0.8317     72671
weighted avg     0.9712    0.9482    0.9551     72671

Špecificita: 0.9461
