# Лабораторная работа №2

# Алгоритмы классификации

## Подключение библиотек

In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

## Задание 1

Какова вероятность отправиться на прогулку если идёт дождь, при наличии следующих наблюдений?

In [7]:
data = [
    ("солнечно", True),
    ("снег", False),
    ("облачно", False),
    ("дождь", False),
    ("солнечно", True),
    ("снег", False),
    ("облачно", True),
    ("снег", False),
    ("солнечно", False),
    ("облачно", True),
    ("снег", True),
    ("солнечно", True),
    ("дождь", False),
    ("дождь", True),
    ("облачно", True),
]

In [8]:
counts = {weather[0]: (data.count(weather), data.count((weather[0], not weather[1]))) for weather in data}

new_data = pd.DataFrame({
    "Погода": [key for key in counts],
    "Прогулка (да)": [counts[key][0] for key in counts],
    "Прогулка (нет)": [counts[key][1] for key in counts],
})

new_data

Unnamed: 0,Погода,Прогулка (да),Прогулка (нет)
0,солнечно,3,1
1,снег,1,3
2,облачно,3,1
3,дождь,1,2


In [9]:
p_yes = 8 / 15

p_rain = 3 / 15

p_yes_rain = (1 / 15) / p_yes

p = (p_yes_rain * p_yes) / p_rain

p

0.3333333333333333

## Задание 2

In [10]:
data = pd.read_csv("card_transdata.csv")

data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


Проверим, что удаление объектов с нулевыми значениями не повлечёт за собой чрезмерное сокращение датасета:

In [11]:
data.shape, data.dropna().shape

((96225, 8), (96224, 8))

In [12]:
data = data.dropna()

Разделим выборку на матрицу "объект-признак" и прогнозируемый результат:

In [13]:
X = data.drop("fraud", axis=1)
y = data["fraud"]

Разобъём данные на обучающую и тестовую выборки:

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

Проведём масштабирование данных:

In [15]:
scaler = StandardScaler()

In [16]:
X_train = scaler.fit_transform(X_train)

In [17]:
X_test = scaler.transform(X_test)

Проверим балансировку классов:

In [18]:
np.sum(data["fraud"] == 0), np.sum(data["fraud"] == 1)

(87858, 8366)

Сбалансируем обучающую выборку:

In [19]:
ros = RandomOverSampler(random_state=42)

In [20]:
X_train, y_train = ros.fit_resample(X_train, y_train)

Обучим алгоритм k ближайших соседей с подбором параметра количества проверок по сетке:

In [21]:
knn = KNeighborsClassifier()
knn_params = {"n_neighbors": [3, 5, 7]}
knn_grid = GridSearchCV(knn, knn_params, cv=5, scoring="f1")
knn_grid.fit(X_train, y_train)

Обучение алгоритма наивного Байеса:

In [22]:
nb = GaussianNB()
nb.fit(X_train, y_train)

Обучение алгоритм логистической регресси с подбором параметра C по сетке:

In [23]:
lr = LogisticRegression()
lr_params = {"C": [0.1, 1, 10]}
lr_grid = GridSearchCV(lr, lr_params, cv=5, scoring="f1")
lr_grid.fit(X_train, y_train)

In [24]:
svm = SVC()
svm_params = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
svm_grid = GridSearchCV(svm, svm_params, cv=5, scoring="f1")
svm_grid.fit(X_train, y_train)

Получение прогнозов от каждой модели:

In [25]:
knn_pred = knn_grid.predict(X_test)
nb_pred = nb.predict(X_test)
lr_pred = lr_grid.predict(X_test)
svm_pred = svm_grid.predict(X_test)

In [30]:
print("[+] KNN Accuracy:", accuracy_score(y_test, knn_pred))
print("[+] KNN Precision:", precision_score(y_test, knn_pred))
print("[+] KNN Recall:", recall_score(y_test, knn_pred))
print("[+] KNN F1-Score:", f1_score(y_test, knn_pred))
print("[+] KNN ROC-AUC Score:", roc_auc_score(y_test, knn_pred))

[+] KNN Accuracy: 0.9964320354718027
[+] KNN Precision: 0.9698275862068966
[+] KNN Recall: 0.9896041583366654
[+] KNN F1-Score: 0.9796160696615873
[+] KNN ROC-AUC Score: 0.993341920636835


In [31]:
print("[+] Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))
print("[+] Naive Bayes Precision:", precision_score(y_test, nb_pred))
print("[+] Naive Bayes Recall:", recall_score(y_test, nb_pred))
print("[+] Naive Bayes F1-Score:", f1_score(y_test, nb_pred))
print("[+] Naive Bayes ROC-AUC Score:", roc_auc_score(y_test, nb_pred))

[+] Naive Bayes Accuracy: 0.9308230566717473
[+] Naive Bayes Precision: 0.5587138863000932
[+] Naive Bayes Recall: 0.9588164734106357
[+] Naive Bayes F1-Score: 0.7060209038716325
[+] Naive Bayes ROC-AUC Score: 0.9434921294500366


In [32]:
print("[+] Logistic regression Accuracy:", accuracy_score(y_test, lr_pred))
print("[+] Logistic regression Precision:", precision_score(y_test, lr_pred))
print("[+] Logistic regression Recall:", recall_score(y_test, lr_pred))
print("[+] Logistic regression F1-Score:", f1_score(y_test, lr_pred))
print("[+] Logistic regression ROC-AUC Score:", roc_auc_score(y_test, lr_pred))

[+] Logistic regression Accuracy: 0.9313080227241236
[+] Logistic regression Precision: 0.560969868173258
[+] Logistic regression Recall: 0.9528188724510196
[+] Logistic regression F1-Score: 0.7061786931397244
[+] Logistic regression ROC-AUC Score: 0.9410432588067666


In [33]:
print("[+] SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("[+] SVM Precision:", precision_score(y_test, svm_pred))
print("[+] SVM Recall:", recall_score(y_test, svm_pred))
print("[+] SVM F1-Score:", f1_score(y_test, svm_pred))
print("[+] SVM ROC-AUC Score:", roc_auc_score(y_test, svm_pred))

[+] SVM Accuracy: 0.9913745323541637
[+] SVM Precision: 0.9118507681053402
[+] SVM Recall: 0.9968012794882047
[+] SVM F1-Score: 0.9524355300859599
[+] SVM ROC-AUC Score: 0.9938305331714926
