# Лабораторная работа 2.
### Вариант 1

Работа выполнена студентом группы М1О-415Бки-19 Кравченко Д.В.

# Подготовка данных
## Загрузка датасета из файла

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pickle

dataset = pd.read_csv("datasets/breast_cancer.csv")
dataset = dataset.dropna(axis=1)
labels = dataset.pop('diagnosis').astype('category').cat.codes

## Разделение на тестовую и обучающую выборки

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit

seed = 1989
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

for train, test in sss.split(dataset, labels):
    pass

print(f'Train set size:\t{len(train)}\nTest set size:\t{len(test)}')


Train set size:	455
Test set size:	114


## Предобработка данных

In [3]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

id_col = ['id']
num_cols = [col for col in dataset.columns if col not in id_col]

numeric_transformer = StandardScaler()
id_transformer = 'drop'

preprocessor = ColumnTransformer(
    transformers=[
        ("id", id_transformer, id_col),
        ("num", numeric_transformer, num_cols)
    ],
    verbose_feature_names_out = False
)
preprocessor.set_output(transform='pandas')

# Обучение и оценка моделей

In [4]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

X_test = dataset.iloc[test]
y_test = labels.iloc[test]

X_train = dataset.iloc[train]
y_train = labels.iloc[train]


def fit_and_score(model, X_train, y_train, name):

    model.fit(X_train, y_train)

    with open(f"models/{name}.pkl", "wb") as f:
        pickle.dump(model, f, protocol=3)
    with open(f"models/{name}_params.txt", "w") as f:
        f.write(str(model.best_params_))

    print(model.best_params_, '\n' + '=' * 20)

    y_pred = model.predict(X_test)
    y_true = np.array(y_test)

    print(  f'Confusion matrix:\n {confusion_matrix(y_true, y_pred)}\n\n'
        f'Accuracy:\t{accuracy_score(y_true, y_pred)}\n'
        f'Recall:\t\t{recall_score(y_true, y_pred)}\n'
        f'Precision:\t{precision_score(y_true, y_pred)}\n'
        f'ROC AUC Curve:\t{roc_auc_score(y_true, y_pred)}\n'
        )

## Логистическая регрессия

In [5]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(max_iter=250)

log_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', log)
])

parameters = [
    {'clf__C': [1, 10], 'clf__solver': ('lbfgs', 'newton-cg', 'newton-cholesky', 'sag'), 'clf__penalty': ['l2']},
    {'clf__C': [1, 10], 'clf__solver': ['liblinear'], 'clf__penalty': ('l1', 'l2')},
    {'clf__C': [1, 10], 'clf__solver': ['saga'], 'clf__penalty': ('l1', 'l2')},
    {'clf__C': [1, 10], 'clf__solver': ['saga'], 'clf__penalty': ['elasticnet'], 'clf__l1_ratio': [0.1, 0.9]} 
]

log_model = GridSearchCV(log_pipeline, parameters)

fit_and_score(log_model, X_train, y_train, 'logistic')

{'clf__C': 1, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'} 
Confusion matrix:
 [[70  2]
 [ 2 40]]

Accuracy:	0.9649122807017544
Recall:		0.9523809523809523
Precision:	0.9523809523809523
ROC AUC Curve:	0.9623015873015873



## SVM

In [9]:
from sklearn.svm import SVC

svm = SVC()

svm_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', svm)
])

parameters = [
    {'clf__C': [1, 10], 'clf__kernel': ['poly'], 'clf__degree': [1,8], 'clf__gamma': ('scale', 'auto'), 'clf__coef0': [-1.0, 1.0]},
    {'clf__C': [1, 10], 'clf__kernel': ['poly'], 'clf__degree': [1,8], 'clf__gamma': [0, 20], 'clf__coef0': [-1.0, 1.0]},
    {'clf__C': [1, 10], 'clf__kernel': ['sigmoid'], 'clf__gamma': ('scale', 'auto'), 'clf__coef0': [-1.0, 1.0]},
    {'clf__C': [1, 10], 'clf__kernel': ['sigmoid'], 'clf__gamma': [0, 20], 'clf__coef0': [-1.0, 1.0]},
    {'clf__C': [1, 10], 'clf__kernel': ['rbf'], 'clf__gamma': ('scale', 'auto')},
    {'clf__C': [1, 10], 'clf__kernel': ['rbf'], 'clf__gamma': [0, 20]},
    {'clf__C': [1, 10], 'clf__kernel': ['linear'],}
]

svm_model = GridSearchCV(svm_pipeline, parameters)

fit_and_score(svm_model, X_train, y_train, 'svm')


{'clf__C': 1, 'clf__coef0': -1.0, 'clf__degree': 1, 'clf__gamma': 'scale', 'clf__kernel': 'poly'} 
Confusion matrix:
 [[71  1]
 [ 2 40]]

Accuracy:	0.9736842105263158
Recall:		0.9523809523809523
Precision:	0.975609756097561
ROC AUC Curve:	0.9692460317460317



## KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

knn_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', knn)
])

parameters = [
    {'clf__n_neighbors': [3, 12], 'clf__weights': ('uniform', 'distance'), 'clf__algorithm': ('ball_tree', 'kd_tree'), 
        'clf__leaf_size': [10, 60], 'clf__p': [1,2]},
    {'clf__n_neighbors': [3, 12], 'clf__weights': ('uniform', 'distance'), 'clf__algorithm': ['brute'], 'clf__p': [1,2]},
]

knn_model = GridSearchCV(knn_pipeline, parameters)

fit_and_score(knn_model, X_train, y_train, 'knn')

{'clf__algorithm': 'ball_tree', 'clf__leaf_size': 10, 'clf__n_neighbors': 12, 'clf__p': 2, 'clf__weights': 'distance'} 
Confusion matrix:
 [[70  2]
 [ 4 38]]

Accuracy:	0.9473684210526315
Recall:		0.9047619047619048
Precision:	0.95
ROC AUC Curve:	0.9384920634920634



## Наивный байесовский классификатор

In [21]:
from sklearn.naive_bayes import GaussianNB

bay = GaussianNB()

bay_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', bay)
])

parameters = [
    {'clf__var_smoothing': [1e-10, 1.0]}
]

bay_model = GridSearchCV(bay_pipeline, parameters)

fit_and_score(bay_model, X_train, y_train, 'bayes')

{'clf__var_smoothing': 1e-10} 
Confusion matrix:
 [[67  5]
 [ 6 36]]

Accuracy:	0.9035087719298246
Recall:		0.8571428571428571
Precision:	0.8780487804878049
ROC AUC Curve:	0.8938492063492064



## Случайный лес

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

forest = RandomForestClassifier()

forest_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', forest)
])

parameters = [
    {'clf__n_estimators': [60, 140], 'clf__criterion': ('gini', 'entropy', 'log_loss'), 'clf__min_samples_split': [1, 5], 
     'clf__min_samples_leaf': [2, 4], 'clf__min_weight_fraction_leaf': [0.0, 0.1], 'clf__max_features': ('sqrt', 'log2', None),
     'clf__min_impurity_decrease': [0.0, 0.1], 'clf__ccp_alpha':[0.0, 0.2]}
]

forest_model = HalvingGridSearchCV(forest_pipeline, parameters, n_jobs=-1)

fit_and_score(forest_model, X_train, y_train, 'randforest')

{'clf__ccp_alpha': 0.0, 'clf__criterion': 'entropy', 'clf__max_features': 'log2', 'clf__min_impurity_decrease': 0.0, 'clf__min_samples_leaf': 4, 'clf__min_samples_split': 5, 'clf__min_weight_fraction_leaf': 0.0, 'clf__n_estimators': 60} 
Confusion matrix:
 [[71  1]
 [ 5 37]]

Accuracy:	0.9473684210526315
Recall:		0.8809523809523809
Precision:	0.9736842105263158
ROC AUC Curve:	0.933531746031746



# Итоги
|              | Accuracy | Recall | Precision | ROC AUC Curve |
|--------------|----------|--------|-----------|---------------|
| Лог. регр-я  | 0.965    | 0.952  | 0.952     | 0.962         |
| SVM          | 0.974    | 0.952  | 0.976     | 0.969         |
| KNN          | 0.947    | 0.905  | 0.950     | 0.938         |
| Наивн. Байес | 0.903    | 0.857  | 0.878     | 0.894         |
| Случ. лес    | 0.947    | 0.880  | 0.974     | 0.933         |

Как можно увидеть по сравнению оценок моделей, наилучшие результаты показал метод SVM. Наихудший результат показан наивным Байесовским классификатором, что обусловлено тем, что данный классификатор лучше работает при независимых друг от друга параметрах. Лучшая достигнутая точность составляет 97.4%, что превосходит результаты 1 л/р, но все еще недостаточно для реального применения.
