# Введение в машинное обучение

## Семинар #8

### Екатерина Кондратьева

ekaterina.kondrateva@skoltech.ru

## Отбор и генерация признаков (Feature Engineering). Поиск и оптимизация модели (Grid Search). Поиск аномалий (Anomaly Detection)

# 1. Отбор и генерация признаков: Feature Engineering


Feature Engineering, как мы уже знаем из предыдущей лекции, — очень общий термин, который включает в себя:

- предобработку данных и составление датасета
- перевод категориальных признаков в бинарные
- заполнение пропусков данных
- снижение размерности данных (выбор характеристик)
- генерацию новых признаков из набора данных: как на основе знания предметной области, так и с помощью геометрических методов снижения размерности данных
    
### Источники:
    
   1. Размышления на тему https://habr.com/ru/company/mlclass/blog/248129/,
    https://habr.com/ru/company/mlclass/blog/249759/
   2. Для датасета Титаник https://habr.com/ru/company/otus/blog/433084/
   3. Лекция https://www.youtube.com/watch?v=leTyvBPhYzw

In [None]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
%matplotlib inline

## Пример 1: Выжившие в катастрофе Титаника

Соревнование: https://www.kaggle.com/c/titanic
Источник: https://www.kaggle.com/kernels/scriptcontent/13445201/download

In [None]:
import os

# Show the entries of the current working directory (displayed as the cell output).
os.listdir()

In [None]:
# Switch the working directory to the Titanic example folder, then execute the
# solved notebook with IPython's %run magic; the -i flag runs it inside the
# current interactive namespace.
os.chdir('feature_engineering_for_titanic/')
%run -i titanic_solved.ipynb

## Пример 2: Крестики-нолики

Данные: https://datahub.io/machine-learning/tic-tac-toe-endgame


Подходы к решению: https://towardsdatascience.com/tic-tac-toe-creating-unbeatable-ai-with-minimax-algorithm-8af9e52c1e7d

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions

In [None]:
# Move one directory level up again (the earlier cell changed into the Titanic folder).
os.chdir('../')

In [None]:
# Load the tic-tac-toe endgame table. The 'class' column is the target (cast to
# int); every other column is used as a feature.
data = pd.read_csv('data/tic-tac-toe_csv.csv')
y = data['class'].astype(int)
X = data.drop(columns='class')
# Preview three random rows of the raw table.
data.sample(3)

In [None]:
# Encode every board cell numerically: 'x' -> 1, 'o' -> -1, anything else -> 0.
# The mapping is applied column by column with Series.map because
# DataFrame.applymap is deprecated in recent pandas releases (2.1+); this form
# yields the same result on old and new pandas alike.
cell_codes = {'x': 1, 'o': -1}
X = X.apply(lambda column: column.map(lambda cell: cell_codes.get(cell, 0)))
# Preview four random encoded rows, ordered by their original index.
X.sample(4).sort_index()

In [None]:
# Baseline: compare three classifiers on the encoded board using the mean score
# of 5-fold cross-validation.
# The forest is seeded (random_state=404, the same seed the later cells use) so
# that this baseline is reproducible and directly comparable with them.
rf = RandomForestClassifier(n_estimators=400, max_depth=10, random_state=404)
svm = SVC(gamma='auto')
lr = LogisticRegression(solver='lbfgs')
clfs = (rf, svm, lr)

for clf in clfs:
    score = cross_val_score(clf, X, y, cv=5).mean()
    name = clf.__class__.__name__
    print(f"{name} scored {round(score, 3)}")

# 2. Оптимизация модели: поиск гиперпараметров

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Exhaustive search over SVM kernels and values of C; every candidate is
# evaluated with 5-fold cross-validation on the encoded board.
kernel_options = ('linear', 'rbf', 'poly', 'sigmoid')
c_values = [0.1, 0.2, 0.21, 0.23, 0.25, 0.3, 0.4, 0.5, 1, 5]
parameters = {'kernel': kernel_options, 'C': c_values}

svm_new = GridSearchCV(estimator=svm, param_grid=parameters, cv=5)
svm_new.fit(X, y)

# Report the best mean cross-validated score and the parameters that achieved it.
print(f"Best score for SVM {svm_new.best_score_}")
print(f"Best params for SVM {svm_new.best_params_}")

**Вопрос**: по какому критерию происходит выбор модели и гиперпараметров?

In [None]:
# Try one hand-made feature: the sum of all cell codes in a row of the table.
rf = RandomForestClassifier(n_estimators=400, max_depth=10, random_state=404)

# assign() returns a new frame, so X itself stays untouched. The previous
# `X_new = X` merely aliased X, which silently added the 'sum' column to X too.
X_new = X.assign(sum=X.sum(axis=1))
score = cross_val_score(rf, X_new, y, cv=5).mean()
name = rf.__class__.__name__

print(f"{name} scored {round(score, 3)}")

In [None]:
# Engineered features: the sum of the three cell codes along each row, column
# and diagonal of the board. Column names suggest T/M/B = top/middle/bottom and
# L/M/R = left/middle/right (assumption based on the names only).
board_lines = {
    'rowT': ('TL', 'TM', 'TR'),
    'rowM': ('ML', 'MM', 'MR'),
    'rowB': ('BL', 'BM', 'BR'),
    'colL': ('TL', 'ML', 'BL'),
    'colM': ('TM', 'MM', 'BM'),
    'colR': ('TR', 'MR', 'BR'),
    'diag1': ('TL', 'MM', 'BR'),
    'diag2': ('BL', 'MM', 'TR'),
}

X_features = pd.DataFrame()
for feature, (first, second, third) in board_lines.items():
    X_features[feature] = X[first] + X[second] + X[third]

In [None]:
# Score a seeded random forest on the engineered line-sum features
# (mean over 5 cross-validation folds).
rf = RandomForestClassifier(random_state=404, max_depth=10, n_estimators=400)

fold_scores = cross_val_score(rf, X_features, y, cv=5)
score = fold_scores.mean()
name = type(rf).__name__
print(f'{name} scored {round(score, 3)}')

In [None]:
# Repeat the SVM grid search (kernel x C, 5-fold cross-validation), this time
# on the engineered line-sum features instead of the raw board encoding.
kernel_options = ('linear', 'rbf', 'poly', 'sigmoid')
c_values = [0.1, 0.2, 0.21, 0.23, 0.25, 0.3, 0.4, 0.5, 1, 5]
parameters = {'kernel': kernel_options, 'C': c_values}

svm_new = GridSearchCV(estimator=svm, param_grid=parameters, cv=5)
svm_new.fit(X_features, y)

print(f"Best score for SVM {svm_new.best_score_}")
print(f"Best params for SVM {svm_new.best_params_}")

## Пример 1: использование полиномиальных характеристик

In [None]:
from sklearn.datasets import make_moons
# Generate a 100-sample "two moons" toy dataset with noise=0.2; the fixed
# random_state makes the sample reproducible. X and y are rebound here and no
# longer refer to the tic-tac-toe data.
X, y = make_moons(n_samples=100, noise=0.2, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions

# Fit four classifiers with default settings on the moons data and draw the
# decision regions of each one in its own subplot of a two-row grid.
clfs = [RandomForestClassifier(),
        LogisticRegression(),
        SVC(),
        KNeighborsClassifier()]

n_rows = 2
n_cols = (len(clfs) + 1) // 2  # number of columns needed for two rows

# 6 inches per subplot in each direction. The previous width expression,
# 6 * (len(clfs) + 1) // 2, multiplied before the floor division and therefore
# did not equal 6 inches per column.
fig, axes = plt.subplots(
    ncols=n_cols,
    nrows=n_rows,
    figsize=(6 * n_cols, 6 * n_rows),
    dpi=75
)

labels = ['RandomForestClassifier', 'LogisticRegression',
          'SVM with RBF kernel', 'kNN']

for clf, ax, label in zip(clfs, axes.flat, labels):
    clf.fit(X, y)
    # The plot is drawn onto `ax`; the return value is not assigned so that
    # `fig` keeps referring to the Figure created above.
    plot_decision_regions(X=X, y=y, clf=clf, legend=1, ax=ax)
    # NOTE: this is the accuracy on the very data the model was fitted on,
    # not a held-out estimate.
    accuracy = clf.score(X, y)
    ax.set_title(label + ', accuracy = ' + str(accuracy))

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from mlxtend.plotting import plot_decision_regions
from sklearn.pipeline import Pipeline

# Stand-alone transformed copy of the data: degree-2 polynomial terms followed
# by standardisation. The pipelines below repeat these steps internally.
polyTransform = PolynomialFeatures(degree=2)
scaler = StandardScaler()
X_poly = polyTransform.fit_transform(X)
X_poly = scaler.fit_transform(X_poly)

# Each pipeline owns a fresh StandardScaler. Previously the single `scaler`
# object (already fitted above) was placed into both pipelines; a Pipeline fits
# its steps in place, so fitting one pipeline would overwrite the scaler state
# used by the other.
clfTree = Pipeline([
    ('polyTransform', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
    ('decision_tree', DecisionTreeClassifier(max_depth=4)),
])

clfForest = Pipeline([
    ('polyTransform', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
    ('random_forest', RandomForestClassifier(n_estimators=25, max_depth=3)),
])

clfs = [clfTree, clfForest]

fig, axes = plt.subplots(
    ncols=2, nrows=1, figsize=(18, 8),
    dpi=75
)

labels = ['DecisionTreeClassifier', 'RandomForestClassifier']

for clf, ax, label in zip(clfs, axes.flat, labels):
    clf.fit(X, y)
    # Drawn onto `ax`; the return value is not assigned so that `fig` keeps
    # referring to the Figure created above.
    plot_decision_regions(X=X, y=y, clf=clf, legend=1, ax=ax)
    # NOTE: accuracy on the training data itself, not a held-out estimate.
    accuracy = clf.score(X, y)
    ax.set_title(label + ', accuracy = ' + str(accuracy))

## Пример 2: Мультиклассовая классификация в задаче предсказания типа стекла

Данные: https://www.kaggle.com/uciml/glass#glass.csv

In [None]:
# Load the glass table. 'Type' is the class label; all remaining columns are
# used as features.
data = pd.read_csv('data/glass.csv')
y = data['Type']
X = data.drop(columns='Type')
# Preview three random rows.
data.sample(3)

In [None]:
# Number of rows per value of 'Type', i.e. how the classes are balanced.
data['Type'].value_counts()

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
import time

# Compare three ways of handling the multiclass target for several base
# classifiers: the estimator as is, one-vs-rest and one-vs-one. For every
# combination record the mean macro-F1 over 5 folds and the elapsed time.
clfs = [DecisionTreeClassifier(max_depth=7),
        RandomForestClassifier(n_estimators=100, max_depth=3),
        KNeighborsClassifier(n_neighbors=5),
        LogisticRegression()]

names = ['Decision Tree', 'Random Forest', 'KNN', 'Logistic regression']

# (row label, function that wraps the base estimator) for each strategy.
strategies = [
    ('Multiclass', lambda estimator: estimator),
    ('OneVsRest', OneVsRestClassifier),
    ('OneVsOne', OneVsOneClassifier),
]
method_labels = [label for label, _ in strategies]

Scores = pd.DataFrame({'method': method_labels})
Times = pd.DataFrame({'method': method_labels})

for base_clf, name in zip(clfs, names):
    scores = []
    times = []
    for _, wrap in strategies:
        # Always wrap the *base* estimator. Previously the one-vs-one wrapper
        # was applied to the already one-vs-rest-wrapped estimator, so the
        # 'OneVsOne' row actually measured OneVsOne(OneVsRest(base)).
        clf = wrap(base_clf)
        start = time.perf_counter()  # start the timer
        score = cross_val_score(estimator=clf, X=X, y=y, scoring='f1_macro', cv=5).mean()
        end = time.perf_counter()
        times.append(end - start)
        scores.append(score)

    Scores[name] = scores
    Times[name] = times

In [None]:
# Mean cross-validated macro-F1: one row per strategy, one column per classifier.
Scores

In [None]:
# Elapsed seconds of each cross-validation run: one row per strategy, one column per classifier.
Times

#### Посмотрим на скоры лучших моделей. Как расположены предсказанные вероятности классов: ближе к 0 и 1 или около 0.5?

### Grid search для мультиклассовой классификации:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest

In [None]:
# Exercise template: replace both <YOUR CODE> placeholders with a parameter
# grid and an estimator (the name `dtc` suggests a decision tree classifier —
# assumption); the grid search below then uses 5-fold cross-validation.
parameters = {# <YOUR CODE>}

dtc= # <YOUR CODE>
dtc_new = GridSearchCV(dtc, parameters, cv=5)
dtc_new.fit(X, y)
print(f"Best score for dtc {dtc_new.best_score_}")
print(f"Best params for dtc {dtc_new.best_params_}")

In [None]:
# Stand-alone transformed copy of the data (degree-2 polynomial terms, then
# standardisation). NOTE: the pipelines below use degree=3 and redo the
# transformation internally.
polyTransform = PolynomialFeatures(degree=2)
scaler = StandardScaler()
X_poly = polyTransform.fit_transform(X)
X_poly = scaler.fit_transform(X_poly)

# Pipelines: polynomial expansion -> scaling -> keep the 6 best-scoring
# features -> tree-based classifier.
# Each pipeline owns a fresh StandardScaler. Previously the single `scaler`
# object (already fitted above) was placed into both pipelines; a Pipeline fits
# its steps in place, so the two pipelines would overwrite each other's scaler.
clfTree = Pipeline([
    ('polyTransform', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('dim_red', SelectKBest(k=6)),
    ('decision_tree', DecisionTreeClassifier(max_depth=4)),
])

clfForest = Pipeline([
    ('polyTransform', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('dim_red', SelectKBest(k=6)),
    ('random_forest', RandomForestClassifier(n_estimators=25, max_depth=3)),
])

In [None]:
# Repeat the multiclass-strategy comparison for the two feature-engineering
# pipelines: mean macro-F1 over 5 folds and elapsed time per combination.
clfs = [clfTree, clfForest]

# One label per pipeline. The list used to carry four names for two
# estimators and only worked because zip() stops at the shorter sequence.
names = ['Decision Tree', 'Random Forest']

# (row label, function that wraps the base estimator) for each strategy.
strategies = [
    ('Multiclass', lambda estimator: estimator),
    ('OneVsRest', OneVsRestClassifier),
    ('OneVsOne', OneVsOneClassifier),
]
method_labels = [label for label, _ in strategies]

Scores = pd.DataFrame({'method': method_labels})
Times = pd.DataFrame({'method': method_labels})

for base_clf, name in zip(clfs, names):
    scores = []
    times = []
    for _, wrap in strategies:
        # Always wrap the *base* pipeline. Previously the one-vs-one wrapper
        # was applied to the already one-vs-rest-wrapped estimator, so the
        # 'OneVsOne' row actually measured OneVsOne(OneVsRest(base)).
        clf = wrap(base_clf)
        start = time.perf_counter()  # start the timer
        score = cross_val_score(estimator=clf, X=X, y=y, scoring='f1_macro', cv=5).mean()
        end = time.perf_counter()
        times.append(end - start)
        scores.append(score)

    Scores[name] = scores
    Times[name] = times

In [None]:
# Mean cross-validated macro-F1: one row per strategy, one column per pipeline.
Scores

In [None]:
# Elapsed seconds of each cross-validation run: one row per strategy, one column per pipeline.
Times

# Отбор признаков:

### Мы можем проводить отбор признаков:
- на основе модели
- статистическими тестами 
- регуляризацией

In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel

In [None]:
# Univariate feature selector that keeps the 6 highest-scoring features
# (scored with SelectKBest's default score function).
select= SelectKBest(k=6)

In [None]:
# Fit the selector on (X, y) and display the reduced feature matrix.
select.fit_transform(X, y)

In [None]:
# Keep only the columns flagged by the fitted selector's support mask, so the
# selected features retain their column names.
X_new = X[X.columns[select.get_support()]]

In [None]:
# Per-feature scores computed by the selector during fitting.
select.scores_

 #### Попробуем реализовать это в пайплайне:


In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load the breast-cancer dataset and make a stratified train/test split with a
# fixed seed so the split is reproducible.
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=42,
    stratify=cancer.target,
)

In [None]:
# Feature selection inside a pipeline: keep the 6 best-scoring features,
# standardise them, then fit a small random forest. Because selection happens
# inside the pipeline, it is refitted on the training part of every fold.
# The final step is named 'random_forest' to match the estimator it holds
# (it was previously labelled 'decision_tree').
clf = Pipeline([
    ('selectk', SelectKBest(k=6)),
    ('scaler', StandardScaler()),
    ('random_forest', RandomForestClassifier(n_estimators=25, max_depth=3)),
])

# Mean score over 5 cross-validation folds on the training split.
cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5).mean()