In [23]:
import pandas as pd

In [24]:
data = pd.read_csv('Pokemon.csv')

In [25]:
data

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


Column descriptions (these are crucial!)


- #: ID for each pokemon
- Name: Name of each pokemon
- Type 1: Each pokemon has a type, this determines weakness/resistance to attacks
- Type 2: Some pokemon are dual type and have 2
- Total: sum of all stats that come after this, a general guide to how strong a pokemon is
- HP: hit points, or health, defines how much damage a pokemon can withstand before fainting
- Attack: the base modifier for normal attacks (e.g. Scratch, Punch)
- Defense: the base damage resistance against normal attacks
- Sp. Atk: special attack, the base modifier for special attacks (e.g. Fire Blast, Bubble Beam)
- Sp. Def: special defense, the base damage resistance against special attacks
- Speed: determines which pokemon attacks first each round

In [26]:
# Report missing values per column, fill the missing secondary type and
# drop the identifier columns ('#', 'Name').

display(data.isnull().sum())
data['Type 2'] = data['Type 2'].fillna('No 2nd type')

# Re-assign instead of mutating in place, and tolerate already-removed columns:
# with inplace=True a second run of this cell raised KeyError because '#' and
# 'Name' were gone after the first run.
data = data.drop(columns = ['#', 'Name'], errors = 'ignore')

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [27]:
# Target: the Legendary flag cast to integers; features: every other column.
y = data['Legendary'].astype('int')
X = data.drop(columns = ['Legendary'])

In [28]:
X.head()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
0,Grass,Poison,318,45,49,49,65,65,45,1
1,Grass,Poison,405,60,62,63,80,80,60,1
2,Grass,Poison,525,80,82,83,100,100,80,1
3,Grass,Poison,625,80,100,123,122,120,80,1
4,Fire,No 2nd type,309,39,52,43,60,50,65,1


In [29]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Legendary, dtype: int64

In [30]:
y.value_counts(normalize = True)

Legendary
0    0.91875
1    0.08125
Name: proportion, dtype: float64

# Build a default pipeline

In [31]:
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
import sklearn

In [32]:
# Categorical feature columns handed to the encoder step.
cat_cols = ['Type 1', 'Type 2']

# Baseline model: encode the categorical columns, standardise all features,
# then fit an SVM with a linear kernel.
default_pipeline = Pipeline(
    steps = [
        ('cat_encoder_', LeaveOneOutEncoder(cols = cat_cols)),
        ('scaler_', StandardScaler()),
        ('model_', SVC(kernel = 'linear')),
    ]
)

In [33]:
# 5-fold cross-validation of the baseline pipeline, scored with F1 on both the
# training and the held-out part of every fold; fitting errors are re-raised.
cv_res1 = cross_validate(
    estimator = default_pipeline,
    X = X,
    y = y,
    scoring = 'f1',
    cv = 5,
    n_jobs = -1,
    return_train_score = True,
    error_score = 'raise',
)

In [34]:
cv_res1

{'fit_time': array([0.07032895, 0.08630991, 0.09872198, 0.08691692, 0.06466913]),
 'score_time': array([0.00832582, 0.02092385, 0.00578094, 0.0136199 , 0.02724695]),
 'test_score': array([0.5       , 0.72727273, 0.47619048, 0.38095238, 0.64864865]),
 'train_score': array([0.71287129, 0.56097561, 0.6744186 , 0.7311828 , 0.72727273])}

In [35]:
cv_res1['train_score'].mean()

0.681344204901525

In [36]:
cv_res1['test_score'].mean()

0.5466128466128466

# Make the pipeline more complex

In [37]:
import pandas as pd
from category_encoders import LeaveOneOutEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVC

In [38]:
# More complex pipeline: same as the baseline, with degree-4 polynomial
# features generated between the encoding and the scaling steps.
pipe_dif = Pipeline(
    steps = [
        ('cat_encoder_', LeaveOneOutEncoder(cols = cat_cols)),
        ('poly_featurizer_', PolynomialFeatures(degree = 4)),
        ('scaler_', StandardScaler()),
        ('model_', SVC(kernel = 'linear')),
    ]
)

In [39]:
# 5-fold cross-validation of the polynomial pipeline, scored with F1 on the
# train and validation parts of each fold.
# error_score = 'raise' matches the baseline run: without it a fold that fails
# to fit is only reported as a NaN score (plus a warning) and silently skews
# the averaged results below.
cv_res2 = cross_validate(pipe_dif,
                        X,
                        y,
                        cv = 5,
                        scoring = 'f1',
                        n_jobs = -1,
                        return_train_score = True,
                        error_score = 'raise'
                       )

cv_res2

{'fit_time': array([0.09452009, 0.05255699, 0.0844748 , 0.095402  , 0.09460187]),
 'score_time': array([0.011096  , 0.00907397, 0.01084805, 0.00792122, 0.007864  ]),
 'test_score': array([0.375     , 0.88888889, 0.5       , 0.66666667, 0.53658537]),
 'train_score': array([0.95145631, 0.89583333, 0.97142857, 0.96153846, 0.98076923])}

In [40]:
cv_res2['train_score'].mean()

0.9522051815498418

In [41]:
cv_res2['test_score'].mean()

0.5934281842818427

train_score — просто класс! Модель получилась сложная, но, очевидно, переобученная...

согласны, узнали ?


# Introduce feature selectors

In [42]:
pipe_dif

In [43]:
# All steps of pipe_dif except the last one ('model_'), i.e. only the
# preprocessing / feature-generation part.
# NOTE(review): per the scikit-learn docs a Pipeline slice is a shallow copy,
# so fitting data_tr also fits the step objects held by pipe_dif — confirm.
data_tr = pipe_dif[:-1]

In [44]:
data_tr

In [45]:
help(pipe_dif)

Help on Pipeline in module sklearn.pipeline object:

class Pipeline(sklearn.utils.metaestimators._BaseComposition)
 |  Pipeline(steps, *, memory=None, verbose=False)
 |  
 |  A sequence of data transformers with an optional final predictor.
 |  
 |  `Pipeline` allows you to sequentially apply a list of transformers to
 |  preprocess the data and, if desired, conclude the sequence with a final
 |  :term:`predictor` for predictive modeling.
 |  
 |  Intermediate steps of the pipeline must be 'transforms', that is, they
 |  must implement `fit` and `transform` methods.
 |  The final :term:`estimator` only needs to implement `fit`.
 |  The transformers in the pipeline can be cached using ``memory`` argument.
 |  
 |  The purpose of the pipeline is to assemble several steps that can be
 |  cross-validated together while setting different parameters. For this, it
 |  enables setting parameters of the various steps using their names and the
 |  parameter name separated by a `'__'`, as in the 

In [46]:
X_tr = data_tr.fit_transform(X, y)

In [47]:
X_tr

array([[ 0.        , -0.40675128, -1.46809988, ..., -0.7760009 ,
        -0.82565808, -0.78950478],
       [ 0.        , -0.40675128, -1.46809988, ..., -0.75915834,
        -0.82278699, -0.78950478],
       [ 0.        , -0.40675128, -1.46809988, ..., -0.72921601,
        -0.81895887, -0.78950478],
       ...,
       [ 0.        ,  1.62491492, -1.46809988, ...,  1.0887111 ,
         2.05978425,  2.42568009],
       [ 0.        ,  1.62491492, -1.46809988, ...,  1.6661703 ,
         2.47322076,  2.42568009],
       [ 0.        , -0.03035286, -1.46809988, ...,  1.0887111 ,
         2.05978425,  2.42568009]])

In [48]:
print(f'data shape after transformation is {X_tr.shape}')

data shape after transformation is (800, 1001)


1k признаков - многовато, добавим в пайплайн селектор

## Фильтрационные методы

Суть таких методов в том, чтобы для каждого признака посчитать некоторую метрику "связи" с целевым признаком. И в результате оставить топ-K признаков согласно выбранной метрике.

В том числе:

 - статистика хи-квадрат
 - mutual_info_classif

In [49]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif

In [61]:
from functools import partial

# Number of features kept by the filter-based selector (a hyperparameter).
k_best = 50

# mutual_info_classif adds random noise to continuous features, so its scores
# (and therefore the selected columns) change between runs unless seeded.
seeded_mutual_info = partial(mutual_info_classif, random_state = 42)

# Polynomial pipeline with a filter-style selector placed before the model.
# The selector now actually uses k_best instead of a duplicated literal 50,
# so changing k_best above changes the pipeline as intended.
pipe = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols = cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree = 4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func = seeded_mutual_info, k = k_best)),
    ('model_', SVC(kernel = 'linear'))]
)

In [62]:
# 5-fold F1 cross-validation of the SelectKBest pipeline; errors are re-raised.
cv_res = cross_validate(
    pipe,
    X,
    y,
    cv = 5,
    scoring = 'f1',
    return_train_score = True,
    error_score = 'raise',
)
cv_res

{'fit_time': array([1.25422812, 1.20774603, 1.18757701, 1.19072104, 1.23731899]),
 'score_time': array([0.00390887, 0.00380707, 0.00351906, 0.00411797, 0.00370717]),
 'test_score': array([0.14285714, 0.72      , 0.71428571, 0.42105263, 0.6       ]),
 'train_score': array([0.8       , 0.69565217, 0.75247525, 0.75247525, 0.8       ])}

In [64]:
cv_res['train_score'].mean()

0.7601205337925097

In [63]:
# k_best needs to be tuned

cv_res['test_score'].mean()

0.5196390977443609

## Жадный метод отбора

In [65]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [66]:
# Recursive feature elimination ranked by a logistic regression: keep k_best
# features, with step = 30 controlling how many are dropped per iteration.
rfe = RFE(
    estimator = LogisticRegression(max_iter = 1000),
    n_features_to_select = k_best,
    step = 30,
)

In [67]:
X_tr.shape

(800, 1001)

In [68]:
res = rfe.fit_transform(X_tr, y)
display(res.shape)
res

(800, 50)

array([[-0.98555744, -0.94218651, -0.91241606, ..., -0.41626469,
        -0.52135831, -0.72668962],
       [-0.48479877, -0.94218651, -0.51386057, ..., -0.10954899,
        -0.18003271, -0.70651232],
       [ 0.42451538, -0.94218651,  0.42528182, ...,  0.67405053,
         0.73602435, -0.6731154 ],
       ...,
       [-0.16049792, -0.94218651,  0.13418659, ...,  4.39362648,
         2.1076934 ,  4.87819432],
       [-0.16049792, -0.94218651,  0.31864298, ...,  5.8291638 ,
         2.51608978,  4.87819432],
       [ 1.36562373, -0.94218651,  1.51760948, ...,  1.1636675 ,
         0.19750623,  1.93926564]])

In [69]:
# Polynomial pipeline whose selector is RFE (logistic-regression ranking,
# 30 features kept, step of 30) in front of the linear-kernel SVM.
pipe_rfe = Pipeline(
    steps=[
        ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
        ('poly_featurizer_', PolynomialFeatures(degree=4)),
        ('scaler_', StandardScaler()),
        (
            'selector_',
            RFE(
                estimator=LogisticRegression(max_iter=1000),
                n_features_to_select=30,
                step=30,
            ),
        ),
        ('model_', SVC(kernel='linear')),
    ]
)

In [70]:
# 5-fold F1 cross-validation of the RFE pipeline.
# error_score='raise' makes a failing fold stop the run instead of being
# recorded as a NaN score that would distort the mean computed afterwards.
cv_res3 = cross_validate(pipe_rfe, X, y, cv=5, scoring='f1', return_train_score=True, error_score='raise')
cv_res3

{'fit_time': array([3.72923589, 3.5540092 , 3.31338501, 5.84997725, 4.50356102]),
 'score_time': array([0.00377917, 0.00471091, 0.008425  , 0.00409484, 0.00644493]),
 'test_score': array([0.33333333, 0.84615385, 0.83870968, 0.48      , 0.75      ]),
 'train_score': array([0.91089109, 0.82105263, 0.91262136, 0.91836735, 0.96078431])}

In [71]:
cv_res3['test_score'].mean()

0.6496393713813069

## С помощью L1 регуляризации

In [72]:
from sklearn.feature_selection import SelectFromModel

In [73]:
# Selector driven by an L1-penalised logistic regression (liblinear solver);
# the threshold passed to SelectFromModel is 1e-5.
sel = SelectFromModel(
    estimator = LogisticRegression(penalty = 'l1', solver = 'liblinear', max_iter = 1000),
    threshold = 1e-5,
)

In [74]:
# example: fit the L1-based selector on the transformed matrix X_tr and show
# the shape and contents of the reduced feature array

res = sel.fit_transform(X_tr, y)
display(res.shape)
res

(800, 49)

array([[-1.39963712, -1.20562657, -0.44705251, ..., -0.52135831,
        -0.72668962, -0.81966779],
       [-1.39963712, -1.20562657, -0.37458929, ..., -0.18003271,
        -0.70651232, -0.81698899],
       [-1.39963712, -1.20562657, -0.24576578, ...,  0.73602435,
        -0.6731154 , -0.81341726],
       ...,
       [ 1.61195431, -1.20562657,  0.66347512, ...,  2.1076934 ,
         4.87819432,  4.18343684],
       [ 1.61195431, -1.20562657,  1.03195307, ...,  2.51608978,
         4.87819432,  4.18343684],
       [ 1.61195431, -1.20562657, -0.13354138, ...,  0.19750623,
         1.93926564,  2.6404483 ]])

In [75]:
# Polynomial pipeline with an L1-logistic-regression SelectFromModel selector
# placed in front of the linear-kernel SVM.
pipe_lasso = Pipeline(
    steps=[
        ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
        ('poly_featurizer_', PolynomialFeatures(degree=4)),
        ('scaler_', StandardScaler()),
        (
            'selector_',
            SelectFromModel(
                estimator=LogisticRegression(penalty = 'l1', solver = 'liblinear', max_iter = 1000),
                threshold=1e-5,
            ),
        ),
        ('model_', SVC(kernel = 'linear')),
    ]
)

In [76]:
# 5-fold F1 cross-validation of the L1-selector pipeline.
# error_score = 'raise' makes a failing fold stop the run instead of being
# recorded as a NaN score that would distort the means computed afterwards.
cv_res4 = cross_validate(pipe_lasso, X, y, cv = 5, scoring = 'f1', return_train_score = True, error_score = 'raise')
cv_res4

{'fit_time': array([0.08206892, 0.08668113, 0.07276487, 0.064605  , 0.08207011]),
 'score_time': array([0.0039649 , 0.00360203, 0.00409913, 0.00446415, 0.00374293]),
 'test_score': array([0.44444444, 0.84615385, 0.64285714, 0.60869565, 0.68421053]),
 'train_score': array([0.92307692, 0.89795918, 0.95145631, 0.93877551, 0.92929293])}

In [77]:
cv_res4['train_score'].mean()

0.928112171385403

In [78]:
cv_res4['test_score'].mean()

0.6452723223890271

# Нелинейные классификаторы

- SVM с ядром
- Наивный байесовский классификатор
- Метод k ближайших соседей

In [79]:
help(SVC)

Help on class SVC in module sklearn.svm._classes:

class SVC(sklearn.svm._base.BaseSVC)
 |  SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time scales at least
 |  quadratically with the number of samples and may be impractical
 |  beyond tens of thousands of samples. For large datasets
 |  consider using :class:`~sklearn.svm.LinearSVC` or
 |  :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a
 |  :class:`~sklearn.kernel_approximation.Nystroem` transformer or
 |  other :ref:`kernel_approximation`.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `g

In [80]:
# Same preprocessing and L1-based selector as the linear variant, but the
# final classifier is an SVM with an RBF kernel.
pipe_lasso2 = Pipeline(
    steps = [
        ('cat_encoder_', LeaveOneOutEncoder(cols = cat_cols)),
        ('poly_featurizer_', PolynomialFeatures(degree = 4)),
        ('scaler_', StandardScaler()),
        (
            'selector_',
            SelectFromModel(
                estimator = LogisticRegression(penalty = 'l1', solver = 'liblinear', max_iter = 1000),
                threshold = 1e-5,
            ),
        ),
        ('model_', SVC(kernel = 'rbf')),
    ]
)

In [81]:
# 5-fold F1 cross-validation of the RBF-kernel pipeline.
# error_score = 'raise' makes a failing fold stop the run instead of being
# recorded as a NaN score that would distort the means computed afterwards.
cv_res5 = cross_validate(pipe_lasso2, X, y, cv = 5, scoring = 'f1', return_train_score = True, error_score = 'raise')
cv_res5

{'fit_time': array([0.08592105, 0.07492518, 0.07165504, 0.06402302, 0.07623506]),
 'score_time': array([0.00577378, 0.00622988, 0.00545406, 0.00497794, 0.004812  ]),
 'test_score': array([0.35294118, 0.7       , 0.5       , 0.42105263, 0.54545455]),
 'train_score': array([0.90909091, 0.83146067, 0.85106383, 0.91666667, 0.91836735])}

In [82]:
cv_res5['train_score'].mean()

0.8853298853281777

In [83]:
cv_res5['test_score'].mean()

0.5038896707008161

Обучите наивный байесовский классификатор и метод k ближайших соседей вместо SVM в пайплайне выше. 

In [84]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [None]:
# your code here

С помощью GridSearch подберите гиперпараметр KNN (число соседей) внутри пайплайна.

In [None]:
# your code here

# Сохранение и загрузка модели

In [85]:
import pickle

Обучим лучшую модель на всех данных и сохраним её в файл.

In [None]:
#your code here

model = ...

Сохранение модели в файл

In [None]:
# Serialise the fitted model to disk.
filename = 'best_model.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open(...) call left the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Загрузка модели из файла

In [None]:
# Restore the model saved under `filename`.
# SECURITY: unpickling can execute arbitrary code — only load files that you
# created yourself or that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

Почитать подробнее про сохранение модели в файл и загрузку из файла тут: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/