In [154]:
import pandas as pd


# Source CSV, pinned to a specific gist revision in the URL.
DATA_URL = (
    "https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6"
    "/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv"
)
# Load the whole table into a DataFrame.
data = pd.read_csv(filepath_or_buffer=DATA_URL)

In [155]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [156]:
# Distinct values present in the Generation column.
pd.unique(data["Generation"])

array([1, 2, 3, 4, 5, 6])

In [157]:
# Columns treated as categorical and one-hot encoded further down.
cat_features = [
    "Type 1",
    "Type 2",
    "Generation",
]

## **Use one-hot encoding**

In [158]:
# Cast Generation to strings so that get_dummies treats it as a categorical
# column rather than passing it through as a number.
data = data.astype({"Generation": str})

In [159]:
# Show the Generation column to check its dtype after the cast.
data.loc[:, "Generation"]

0      1
1      1
2      1
3      1
4      1
      ..
795    6
796    6
797    6
798    6
799    6
Name: Generation, Length: 800, dtype: object

In [160]:
# One-hot encode the categorical columns; each distinct value becomes its own
# indicator column named "<column>_<value>".
ohe_encoded_cat_features = pd.get_dummies(data=data.loc[:, cat_features])

In [161]:
# Remove the raw categorical columns (they were one-hot encoded into a separate
# frame) together with the "Name" and "#" columns.
# The result is rebound to `data` instead of using `inplace=True`: in-place
# mutation gives no benefit and silently changes a frame that earlier cells
# have already displayed.
cols_to_drop = cat_features + ["Name", "#"]
data = data.drop(columns=cols_to_drop)

In [162]:
# Split the remaining table into the label column and the feature columns.
target = data["Legendary"]
numeric_features = data.drop(columns="Legendary")

In [163]:
# Encode the label as integers.
target = target.astype(dtype=int)

In [164]:
from sklearn.model_selection import train_test_split

In [165]:
# Split the encoded categorical features, the numeric features and the target
# with one call so that all three share the same row partition.
# 20% of the rows go to the test part; the seed fixes the shuffle.
(
    X_cat_train, X_cat_test,
    X_num_train, X_num_test,
    y_train, y_test,
) = train_test_split(
    ohe_encoded_cat_features,
    numeric_features,
    target,
    random_state=17,
    test_size=0.2,
)

In [166]:
from sklearn.preprocessing import StandardScaler

In [167]:
# Learn the scaling statistics from the training rows only, so no information
# from the test rows reaches the scaler.
scaler = StandardScaler()
scaler.fit(X=X_num_train)

In [168]:
# Apply the scaler fitted on the training rows to both splits.
# New float DataFrames are built (same index and column labels) instead of
# assigning through `.loc[:, :]`: writing floats into the original columns
# (integer-valued in the data preview) relies on implicit dtype upcasting,
# which recent pandas versions deprecate. It would also write into frames
# returned by train_test_split, which may be slices of the source frame.
# NOTE: as before, re-running this cell on its own scales the data a second time.
X_num_train = pd.DataFrame(
    scaler.transform(X_num_train),
    index=X_num_train.index,
    columns=X_num_train.columns,
)
X_num_test = pd.DataFrame(
    scaler.transform(X_num_test),
    index=X_num_test.index,
    columns=X_num_test.columns,
)

In [169]:
# Display the numeric test features after scaling as a visual sanity check.
X_num_test

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
201,0.531785,0.818260,-0.140578,0.018348,0.521474,1.022130,0.064073
797,1.372613,0.417888,0.944890,-0.466879,2.383190,2.113312,0.064073
781,-0.855582,-1.023450,-0.419698,-0.143395,-0.905841,-0.614642,-0.416745
25,-0.199736,-0.583041,0.045502,-0.466879,-0.719670,-0.069051,0.991366
614,-1.023748,0.017516,0.324623,-0.952106,-1.805670,-0.978369,-0.622810
...,...,...,...,...,...,...,...
172,-1.032156,-0.783227,-0.450711,-0.337486,-0.905841,-0.869251,-0.863219
416,1.204447,0.417888,-0.140578,2.444483,0.056045,2.840766,-0.622810
588,-0.914440,-0.382856,0.169556,-1.113849,-1.340241,-0.978369,-0.004615
563,-1.528245,-0.983413,-0.760845,-1.146197,-1.185098,-1.196606,-0.897564


In [170]:
# Rebuild the full feature matrices: numeric columns first, then the one-hot
# columns, aligned on the row index within each split.
X_train, X_test = (
    numeric_part.join(categorical_part)
    for numeric_part, categorical_part in (
        (X_num_train, X_cat_train),
        (X_num_test, X_cat_test),
    )
)

In [171]:
# Count missing values per column to confirm the index-aligned join left no gaps.
X_train.isnull().sum(axis=0)

Total              0
HP                 0
Attack             0
Defense            0
Sp. Atk            0
Sp. Def            0
Speed              0
Type 1_Bug         0
Type 1_Dark        0
Type 1_Dragon      0
Type 1_Electric    0
Type 1_Fairy       0
Type 1_Fighting    0
Type 1_Fire        0
Type 1_Flying      0
Type 1_Ghost       0
Type 1_Grass       0
Type 1_Ground      0
Type 1_Ice         0
Type 1_Normal      0
Type 1_Poison      0
Type 1_Psychic     0
Type 1_Rock        0
Type 1_Steel       0
Type 1_Water       0
Type 2_Bug         0
Type 2_Dark        0
Type 2_Dragon      0
Type 2_Electric    0
Type 2_Fairy       0
Type 2_Fighting    0
Type 2_Fire        0
Type 2_Flying      0
Type 2_Ghost       0
Type 2_Grass       0
Type 2_Ground      0
Type 2_Ice         0
Type 2_Normal      0
Type 2_Poison      0
Type 2_Psychic     0
Type 2_Rock        0
Type 2_Steel       0
Type 2_Water       0
Generation_1       0
Generation_2       0
Generation_3       0
Generation_4       0
Generation_5 

Preprocessing

In [172]:
from sklearn.linear_model import LogisticRegression


# Logistic regression with the library's default hyper-parameters.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [173]:
from sklearn.metrics import roc_auc_score


# Score the logistic regression by ROC AUC on both splits.
# Column 1 of predict_proba is the probability of the second class (label 1).
y_pred_train, y_pred_test = (
    lr.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
roc_train = roc_auc_score(y_true=y_train, y_score=y_pred_train)
roc_test = roc_auc_score(y_true=y_test, y_score=y_pred_test)

# Training score first, then test score, one per line.
print(roc_train, roc_test, sep="\n")

0.9866506874396618
0.9809197651663405


In [174]:
# Shape of the predicted training probabilities (one value per training row).
y_pred_train.shape

(640,)

## Наивный байесовский классификатор

In [175]:
from sklearn.naive_bayes import GaussianNB


# Gaussian naive Bayes with default settings, fitted on the full feature matrix.
# NOTE(review): GaussianNB models every column as a per-class Gaussian,
# including the one-hot indicator columns. The ROC AUC recorded for this model
# is far below the other classifiers; fitting on the scaled numeric columns
# only may be a fairer baseline. Confirm before drawing conclusions.
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

In [176]:
from sklearn.metrics import roc_auc_score


# Score the naive Bayes model by ROC AUC on both splits.
# Column 1 of predict_proba is the probability of the second class (label 1).
y_pred_train, y_pred_test = (
    nb.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
roc_train = roc_auc_score(y_true=y_train, y_score=y_pred_train)
roc_test = roc_auc_score(y_true=y_test, y_score=y_pred_test)

# Training score first, then test score, one per line.
print(roc_train, roc_test, sep="\n")

0.6519524617996604
0.5393835616438356


## k-ближайших соседей

In [177]:
from sklearn.neighbors import KNeighborsClassifier


# k-nearest-neighbours classifier with the library's default settings.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=y_train)

In [178]:
from sklearn.metrics import roc_auc_score


# Score the k-NN model by ROC AUC on both splits.
# Column 1 of predict_proba is the probability of the second class (label 1).
y_pred_train, y_pred_test = (
    knn.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
roc_train = roc_auc_score(y_true=y_train, y_score=y_pred_train)
roc_test = roc_auc_score(y_true=y_test, y_score=y_pred_test)

# Training score first, then test score, one per line.
print(roc_train, roc_test, sep="\n")

0.9879323546056793
0.9796966731898239


## Деревья решений

In [179]:
from sklearn.tree import DecisionTreeClassifier


# Decision tree limited to depth 5.
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits can be resolved differently from run to run. Fixing
# random_state (the same seed as the train/test split) makes the fitted tree,
# and therefore its scores, reproducible.
tree = DecisionTreeClassifier(max_depth=5, random_state=17)
tree.fit(X_train, y_train)

In [180]:
from sklearn.metrics import roc_auc_score


# Score the decision tree by ROC AUC on both splits.
# Column 1 of predict_proba is the probability of the second class (label 1).
y_pred_train, y_pred_test = (
    tree.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
roc_train = roc_auc_score(y_true=y_train, y_score=y_pred_train)
roc_test = roc_auc_score(y_true=y_test, y_score=y_pred_test)

# Training score first, then test score, one per line.
print(roc_train, roc_test, sep="\n")

0.9949232664203203
0.9535225048923679


## Метод опорных векторов

In [181]:
from sklearn.svm import SVC


# Support vector classifier with probability estimates enabled, which
# predict_proba requires.
# The probability calibration uses an internal cross-validation that shuffles
# the data, so random_state is fixed (the same seed as the train/test split)
# to make the predicted probabilities reproducible.
svm = SVC(probability=True, random_state=17)
svm.fit(X_train, y_train)

In [182]:
from sklearn.metrics import roc_auc_score


# Score the SVM by ROC AUC on both splits.
# Column 1 of predict_proba is the probability of the second class (label 1).
y_pred_train, y_pred_test = (
    svm.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
roc_train = roc_auc_score(y_true=y_train, y_score=y_pred_train)
roc_test = roc_auc_score(y_true=y_test, y_score=y_pred_test)

# Training score first, then test score, one per line.
print(roc_train, roc_test, sep="\n")

0.9966709943739805
0.9838551859099804
