In [1]:
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### 1. Загрузите датасет при помощи sklearn.datasets.fetch_covtype

In [2]:
dataset = fetch_covtype(shuffle=True)

In [3]:
x, y = dataset.data[:50000], dataset.target[:50000]
print(x.shape, y.shape)

(50000, 54) (50000,)


#### 2. Стратифицированно по классам разделите датасет на обучающую и тестовую выборку (в отношении 1 к 4)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, train_size=0.75)

#### 3. Обучите решающее дерево. Определите точность предсказания на тестовой выборке. Подберите гиперпараметры алгоритма

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)
y_pred = tree.predict(x_test)
accuracy_score(y_pred, y_test)

0.81696

In [6]:
parametrs = {'ccp_alpha': range (0, 10)}
grid = GridSearchCV(tree, parametrs, cv=6)
grid.fit(x_train, y_train)
grid.best_params_

{'ccp_alpha': 0}

In [7]:
parametrs = {'max_depth': range (1, 30)}
grid = GridSearchCV(tree, parametrs, cv=6)
grid.fit(x_train, y_train)
grid.best_params_

{'max_depth': 21}

In [8]:
parametrs = {'min_samples_leaf': range (1, 10, 2),
             'min_samples_split': range (2, 10, 2)}
grid = GridSearchCV(tree, parametrs, cv=6)
grid.fit(x_train, y_train)
grid.best_params_

{'min_samples_leaf': 1, 'min_samples_split': 2}

In [10]:
tree = DecisionTreeClassifier(ccp_alpha=0, max_depth=21, min_samples_leaf=1, min_samples_split=2, random_state=0)
tree.fit(x_train, y_train)
y_pred = tree.predict(x_test)
accuracy_score(y_pred, y_test)

0.82192

#### 4. Обучите случайный лес, подберите оптимальные гиперпараметры и сравните точность с решающим деревом.

In [14]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=0)
forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)
accuracy_score(y_pred, y_test)

0.86728

In [15]:
parametrs = {'n_estimators': range(100, 501, 50)}  
grid = GridSearchCV(forest, parametrs, cv=5)
grid.fit(x_train, y_train)
grid.best_params_

{'n_estimators': 450}

In [16]:
parametrs = {'max_features': ['auto', 'sqrt', 'log2', None]}
grid = GridSearchCV(forest, parametrs, cv=5)
grid.fit(x_train, y_train)
grid.best_params_

{'max_features': None}

In [17]:
parametrs = {'max_depth': range (1, 50)}
grid = GridSearchCV(forest, parametrs, cv=5)
grid.fit(x_train, y_train)
grid.best_params_

{'max_depth': 45}

In [18]:
parametrs = {'min_samples_leaf': range (1, 10, 2),
             'min_samples_split': range (2, 10, 2)}
grid = GridSearchCV(forest, parametrs, cv=5)
grid.fit(x_train, y_train)
grid.best_params_

{'min_samples_leaf': 1, 'min_samples_split': 2}

In [19]:
parametrs = {'criterion': ['gini', 'entropy']}
grid = GridSearchCV(forest, parametrs, cv=5)
grid.fit(x_train, y_train)
grid.best_params_

{'criterion': 'entropy'}

In [21]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=450,max_features=None,max_depth=45,criterion='entropy',random_state=0)
forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)
accuracy_score(y_pred, y_test)

0.89048

Вывод: Точность случайного леса лучше, чем у решающего дерева

#### 6. Обучите на тех же данных CatBoostClassifier. Подберите оптимальные параметры. 

In [8]:
from catboost import CatBoostClassifier

model = CatBoostClassifier()
model.fit(x_train, y_train, verbose=False)
y_pred = model.predict(x_test)
accuracy_score(y_pred, y_test)

0.8476

In [9]:
from sklearn.model_selection import GridSearchCV

model = CatBoostClassifier()
parametrs = {'depth':[6, 8, 10, None]}

grid = GridSearchCV(estimator=model, param_grid=parametrs, cv=2, n_jobs=-1)
grid.fit(x_train, y_train, verbose=False)
print("The best depth across searched params:\n", grid.best_params_)

The best depth across searched params:
 {'depth': 10}


In [10]:
model = CatBoostClassifier(iterations=1500,
                           custom_loss=['Accuracy'],
                           eval_metric='Accuracy',       
                           use_best_model=True)
model.fit(x_train, y_train, eval_set = (x_test, y_test), early_stopping_rounds=50, verbose=False)

<catboost.core.CatBoostClassifier at 0x7f906f6f44f0>

In [11]:
print("Best iteration: ", model.get_best_iteration())

Best iteration:  1495


#### 7. Для всех трёх алгоритмов (решающее дерево, случайный лес, CatBoost) найдите индексы топ-5 самых важных признаков

In [36]:
import numpy as np

tree_importances = tree.feature_importances_
indices = np.argsort(tree_importances)[::-1]
print("Indexes of the top-5 most important attributes: ", indices[:5])
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], tree_importances[indices[f]]))

Indexes of the top-5 most important attributes:  [0 5 9 3 4]
Feature ranking:
1. feature 0 (0.359501)
2. feature 5 (0.118494)
3. feature 9 (0.110800)
4. feature 3 (0.064681)
5. feature 4 (0.045462)
6. feature 7 (0.045316)
7. feature 6 (0.037691)
8. feature 1 (0.035964)
9. feature 8 (0.032527)
10. feature 2 (0.022522)


In [37]:
forest_importances = forest.feature_importances_
indices = np.argsort(forest_importances)[::-1]
print("Indexes of the top-5 most important attributes: ", indices[:5])
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], forest_importances[indices[f]]))

Indexes of the top-5 most important attributes:  [0 5 9 3 4]
Feature ranking:
1. feature 0 (0.425927)
2. feature 5 (0.102520)
3. feature 9 (0.098887)
4. feature 3 (0.057414)
5. feature 4 (0.047281)
6. feature 7 (0.038454)
7. feature 1 (0.033827)
8. feature 6 (0.032824)
9. feature 8 (0.027713)
10. feature 2 (0.023067)


In [40]:
model_importances = model.get_feature_importance()
indices = np.argsort(model_importances)[::-1]
print("Indexes of the top-5 most important attributes: ", indices[:5])
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], model_importances[indices[f]]))

Indexes of the top-5 most important attributes:  [0 5 9 3 4]
Feature ranking:
1. feature 0 (29.848894)
2. feature 5 (15.332662)
3. feature 9 (13.129037)
4. feature 3 (5.684703)
5. feature 4 (4.783116)
6. feature 6 (3.952754)
7. feature 1 (3.810602)
8. feature 7 (3.803290)
9. feature 12 (3.470155)
10. feature 10 (2.501143)
