In [139]:
import pandas as pd
import matplotlib
import pylab as pl
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier


def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)


df = pd.read_csv('test.csv')
df.drop_duplicates()


for i in ["satisfied", "neutral or dissatisfied"]:
    print(len(df[df['satisfaction']==i]))

11403
14573


В качестве датасета были выбраны данные о клиентах авиакомпании и о их удовлетворённости полётом, каждый пассажир обладает такими атрибутами, как пол, возраст, класс полёта, причина поездки, сервис и тд, а так же атрибутом satisfaction, который может принимать всего два значения - "удовлетворён" и "неудовлетворён". Именно этот атрибут будет выступать в качестве класса. Количество удовлетворенных полетом людей - 11403, неудовлетворённых - 14573, дисбаланс классов практически отсутствует

Приведем строковые данные к числовому виду (мужской пол - 0, женский - 1, и тд):

In [140]:
df['Gender'] = pd.factorize(df['Gender'])[0]
df['Customer Type'] = pd.factorize(df['Customer Type'])[0]
df['Type of Travel'] = pd.factorize(df['Type of Travel'])[0]
df['Class'] = pd.factorize(df['Class'])[0]
df['satisfaction']=pd.factorize(df['satisfaction'])[0]
clean_dataset(df)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0.0,19556.0,0.0,0.0,52.0,0.0,0.0,160.0,5.0,4.0,...,5.0,5.0,5.0,5.0,2.0,5.0,5.0,50.0,44.0,0.0
1,1.0,90035.0,0.0,0.0,36.0,0.0,1.0,2863.0,1.0,1.0,...,4.0,4.0,4.0,4.0,3.0,4.0,5.0,0.0,0.0,0.0
2,2.0,12360.0,1.0,1.0,20.0,0.0,0.0,192.0,2.0,0.0,...,2.0,4.0,1.0,3.0,2.0,2.0,2.0,0.0,0.0,1.0
3,3.0,77959.0,1.0,0.0,44.0,0.0,1.0,3377.0,0.0,0.0,...,1.0,1.0,1.0,1.0,3.0,1.0,4.0,0.0,6.0,0.0
4,4.0,36875.0,0.0,0.0,49.0,0.0,0.0,1182.0,2.0,3.0,...,2.0,2.0,2.0,2.0,4.0,2.0,4.0,0.0,20.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,25971.0,78463.0,1.0,1.0,34.0,0.0,1.0,526.0,3.0,3.0,...,4.0,3.0,2.0,4.0,4.0,5.0,4.0,0.0,0.0,1.0
25972,25972.0,71167.0,1.0,0.0,23.0,0.0,1.0,646.0,4.0,4.0,...,4.0,4.0,5.0,5.0,5.0,5.0,4.0,0.0,0.0,0.0
25973,25973.0,37675.0,0.0,0.0,17.0,1.0,0.0,828.0,2.0,5.0,...,2.0,4.0,3.0,4.0,5.0,4.0,2.0,0.0,0.0,1.0
25974,25974.0,90086.0,1.0,0.0,14.0,0.0,1.0,1127.0,3.0,3.0,...,4.0,3.0,2.0,5.0,4.0,5.0,4.0,0.0,0.0,0.0


In [141]:
df.tail()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
25971,25971,78463,1,1,34,0,1,526,3,3,...,4,3,2,4,4,5,4,0,0.0,1
25972,25972,71167,1,0,23,0,1,646,4,4,...,4,4,5,5,5,5,4,0,0.0,0
25973,25973,37675,0,0,17,1,0,828,2,5,...,2,4,3,4,5,4,2,0,0.0,1
25974,25974,90086,1,0,14,0,1,1127,3,3,...,4,3,2,5,4,5,4,0,0.0,0
25975,25975,34799,0,0,42,1,0,264,2,5,...,1,1,2,1,1,1,1,0,0.0,1


Сначала воспользуемся методом k ближайших соседей, разбив выборку следующим образом - 70% для обучения, остальные 30% - отложенная выборка, на которой будет произведена оценка качества полученной модели. Подобное разбиение будет использоваться и во всех методах, приведенных ниже.

Выберем случайное количество соседей (10) и проведем обучение модели:

In [142]:
y = df['satisfaction']
df.drop('satisfaction', axis=1, inplace=True)
df = df.reset_index()
knn = KNeighborsClassifier(n_neighbors=10)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df.values, y, test_size=0.3, random_state=17
)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_holdout)

Для каждого случая будем выводить следующие метрики оценки качества: accuracy, precision, recall, f-measure (или по-другому f1) и ROC:

In [143]:
accuracy_score(y_holdout, knn_pred)

0.597193614830072

In [144]:
precision_score(y_holdout, knn_pred)

0.6300881242131766

In [145]:
recall_score(y_holdout, knn_pred)

0.6873426413366903

In [146]:
f1_score(y_holdout, knn_pred)

0.6574712643678161

In [147]:
roc_auc_score(y_holdout, knn_pred)

0.5843303380263917

Теперь используем кросс-валидацию для настройки числа соседей

In [148]:
knn_pipe = Pipeline(
    [("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_jobs=-1))]
)

knn_params = {"knn__n_neighbors": range(1, 10)}

knn_grid = GridSearchCV(knn_pipe, knn_params, cv=5, n_jobs=-1, verbose=True)

knn_grid.fit(X_train, y_train)

knn_grid.best_params_, knn_grid.best_score_


Fitting 5 folds for each of 9 candidates, totalling 45 fits


({'knn__n_neighbors': 8}, 0.9091862068965517)

Оптимальное количество соседей = 8, искомые метрики:

In [149]:
accuracy_score(y_holdout, knn_grid.predict(X_holdout))

0.9155509783728115

In [150]:
precision_score(y_holdout, knn_grid.predict(X_holdout))

0.9095521729538937

In [151]:
recall_score(y_holdout, knn_grid.predict(X_holdout))

0.9436942092011902

In [152]:
f1_score(y_holdout, knn_grid.predict(X_holdout))

0.9263086946753538

In [153]:
roc_auc_score(y_holdout, knn_grid.predict(X_holdout))

0.9115352481722339

Обучим дерево решений с глубиной 5:

In [154]:
tree = DecisionTreeClassifier(max_depth=5, random_state=17)

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_holdout)

In [155]:
accuracy_score(y_holdout, tree_pred)

0.9007466529351185

In [156]:
precision_score(y_holdout, tree_pred)

0.8929663608562691

In [157]:
recall_score(y_holdout, tree_pred)

0.9356832227054246

In [158]:
f1_score(y_holdout, tree_pred)

0.9138258634179055

In [159]:
roc_auc_score(y_holdout, tree_pred)

0.895761587816378

Настроим при помощи кросс-валидации параметры дерева: глубину и максимальное количество используемых параметров.

In [160]:
tree_params = {"max_depth": range(1, 11), "max_features": range(4, 19)}

tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1, verbose=True)

tree_grid.fit(X_train, y_train)

tree_grid.best_params_, tree_grid.best_score_

Fitting 5 folds for each of 150 candidates, totalling 750 fits


({'max_depth': 9, 'max_features': 17}, 0.9405241379310345)

Оптимальными являются глубина 9 и максимальное количество параметров 17

In [161]:
accuracy_score(y_holdout, tree_grid.predict(X_holdout))

0.9402677651905252

In [162]:
precision_score(y_holdout, tree_grid.predict(X_holdout))

0.9335998223406617

In [163]:
recall_score(y_holdout, tree_grid.predict(X_holdout))

0.9622339208056764

In [164]:
f1_score(y_holdout, tree_grid.predict(X_holdout))

0.9477006311992786

In [165]:
roc_auc_score(y_holdout, tree_grid.predict(X_holdout))

0.9371334358367895

Таким образом, получены следующие значения метрик оценки качества:

|           | kNN c 10 соседями | kNN с 8 соседями | Дерево решений с глубиной 5  | Дерево решений с глубиной 9 и числом признаков 17 |
|-----------|-------------------|-----------------------------|-----------------------------|------------------------------------------------------------|
| accuracy  | 0.59              | 0.91                        | 0.90                        | 0.94                                                       |
| precision | 0.63              | 0.90                        | 0.89                        | 0.93                                                       |
| recall    | 0.68              | 0.94                        | 0.93                        | 0.96                                                       |
| f-measure | 0.65              | 0.92                        | 0.91                        | 0.94                                                       |
| roc       | 0.58              | 0.91                        | 0.89                        | 0.93                                                       |

Можно сделать вывод о том, что дерево решений лучше справляется с задачей в случае с нашей выборкой.

Это можно связать с большим количеством атрибутов, для которых мы не настроили веса.