# Тестирование алгоритмов классификации на датасете Титаник

Суть этого ноутбука - протестировать различные подходы к анализу данных, используя различные модели и методы бинарной классификации

## Инициализация

### Импорт библиотек

In [3]:
import pickle

import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import numpy as np

### Подгрузка датасета

In [3]:
df = pd.read_csv('titanic_train.csv', index_col='PassengerId')

## Описательный анализ датасета

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [5]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
df.head(15)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

## Подготовка данных для ML

* Удалить лишние колонки
* Проверить таблицу на пропуски в данных и сделать так, чтоб их небыло
* Выполнить подготовку сета, к дальнейшему ML

### Удаление лишних колонок

**Embarked** - Порты посадки пассажиров:
* S - Southampton
* C - Cherbourg
* Q - Queenstown

**SibSp** — количество братьев, сестер, сводных братьев, сводных сестер, супругов на борту титаника

**Parch** — количество родителей, детей (в том числе приемных) на борту титаника

**Целевая колонка: Survived**

**Категориальные признаки:**
* Pclass - класс палубы
* Sex - пол пассажира

In [8]:
#df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Fare'], axis=1, inplace=True)

In [9]:
df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [10]:
df.loc[ df.Age.isna(), 'Age' ] = df.Age.mean()

In [11]:
df.dropna(inplace=True)

In [12]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


## ML

Для категориальных фичей использую Hot One Encoding, так как их последовательность считаю математически не связанной (0<1<2) и их необходимо представлять в виде бинарных категорий

Далее идут функции тестирования для алгоритмов классификаций kNN, Decision Trees, Random Forest, SVM, Алгоритм Байеса.

Каждая функция принимает X, y; 

после:
* самостоятельно делит на train и тестовую выборки
* проводит тестирование лучших аргументов посредством сетки параметров (кроме Байеса, он падает с ошибкой, поэтому он "из коробки")
* возвращает скоринги F1, Accurency, ROC AUC
* возвращает лучшие найденные аргументы для алгоритма

Непосредственно отбор лучшего решения происходит по скорингу F1, так как считаю его наиболее показательным (он учитывает TP/TN/FP/FN)


### kNN

In [13]:
def knn_test(X, y, cv=5, scoring='f1'):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=np.random.randint(10, 10**4))
    
    gs = GridSearchCV(KNeighborsClassifier(), {
        'n_neighbors': range(1,20),
        'weights':['uniform', 'distance'],
        'algorithm':['auto'],
        'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev'],
    }, scoring=scoring, cv=cv)
    gs.fit(X_train, y_train)

    y_pred = gs.predict(X_test)
    
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    f1 = round(f1_score(y_test, y_pred) * 100)
    auc_s = round(roc_auc_score(y_test, y_pred) * 100)
    
    return (accuracy, f1, auc_s, gs.best_params_)

### Decision Trees

In [14]:
def dt_test(X, y, cv=5, scoring='f1'):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=np.random.randint(10, 10**4))
    
    gs = GridSearchCV(DecisionTreeClassifier(), {
       'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'min_samples_split': range(2,5)
    }, scoring=scoring, cv=cv)
    gs.fit(X_train, y_train)

    y_pred = gs.predict(X_test)
    
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    f1 = round(f1_score(y_test, y_pred) * 100)
    auc_s = round(roc_auc_score(y_test, y_pred) * 100)
    
    return (accuracy, f1, auc_s, gs.best_params_)

### Random Forest

In [15]:
def rf_test(X, y, cv=5, scoring='f1'):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=np.random.randint(10, 10**4))
    
    gs = GridSearchCV(RandomForestClassifier(), {
       'n_estimators': [100,200,300,400,500],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': range(2,11)
    }, scoring=scoring, cv=cv)
    gs.fit(X_train, y_train)

    y_pred = gs.predict(X_test)
    
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    f1 = round(f1_score(y_test, y_pred) * 100)
    auc_s = round(roc_auc_score(y_test, y_pred) * 100)
    
    return (accuracy, f1, auc_s, gs.best_params_)

### SVM

In [16]:
def svm_test(X, y, cv=5, scoring='f1'):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=np.random.randint(10, 10**4))
    
    gs = GridSearchCV(SVC(), {
       'C': np.arange(0.25, 3.0, 0.25),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': range(1,5)
    }, scoring=scoring, cv=cv)
    gs.fit(X_train, y_train)

    y_pred = gs.predict(X_test)
    
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    f1 = round(f1_score(y_test, y_pred) * 100)
    auc_s = round(roc_auc_score(y_test, y_pred) * 100)
    
    return (accuracy, f1, auc_s, gs.best_params_)

### Байес

In [72]:
def bayes_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=np.random.randint(10, 10**4))
    
    bayes = CategoricalNB()
    bayes.fit(X_train, y_train)
    
    try:
        y_pred = bayes.predict(X_test)
    except: 
        return bayes_test(X, y)
    
    accuracy = round(accuracy_score(y_test, y_pred) * 100)
    f1 = round(f1_score(y_test, y_pred) * 100)
    auc_s = round(roc_auc_score(y_test, y_pred) * 100)
    
    return (accuracy, f1, auc_s, {})

Функция **compare_models(X, y)** тестирует все вышеприведенные алгоритмы и выдает сводную таблицу для сравнения алгоритмов с использованными параметрами

In [45]:
from datetime import datetime

def compare_models(X, y, scoring='f1'):
    results = pd.DataFrame(columns=['Algo', 'Accurancy', 'F1', 'AUC', 'Params'])
    
    dt_start = datetime.now()
    knn_accuracy, knn_f1, knn_auc, params = knn_test(X, y, scoring=scoring)
    results = results.append({
        'Algo': 'kNN',
        'Accurancy': knn_accuracy,
        'F1': knn_f1,
        'AUC': knn_auc, 
        'Params': params
    }, ignore_index=True)
    dt_finish = datetime.now()
    dt_diff = dt_finish - dt_start
    print('kNN testing took {}'.format(dt_diff))
    
    dt_start = datetime.now()
    dt_accuracy, dt_f1, dt_auc, params = dt_test(X, y, scoring=scoring)
    results = results.append({
        'Algo': 'Decision Tree',
        'Accurancy': dt_accuracy,
        'F1': dt_f1,
        'AUC': dt_auc, 
        'Params': params
    }, ignore_index=True)
    dt_finish = datetime.now()
    dt_diff = dt_finish - dt_start
    print('Decision Tree testing took {}'.format(dt_diff))
    
    dt_start = datetime.now()
    rf_accuracy, rf_f1, rf_auc, params = rf_test(X, y, scoring=scoring)
    results = results.append({
        'Algo': 'Random Forest',
        'Accurancy': rf_accuracy,
        'F1': rf_f1,
        'AUC': rf_auc, 
        'Params': params
    }, ignore_index=True)
    dt_finish = datetime.now()
    dt_diff = dt_finish - dt_start
    print('Random Forest testing took {}'.format(dt_diff))
    
    dt_start = datetime.now()
    svm_accuracy, svm_f1, svm_auc, params = svm_test(X, y, scoring=scoring)
    results = results.append({
        'Algo': 'SVM',
        'Accurancy': svm_accuracy,
        'F1': svm_f1,
        'AUC': svm_auc, 
        'Params': params
    }, ignore_index=True)
    dt_finish = datetime.now()
    dt_diff = dt_finish - dt_start
    print('SVM testing took {}'.format(dt_diff))
    
    
    dt_start = datetime.now()
    try:
        bayes_accuracy, bayes_f1, bayes_auc, params = bayes_test(X, y)
    except:
        results = results.append({
            'Algo': 'Bayes',
            'Accurancy': np.nan,
            'F1': np.nan,
            'AUC': np.nan, 
            'Params': {'error': 'error'}
        }, ignore_index=True)
    else:
        results = results.append({
            'Algo': 'Bayes',
            'Accurancy': bayes_accuracy,
            'F1': bayes_f1,
            'AUC': bayes_auc, 
            'Params': params
        }, ignore_index=True)
        
    dt_finish = datetime.now()
    dt_diff = dt_finish - dt_start
    print('Bayes testing took {}'.format(dt_diff))
    
    return results

In [19]:
df_math = pd.concat([
    df[['Age', 'SibSp', 'Parch', 'Fare']], 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex'),
    pd.get_dummies(df['Embarked'], prefix='Embarked')
], axis=1)

X = df_math.values
y = df['Survived'].values

df_math.head()

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,22.0,1,0,7.25,0,0,1,0,1,0,0,1
2,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
3,26.0,0,0,7.925,0,0,1,1,0,0,0,1
4,35.0,1,0,53.1,1,0,0,1,0,0,0,1
5,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [20]:
# Задаем минимальную стоимость оплаты, иначе Байес не будет работать
df_math.loc[ df_math.Fare == 0, 'Fare' ] = 1.0

In [21]:
df_math.Fare.describe()

count    889.000000
mean      32.113554
std       49.686761
min        1.000000
25%        7.895800
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [41]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=np.random.randint(10, 10**4))

gs = GridSearchCV(SVC(), {
    'C': np.arange(0.25, 3.0, 0.25),
    'kernel': ['linear'],
    'degree': range(1,5)
}, scoring='f1', cv=5)
gs.fit(X_train, y_train)

y_pred = gs.predict(X_test)

accuracy = round(accuracy_score(y_test, y_pred) * 100)
f1 = round(f1_score(y_test, y_pred) * 100)
auc_s = round(roc_auc_score(y_test, y_pred) * 100)

print('accuracy={}, f1={}, AUC={}, params={}'.format(accuracy, f1, auc_s, gs.best_params_))

accuracy=77, f1=70, AUC=75, params={'C': 0.25, 'degree': 1, 'kernel': 'linear'}


In [43]:
compare_f1 = compare_models(X, y, scoring='f1')
compare_f1

kNN testing took 0:00:03.518972
Decision Tree testing took 0:00:00.124000
Random Forest testing took 0:02:53.234000
Bayes testing took 0:00:00.007024


NameError: name 'compare' is not defined

In [44]:
compare_f1

Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,74,65,73,"{'algorithm': 'auto', 'metric': 'manhattan', '..."
1,Decision Tree,74,70,74,"{'criterion': 'entropy', 'min_samples_split': ..."
2,Random Forest,81,72,78,"{'criterion': 'gini', 'min_samples_split': 5, ..."
3,Bayes,76,67,74,{}


### Без поля Fare

In [263]:
df_math2 = pd.concat([
    df[['Age', 'SibSp', 'Parch']], 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex'),
    pd.get_dummies(df['Embarked'], prefix='Embarked')
], axis=1)

X2 = df_math2.values

In [248]:
compare = compare_models(X2, y)
compare

kNN testing took 0:00:03.503115
Decision Tree testing took 0:00:00.113995
Random Forest testing took 0:02:51.446610
SVM testing took 0:00:15.682070
Bayes testing took 0:00:00.006968


Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,82,75,80,"{'algorithm': 'auto', 'metric': 'manhattan', '..."
1,Decision Tree,74,63,71,"{'criterion': 'entropy', 'min_samples_split': ..."
2,Random Forest,80,72,78,"{'criterion': 'gini', 'min_samples_split': 7, ..."
3,SVM,79,67,75,"{'C': 0.5, 'degree': 1, 'kernel': 'linear'}"
4,Bayes,78,71,76,{}


### Все кроме Age поля разложены по HOE

In [249]:
df_math3 = pd.concat([
    df[['Age']], 
    pd.get_dummies(df['SibSp'], prefix='SibSp'), 
    pd.get_dummies(df['Parch'], prefix='Parch'), 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex'),
    pd.get_dummies(df['Embarked'], prefix='Embarked')
], axis=1)

X3 = df_math3.values

In [250]:
compare = compare_models(X3, y)
compare

kNN testing took 0:00:04.349270
Decision Tree testing took 0:00:00.128999
Random Forest testing took 0:02:50.944115
SVM testing took 0:02:18.348000
Bayes testing took 0:00:00.008024


Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,80,73,79,"{'algorithm': 'auto', 'metric': 'manhattan', '..."
1,Decision Tree,80,75,79,"{'criterion': 'entropy', 'min_samples_split': ..."
2,Random Forest,79,72,78,"{'criterion': 'gini', 'min_samples_split': 8, ..."
3,SVM,78,69,76,"{'C': 0.5, 'degree': 1, 'kernel': 'linear'}"
4,Bayes,80,76,79,{}


In [256]:
compare = compare_models(X3, y, scoring='accuracy')
compare

kNN testing took 0:00:03.931005
Decision Tree testing took 0:00:00.100000
Random Forest testing took 0:02:50.120077
SVM testing took 0:06:04.935998
Bayes testing took 0:00:00.008000


Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,75,71,75,"{'algorithm': 'auto', 'metric': 'manhattan', '..."
1,Decision Tree,78,72,77,"{'criterion': 'entropy', 'min_samples_split': ..."
2,Random Forest,84,78,84,"{'criterion': 'gini', 'min_samples_split': 10,..."
3,SVM,81,76,80,"{'C': 1.0, 'degree': 1, 'kernel': 'linear'}"
4,Bayes,87,83,86,{}


In [257]:
compare[ compare.Algo == 'Random Forest' ].Params.values

array([{'criterion': 'gini', 'min_samples_split': 10, 'n_estimators': 300}],
      dtype=object)

### Без Age и все поля разложены по HOE

In [258]:
df_math4 = pd.concat([
    pd.get_dummies(df['SibSp'], prefix='SibSp'), 
    pd.get_dummies(df['Parch'], prefix='Parch'), 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex'),
    pd.get_dummies(df['Embarked'], prefix='Embarked')
], axis=1)

X4 = df_math4.values

In [259]:
compare_X4_accuracy = compare_models(X4, y, scoring='accuracy')
compare_x4_F1 = compare_models(X4, y, scoring='f1')


kNN testing took 0:00:04.747025
Decision Tree testing took 0:00:00.084000
Random Forest testing took 0:02:41.272974
SVM testing took 0:00:05.844030
Bayes testing took 0:00:00.007972
kNN testing took 0:00:05.378025
Decision Tree testing took 0:00:00.110000
Random Forest testing took 0:02:47.062974
SVM testing took 0:00:06.575000
Bayes testing took 0:00:00.008001


In [260]:
compare_X4_accuracy

Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,78,66,74,"{'algorithm': 'auto', 'metric': 'euclidean', '..."
1,Decision Tree,82,71,78,"{'criterion': 'gini', 'min_samples_split': 2, ..."
2,Random Forest,82,76,80,"{'criterion': 'gini', 'min_samples_split': 10,..."
3,SVM,79,68,75,"{'C': 0.25, 'degree': 3, 'kernel': 'poly'}"
4,Bayes,79,75,79,{}


In [261]:
compare_x4_F1

Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,79,73,78,"{'algorithm': 'auto', 'metric': 'euclidean', '..."
1,Decision Tree,78,72,77,"{'criterion': 'gini', 'min_samples_split': 3, ..."
2,Random Forest,78,70,75,"{'criterion': 'gini', 'min_samples_split': 3, ..."
3,SVM,79,67,75,"{'C': 0.5, 'degree': 1, 'kernel': 'rbf'}"
4,Bayes,84,78,83,{}


### Без age и без HOE

In [265]:
df_math5 = pd.concat([
    df[['SibSp', 'Parch']], 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex'),
    pd.get_dummies(df['Embarked'], prefix='Embarked')
], axis=1)

X5 = df_math5.values

In [266]:
compare_X5_accuracy = compare_models(X5, y, scoring='accuracy')
compare_x5_F1 = compare_models(X5, y, scoring='f1')

kNN testing took 0:00:03.489020
Decision Tree testing took 0:00:00.074000
Random Forest testing took 0:02:37.953995
SVM testing took 0:00:04.976004
Bayes testing took 0:00:00.006024
kNN testing took 0:00:03.785025
Decision Tree testing took 0:00:00.097975
Random Forest testing took 0:02:37.810180
SVM testing took 0:00:06.290029
Bayes testing took 0:00:00.006970


In [267]:
compare_X5_accuracy

Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,79,65,74,"{'algorithm': 'auto', 'metric': 'euclidean', '..."
1,Decision Tree,81,67,75,"{'criterion': 'entropy', 'min_samples_split': ..."
2,Random Forest,79,64,73,"{'criterion': 'gini', 'min_samples_split': 8, ..."
3,SVM,80,73,78,"{'C': 1.25, 'degree': 2, 'kernel': 'poly'}"
4,Bayes,75,69,74,{}


In [47]:
compare_x5_F1

NameError: name 'compare_x5_F1' is not defined

### Тест Embarked

#### With Embarked

In [50]:
df_math6 = pd.concat([
    df[['SibSp', 'Parch']], 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex'),
    pd.get_dummies(df['Embarked'], prefix='Embarked')
], axis=1)

X6 = df_math6.values

In [51]:
compare_models(X6, y, scoring='f1')

kNN testing took 0:00:03.937003
Decision Tree testing took 0:00:00.101996
Random Forest testing took 0:02:37.737005
SVM testing took 0:00:05.448001
Bayes testing took 0:00:00.005999


Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,77,70,75,"{'algorithm': 'auto', 'metric': 'euclidean', '..."
1,Decision Tree,80,72,77,"{'criterion': 'gini', 'min_samples_split': 4, ..."
2,Random Forest,80,71,77,"{'criterion': 'entropy', 'min_samples_split': ..."
3,SVM,78,70,75,"{'C': 0.25, 'degree': 1, 'kernel': 'rbf'}"
4,Bayes,76,69,76,{}


#### Without Embarked

In [52]:
df_math7 = pd.concat([
    df[['SibSp', 'Parch']], 
    pd.get_dummies(df['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df['Sex'], prefix='Sex')
], axis=1)

X7 = df_math7.values

In [53]:
compare_models(X7, y, scoring='f1')

kNN testing took 0:00:03.409000
Decision Tree testing took 0:00:00.092026
Random Forest testing took 0:02:33.182034
SVM testing took 0:00:06.180005
Bayes testing took 0:00:00.005998


Unnamed: 0,Algo,Accurancy,F1,AUC,Params
0,kNN,78,70,75,"{'algorithm': 'auto', 'metric': 'manhattan', '..."
1,Decision Tree,79,68,75,"{'criterion': 'entropy', 'min_samples_split': ..."
2,Random Forest,76,68,74,"{'criterion': 'gini', 'min_samples_split': 4, ..."
3,SVM,80,74,78,"{'C': 0.25, 'degree': 1, 'kernel': 'linear'}"
4,Bayes,85,80,84,{}


## Тестирование алгоритмов

### Проверка Байеса

In [70]:
X7[0]

array([1, 0, 0, 0, 1, 0, 1], dtype=int64)

In [71]:
X_test

array([[56.,  0.,  0., ...,  1.,  0.,  0.],
       [50.,  0.,  1., ...,  1.,  0.,  0.],
       [65.,  0.,  0., ...,  0.,  0.,  1.],
       ...,
       [61.,  0.,  0., ...,  0.,  0.,  1.],
       [18.,  0.,  0., ...,  0.,  0.,  1.],
       [32.,  0.,  0., ...,  0.,  0.,  1.]])

In [78]:
for i in range(10):
    b_accuracy, b_f1, b_auc, b_conf = bayes_test(X7, y)
    print('Accuracy={}, F1={}, AUC={}'.format(b_accuracy, b_f1, b_auc))

Accuracy=79, F1=72, AUC=77
Accuracy=75, F1=70, AUC=74
Accuracy=78, F1=71, AUC=77
Accuracy=79, F1=71, AUC=77
Accuracy=75, F1=66, AUC=74
Accuracy=78, F1=68, AUC=75
Accuracy=78, F1=71, AUC=76
Accuracy=79, F1=70, AUC=76
Accuracy=81, F1=74, AUC=79
Accuracy=79, F1=72, AUC=77


In [79]:
for i in range(10):
    b_accuracy, b_f1, b_auc, b_conf = knn_test(X7, y)
    print('Accuracy={}, F1={}, AUC={}'.format(b_accuracy, b_f1, b_auc))

Accuracy=74, F1=66, AUC=73
Accuracy=79, F1=71, AUC=77
Accuracy=74, F1=67, AUC=73
Accuracy=76, F1=66, AUC=73
Accuracy=81, F1=72, AUC=78
Accuracy=76, F1=64, AUC=72
Accuracy=83, F1=76, AUC=81
Accuracy=78, F1=67, AUC=74
Accuracy=78, F1=68, AUC=75
Accuracy=82, F1=73, AUC=79


In [80]:
for i in range(10):
    b_accuracy, b_f1, b_auc, b_conf = dt_test(X7, y)
    print('Accuracy={}, F1={}, AUC={}'.format(b_accuracy, b_f1, b_auc))

Accuracy=74, F1=58, AUC=69
Accuracy=81, F1=71, AUC=77
Accuracy=81, F1=71, AUC=79
Accuracy=79, F1=70, AUC=76
Accuracy=81, F1=72, AUC=78
Accuracy=78, F1=71, AUC=76
Accuracy=79, F1=70, AUC=76
Accuracy=79, F1=73, AUC=78
Accuracy=74, F1=60, AUC=70
Accuracy=78, F1=62, AUC=72


In [81]:
for i in range(10):
    b_accuracy, b_f1, b_auc, b_conf = rf_test(X7, y)
    print('Accuracy={}, F1={}, AUC={}'.format(b_accuracy, b_f1, b_auc))

Accuracy=78, F1=69, AUC=76
Accuracy=78, F1=73, AUC=77
Accuracy=81, F1=75, AUC=79
Accuracy=82, F1=71, AUC=78
Accuracy=77, F1=64, AUC=72
Accuracy=78, F1=67, AUC=74
Accuracy=78, F1=71, AUC=76
Accuracy=78, F1=70, AUC=76
Accuracy=78, F1=69, AUC=75
Accuracy=78, F1=70, AUC=76


In [83]:
for i in range(10): 
    b_accuracy, b_f1, b_auc, b_conf = svm_test(X7, y, scoring='f1')
    print('Accuracy={}, F1={}, AUC={}, config={}'.format(b_accuracy, b_f1, b_auc, b_conf))

Accuracy=80, F1=69, AUC=76, config={'C': 2.5, 'degree': 1, 'kernel': 'rbf'}
Accuracy=77, F1=70, AUC=75, config={'C': 2.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=79, F1=68, AUC=76, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=80, F1=72, AUC=77, config={'C': 0.5, 'degree': 1, 'kernel': 'rbf'}
Accuracy=76, F1=69, AUC=75, config={'C': 1.0, 'degree': 2, 'kernel': 'poly'}
Accuracy=79, F1=72, AUC=77, config={'C': 0.5, 'degree': 2, 'kernel': 'poly'}
Accuracy=80, F1=75, AUC=79, config={'C': 0.25, 'degree': 1, 'kernel': 'rbf'}
Accuracy=77, F1=70, AUC=76, config={'C': 1.5, 'degree': 3, 'kernel': 'poly'}
Accuracy=81, F1=76, AUC=80, config={'C': 0.75, 'degree': 2, 'kernel': 'poly'}
Accuracy=77, F1=69, AUC=75, config={'C': 0.75, 'degree': 3, 'kernel': 'poly'}


In [84]:
for i in range(10): 
    b_accuracy, b_f1, b_auc, b_conf = svm_test(X7, y, scoring='accuracy')
    print('Accuracy={}, F1={}, AUC={}, config={}'.format(b_accuracy, b_f1, b_auc, b_conf))

Accuracy=83, F1=74, AUC=80, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=81, F1=74, AUC=79, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=78, F1=69, AUC=75, config={'C': 0.5, 'degree': 2, 'kernel': 'poly'}
Accuracy=80, F1=71, AUC=77, config={'C': 1.5, 'degree': 3, 'kernel': 'poly'}
Accuracy=83, F1=76, AUC=80, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=78, F1=69, AUC=75, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=80, F1=75, AUC=79, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=77, F1=69, AUC=75, config={'C': 0.5, 'degree': 2, 'kernel': 'poly'}
Accuracy=82, F1=75, AUC=80, config={'C': 1.5, 'degree': 1, 'kernel': 'rbf'}
Accuracy=79, F1=69, AUC=76, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}


In [87]:
for i in range(10): 
    b_accuracy, b_f1, b_auc, b_conf = svm_test(X7, y, scoring='roc_auc')
    print('Accuracy={}, F1={}, AUC={}, config={}'.format(b_accuracy, b_f1, b_auc, b_conf))

Accuracy=83, F1=76, AUC=81, config={'C': 2.75, 'degree': 2, 'kernel': 'poly'}
Accuracy=79, F1=72, AUC=77, config={'C': 0.5, 'degree': 2, 'kernel': 'poly'}
Accuracy=83, F1=75, AUC=81, config={'C': 1.0, 'degree': 2, 'kernel': 'poly'}
Accuracy=81, F1=71, AUC=77, config={'C': 1.5, 'degree': 2, 'kernel': 'poly'}
Accuracy=77, F1=66, AUC=73, config={'C': 1.75, 'degree': 2, 'kernel': 'poly'}
Accuracy=81, F1=73, AUC=79, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=82, F1=71, AUC=78, config={'C': 1.75, 'degree': 2, 'kernel': 'poly'}
Accuracy=78, F1=68, AUC=75, config={'C': 1.75, 'degree': 1, 'kernel': 'poly'}
Accuracy=76, F1=67, AUC=74, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy=77, F1=67, AUC=74, config={'C': 1.75, 'degree': 2, 'kernel': 'poly'}


Algo:SVM

Accuracy=83, F1=76, AUC=80, config={'C': 0.25, 'degree': 2, 'kernel': 'poly'}

## Обучение модели

In [89]:
model = SVC(C=0.25, degree=2, kernel='poly')
model.fit(X7, y)

SVC(C=0.25, degree=2, kernel='poly')

In [4]:
df_test = pd.read_csv('test.csv', index_col='PassengerId')

In [91]:
df_test.info(0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [92]:
df_test.isna().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [5]:
df_test_math7 = pd.concat([
    df_test[['SibSp', 'Parch']], 
    pd.get_dummies(df_test['Pclass'], prefix='Pclass'), 
    pd.get_dummies(df_test['Sex'], prefix='Sex')
], axis=1)

X7_test = df_test_math7.values

In [6]:
df_test_math7

Unnamed: 0_level_0,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,0,0,0,0,1,0,1
893,1,0,0,0,1,1,0
894,0,0,0,1,0,0,1
895,0,0,0,0,1,0,1
896,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...
1305,0,0,0,0,1,0,1
1306,0,0,1,0,0,1,0
1307,0,0,0,0,1,0,1
1308,0,0,0,0,1,0,1


In [95]:
y_predict = model.predict(X7_test)

In [98]:
df_result = df_test_math7.copy()
df_result.drop(df_test_math7.columns, axis=1, inplace=True)
df_result['Survived'] = y_predict

In [100]:
df_result.to_csv('result.csv')

## Dumping the model

In [101]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)