In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.preprocessing import StandardScaler as ss

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

## Преобработка данных

In [3]:
df = pd.read_csv('train.csv') # загружаем данные

In [4]:
df.head() # знакомимся с данными

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.rename(columns=str.lower, inplace=True) # приводим названия колонок к нижнему регистру

In [6]:
df.info() # смотрим общую статистику

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# данные, где больше 50-60% пустые, создают фактор случайности, удаляем такие колонки
df = df.drop(columns='cabin')

In [8]:
# заполняем пустые значения в поле
# выбираем медиану, так как пассажирам не может быть физически 0 и медиана более устойчива к выбросам
df.age.fillna(df.age.median(), inplace=True)

In [9]:
# удаляем строки, где пустые значения в embarked, так как таких строк очень мало - 2/891 = 0.2%
df = df[df.embarked.notnull()]

In [10]:
# убираем переменные, которые не несут никакой ценности и не оказывают влияния на результат
df = df.drop(columns=['ticket', 'name', 'passengerid'])

In [11]:
# проверяем, что в числовых данных не содержатся одинаковые значения
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,29.315152,0.524184,0.382452,32.096681
std,0.48626,0.8347,12.984932,1.103705,0.806761,49.697504
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.8958
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
# проверяем, что в поле не содержатся одинаковые значения
df.sex.value_counts()

male      577
female    312
Name: sex, dtype: int64

In [13]:
# преобразуем пол в числовые значения
df.sex.replace({'male': 1, 'female': 0}, inplace=True)
df.sex = df.sex.astype('int')

In [14]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


## Формирование выборок для модели

In [15]:
X = df.drop(columns='survived')
y = df.survived

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [17]:
num_features = X_train.select_dtypes(exclude='object').columns.tolist()

In [18]:
for num_feature in num_features:
    if num_feature != 'sex':
        sc = ss()
        X_train[num_feature] = sc.fit_transform(X_train[num_feature].values.reshape(-1, 1))
        X_test[num_feature] = sc.fit_transform(X_test[num_feature].values.reshape(-1, 1))
    else:
        continue

In [19]:
ohenc = ohe(handle_unknown = 'ignore')
cat_feature_train = pd.DataFrame(ohenc.fit_transform(X_train['embarked'].values.reshape(-1,1)).toarray())
cat_feature_train.columns = ohenc.get_feature_names_out().tolist()
cat_feature_train = cat_feature_train.set_index(X_train.index)
X_train = pd.concat([X_train, cat_feature_train], axis=1)

In [20]:
cat_feature_test = pd.DataFrame(ohenc.transform(X_test['embarked'].values.reshape(-1,1)).toarray())
cat_feature_test.columns = ohenc.get_feature_names_out().tolist()
cat_feature_test = cat_feature_test.set_index(X_test.index)
X_test = pd.concat([X_test, cat_feature_test], axis=1)

In [21]:
X_train.drop(columns='embarked', inplace=True)
X_test.drop(columns='embarked', inplace=True)

In [22]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,x0_C,x0_Q,x0_S
621,-1.601461,1,0.967941,0.469761,-0.461495,0.419754,0.0,0.0,1.0
481,-0.391431,1,-0.107871,-0.468441,-0.461495,-0.646083,0.0,0.0,1.0
527,-1.601461,1,-0.107871,-0.468441,-0.461495,3.851758,0.0,0.0,1.0
435,-1.601461,0,-1.183683,0.469761,1.942337,1.787603,0.0,0.0,1.0
797,0.8186,0,0.12266,-0.468441,-0.461495,-0.46998,0.0,0.0,1.0


## Построение моделей с гиперпараметрами по умолчанию

In [23]:
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train, y_train);

In [24]:
log_preds = log_model.predict(X_test)

In [25]:
def get_metrics(model_name, model_preds):
    metrics = {'model':model_name, 'accuracy': round(accuracy_score(y_test, model_preds), 3),\
                'recall': round(recall_score(y_test, model_preds), 3), \
                'precision': round(precision_score(y_test, model_preds), 3), \
                'f1': round(f1_score(y_test, model_preds), 3), \
                'roc_auc': round(roc_auc_score(y_test, model_preds), 3)}
    return metrics

In [26]:
metrics = []
metrics.append(get_metrics('logistic', log_preds))

In [27]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train);

In [28]:
knn_preds = knn_model.predict(X_test)

In [29]:
metrics.append(get_metrics('knn', knn_preds))

In [30]:
pd.DataFrame(metrics)

Unnamed: 0,model,accuracy,recall,precision,f1,roc_auc
0,logistic,0.809,0.691,0.783,0.734,0.786
1,knn,0.775,0.662,0.726,0.692,0.754


Логистическая модель предсказывает точнее.

## Подбор гиперпараметров для моделей

In [31]:
log_params = {'fit_intercept':[True, False],'penalty':[None, 'l1', 'l2', 'elasticnet']}
knn_params = {'n_neighbors':[3,5,7], 'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']}

In [32]:
log_randomized = RandomizedSearchCV(log_model, log_params, random_state=42)
log_search = log_randomized.fit(X_train, y_train)
log_search.best_params_

{'penalty': 'l2', 'fit_intercept': False}

In [33]:
knn_randomized = RandomizedSearchCV(knn_model, knn_params, random_state=42)
knn_search = knn_randomized.fit(X_train, y_train)
knn_search.best_params_

{'n_neighbors': 7, 'algorithm': 'kd_tree'}

## Построение моделей с гиперпараметрами

In [None]:
log_model_2 = LogisticRegression(penalty='l2', fit_intercept=False, random_state=42)
log_model_2.fit(X_train, y_train)
log_preds_2 = log_model_2.predict(X_test)
metrics.append(get_metrics('logistic with custom params', log_preds_2))

In [34]:
knn_model_2 = KNeighborsClassifier(n_neighbors=7, algorithm='kd_tree')
knn_model_2.fit(X_train, y_train)
knn_preds_2 = knn_model_2.predict(X_test)
metrics.append(get_metrics('knn with custom params', knn_preds_2))

In [35]:
pd.DataFrame(metrics)

Unnamed: 0,model,accuracy,recall,precision,f1,roc_auc
0,logistic,0.809,0.691,0.783,0.734,0.786
1,knn,0.775,0.662,0.726,0.692,0.754
2,knn with custom params,0.781,0.691,0.723,0.707,0.764


Логистическая регрессия предсказывает точнее.
С гиперпараметрами точность немного увеличилась за счет того, что:
1. была применена l2 регуляризация, которая отвечает за управление переобучением модели;
2. был убран свободный член, который не объясняет изменение зависимой переменной через изменение предикаторов.