In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [4]:
# Read the CSV file from the current working directory into the frame `df`.
df = pd.read_csv(filepath_or_buffer='./final_data.csv')

Если в данных есть столбец `region`, то это категориальный признак, поэтому закодируем его с помощью one-hot кодирования (dummy-переменных).

In [6]:
# One-hot encode `region` only when the frame actually has that column.
# drop_first=True omits the first dummy level, so the remaining indicator
# columns are not redundant with each other.
if 'region' in df:
    df = pd.get_dummies(data=df, columns=['region'], drop_first=True)

In [7]:
# Separate the label from the features: `y` is the `target` column,
# `X` is every remaining column of `df`.
y = df['target']
X = df.drop(labels='target', axis='columns')

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Логистическая регрессия**

In [9]:
# Fit a logistic regression (iteration cap of 1000) on the training split
# and predict labels for the held-out test split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Print each test-set metric rounded to two decimals, in a fixed order.
for metric_name, metric_fn in (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    print(f'{metric_name} = {round(metric_fn(y_test, y_pred), 2)}')

accuracy = 0.6
precision = 0.61
recall = 0.68
f1 = 0.64


**Метод ближайших соседей**

In [10]:
# Fit a 5-nearest-neighbours classifier on the training split and predict
# labels for the held-out test split.
# NOTE(review): no feature-scaling step is visible in this notebook; k-NN is
# distance-based, so confirm the features were scaled upstream.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute the test-set metrics first, then print them rounded to two decimals.
scores = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
}
for score_name, score_value in scores.items():
    print(f'{score_name} = {round(score_value, 2)}')

accuracy = 0.62
precision = 0.64
recall = 0.64
f1 = 0.64


**Случайный лес**

Для подбора гиперпараметров воспользуемся поиском по сетке

In [11]:
# Grid-search the random forest's split criterion and number of trees using
# 5-fold cross-validation.
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 200, 400]
    }
grid = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=5)
# Tune on the training split only. Searching over the full X, y would let the
# held-out test rows influence the chosen hyperparameters (data leakage) and
# make the test metrics reported afterwards optimistic.
grid.fit(X_train, y_train)
# Last expression of the cell: display the winning parameter combination.
grid.best_params_

{'criterion': 'gini', 'n_estimators': 400}

In [12]:
# Train the random forest with the hyperparameters reported by the grid
# search. random_state=42 matches the seed of the estimator used during the
# search and makes the fitted forest, and therefore the metrics printed
# below, reproducible across runs.
model = RandomForestClassifier(criterion='gini', n_estimators=400, random_state=42)
model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Test-set metrics, rounded to two decimals.
print(f'accuracy = {round(accuracy_score(y_test, y_pred), 2)}')
print(f'precision = {round(precision_score(y_test, y_pred), 2)}')
print(f'recall = {round(recall_score(y_test, y_pred), 2)}')
print(f'f1 = {round(f1_score(y_test, y_pred), 2)}')

accuracy = 0.69
precision = 0.73
recall = 0.65
f1 = 0.69


**Градиентный бустинг**

Для подбора гиперпараметров воспользуемся поиском по сетке

In [14]:
# Grid-search the number of boosting stages for gradient boosting using
# 5-fold cross-validation.
params = {
    'n_estimators': [100, 200, 400],
    }
grid = GridSearchCV(GradientBoostingClassifier(random_state=42), params, cv=5)
# Tune on the training split only. Searching over the full X, y would let the
# held-out test rows influence the chosen hyperparameters (data leakage) and
# make the test metrics reported afterwards optimistic.
grid.fit(X_train, y_train)
# Last expression of the cell: display the winning parameter combination.
grid.best_params_

{'n_estimators': 400}

In [15]:
# Train gradient boosting with the number of stages reported by the grid
# search. random_state=42 matches the seed of the estimator used during the
# search and keeps the fitted model, and the metrics printed below,
# reproducible across runs.
model = GradientBoostingClassifier(n_estimators=400, random_state=42)
model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Test-set metrics, rounded to two decimals.
print(f'accuracy = {round(accuracy_score(y_test, y_pred), 2)}')
print(f'precision = {round(precision_score(y_test, y_pred), 2)}')
print(f'recall = {round(recall_score(y_test, y_pred), 2)}')
print(f'f1 = {round(f1_score(y_test, y_pred), 2)}')



accuracy = 0.7
precision = 0.72
recall = 0.69
f1 = 0.71
