In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score

**Логистическая регрессия**

In [2]:
def logistic_regression(X, y):

    X_train, X_test, y_train, y_test =\
     train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f1


**Метод к-ближайших соседей**

In [3]:
def knn(X, y):

    X_train, X_test, y_train, y_test =\
     train_test_split(X, y, test_size=0.2, random_state=42)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_score = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f_score

**Случайный лес**

In [4]:
def random_forest(X, y):

    #для подбора гиперпараметров воспользуемся поиском по сетке
    params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 200, 400]
    }
    grid = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=5)
    grid.fit(X, y)

    best_criterion = grid.best_params_['criterion']
    count_estimators = grid.best_params_['n_estimators']

    X_train, X_test, y_train, y_test =\
     train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(criterion=best_criterion, n_estimators=count_estimators)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_score = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f_score

**Градиентный бустинг**

In [5]:
def gradient_boosting(X, y):

    #для подбора гиперпараметров воспользуемся поиском по сетке
    params = {
    'n_estimators': [100, 200, 400, 500]
    }
    grid = GridSearchCV(GradientBoostingClassifier(random_state=42), params, cv=5)
    grid.fit(X, y)

    count_estimators = grid.best_params_['n_estimators']

    X_train, X_test, y_train, y_test =\
     train_test_split(X, y, test_size=0.2, random_state=42)

    model =  GradientBoostingClassifier(n_estimators=count_estimators)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f_score = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f_score

**Итоговые результаты**

In [6]:
def get_results(data):

    X = data.drop(columns='target')
    y = data['target']

    accuracy_first, precision_first, recall_first, f_score_first = logistic_regression(X, y)

    accuracy_second, precision_second, recall_second, f_score_second = knn(X, y)

    accuracy_third, precision_third, recall_third, f_score_third = random_forest(X, y)

    accuracy_fourth, precision_fourth, recall_fourth, f_score_fourth = gradient_boosting(X, y)

    result_data = pd.DataFrame({'Accuracy': [accuracy_first, accuracy_second,\
                                accuracy_third, accuracy_fourth],
                   'Precision': [precision_first,  precision_second,\
                                  precision_third,  precision_fourth],
                   'Recall': [recall_first, recall_second,\
                              recall_third, recall_fourth],
                   'F-score': [f_score_first, f_score_second, f_score_third,\
                               f_score_fourth]
                    })

    result_data['Accuracy'] = round(result_data['Accuracy'], 2)
    result_data['Precision'] = round(result_data['Precision'], 2)
    result_data['Recall'] = round(result_data['Recall'], 2)
    result_data['F-score'] = round(result_data['F-score'], 2)

    result_data.rename(index={0: 'Логистическая регрессия',
                       1: 'Метод к-ближайших соседей',
                       2: 'Случайный лес',
                       3: 'Градиентный бустинг'}, inplace= True)

    return result_data

# Камчатка

Первый способ предобработки данных

In [7]:
data = pd.read_csv('./Kamchatka_first.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.55,0.71,0.48,0.57
Метод к-ближайших соседей,0.57,0.68,0.6,0.64
Случайный лес,0.6,0.74,0.56,0.64
Градиентный бустинг,0.55,0.67,0.56,0.61


Второй способ предобработки данных

In [8]:
data = pd.read_csv('./Kamchatka_second.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.56,0.55,0.67,0.6
Метод к-ближайших соседей,0.56,0.54,0.72,0.62
Случайный лес,0.5,0.5,0.61,0.55
Градиентный бустинг,0.56,0.54,0.78,0.64


# Курильские острова

Первый способ предобработки данных

In [9]:
data = pd.read_csv('./Kuril_first.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.68,0.8,0.59,0.68
Метод к-ближайших соседей,0.47,0.53,0.53,0.53
Случайный лес,0.58,0.65,0.59,0.62
Градиентный бустинг,0.53,0.61,0.5,0.55


Второй способ предобработки данных

In [10]:
data = pd.read_csv('./Kuril_second.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.57,0.55,0.57,0.56
Метод к-ближайших соседей,0.47,0.46,0.57,0.51
Случайный лес,0.64,0.61,0.71,0.66
Градиентный бустинг,0.62,0.61,0.61,0.61


# Турция

Первый способ предобработки данных

In [11]:
data = pd.read_csv('./Turkey_first.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.72,0.68,0.77,0.72
Метод к-ближайших соседей,0.57,0.54,0.59,0.57
Случайный лес,0.65,0.61,0.77,0.68
Градиентный бустинг,0.65,0.62,0.68,0.65


Второй способ предобработки данных

In [12]:
data = pd.read_csv('./Turkey_second.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.52,0.57,0.44,0.5
Метод к-ближайших соседей,0.61,0.65,0.61,0.63
Случайный лес,0.55,0.6,0.5,0.55
Градиентный бустинг,0.45,0.5,0.39,0.44


# Чили

Первый способ предобработки данных

In [13]:
data = pd.read_csv('./Chile_first.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.57,0.55,0.67,0.61
Метод к-ближайших соседей,0.58,0.55,0.81,0.65
Случайный лес,0.68,0.67,0.69,0.68
Градиентный бустинг,0.68,0.66,0.69,0.68


Второй способ предобработки данных

In [14]:
data = pd.read_csv('./Chile_second.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.53,0.56,0.56,0.56
Метод к-ближайших соседей,0.49,0.53,0.42,0.47
Случайный лес,0.57,0.62,0.52,0.56
Градиентный бустинг,0.59,0.63,0.57,0.6


# Япония

Первый способ предобработки данных

In [15]:
data = pd.read_csv('./Japan_first.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.5,0.52,0.47,0.5
Метод к-ближайших соседей,0.51,0.53,0.55,0.54
Случайный лес,0.59,0.63,0.51,0.56
Градиентный бустинг,0.62,0.66,0.59,0.62


Второй способ предобработки данных

In [16]:
data = pd.read_csv('./Japan_second.csv')
result_data = get_results(data)
result_data.head()

Unnamed: 0,Accuracy,Precision,Recall,F-score
Логистическая регрессия,0.52,0.51,0.6,0.55
Метод к-ближайших соседей,0.47,0.45,0.47,0.46
Случайный лес,0.5,0.49,0.54,0.51
Градиентный бустинг,0.51,0.49,0.55,0.52
