Подключаем необходимые библиотеки

In [1]:
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress every warning for the rest of the session (blanket 'ignore' filter).
warnings.filterwarnings(action='ignore')
# Fix the seed of NumPy's global random generator for repeatable runs.
np.random.seed(seed=42)

Считываем таблицы features_train.csv и classes_train.csv

In [2]:
# Read the feature table and the class-label table from the working directory.
features, classes = (
    pd.read_csv(csv_name)
    for csv_name in ('features_train.csv', 'classes_train.csv')
)

Добавляем метки классов к таблице признаков (бинарные признаки в этой ячейке не удаляются)

In [3]:
# Build a separate frame holding every feature column plus an `is_bot`
# column taken from `classes` and aligned to the feature table's index.
new = features.assign(is_bot=classes['is_bot'].reindex(features.index))

Проверка на пропущенные значения

In [4]:
# Number of missing values per column (isna is the same check as isnull).
new.isna().sum()

statuses_count                     0
followers_count                    0
friends_count                      0
favourites_count                   0
listed_count                       0
is_default_profile                 0
is_profile_use_background_image    0
is_verified                        0
user_age                           0
tweets_freq                        0
followers_growth_rate              0
friends_growth_rate                0
favourites_growth_rate             0
listed_growth_rate                 0
followers_friends_ratio            0
screen_name_length                 0
num_digits_in_screen_name          0
length_of_name                     0
num_digits_in_name                 0
description_length                 0
is_bot                             0
dtype: int64

Разделим данные на зависимую переменную (y) и независимые признаки (X)

In [5]:
# Target vector: the bot label column.
y = new['is_bot']
# Feature matrix: every remaining column.
X = new.drop(columns=['is_bot'])

Разделяем данные на обучающую и тестовую выборки

In [6]:
# Hold out 25% of the rows for evaluation.
# random_state is pinned so the split no longer depends on how much of the
# global NumPy random stream was consumed before this cell ran: re-running
# the cell (or running cells out of order) yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## 1.1 Обучить модель без каких-либо дополнительных условий, которая должна наилучшим образом отработать на тестовой выборке преподавателя с точки зрения F-меры

Создаём словарь для сравнения значений F-меры

In [7]:
# Score board: model name -> F-score, every entry starting at 0 until the
# corresponding model has been evaluated.
F = dict.fromkeys(
    (
        'Logistic_Regression',
        'K_Neighbors_Classifier',
        'Decision_Tree_Classifier',
        'Random_Forest_Classifier',
    ),
    0,
)

Логистическая регрессия

In [8]:
# Standardise the features; the scaler learns its statistics from the
# training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train1 = scaler.transform(X_train)
X_test1 = scaler.transform(X_test)

# Fit logistic regression on the scaled training data and predict the
# held-out rows.
model_LR = LogisticRegression().fit(X_train1, y_train)
y_LR = model_LR.predict(X_test1)

# Record the weighted F-score and display it as the cell output.
F['Logistic_Regression'] = f1_score(y_test, y_LR, average="weighted")
F['Logistic_Regression']

0.9983353465631291

Метод k ближайших соседей (KNN)

In [9]:
# Spare, never-fitted classifier instance; nothing in this cell uses it.
KNN = KNeighborsClassifier()

# Standardise the features with statistics learned from the training split.
scaler = StandardScaler().fit(X_train)
X_train1, X_test1 = scaler.transform(X_train), scaler.transform(X_test)

# Fit k-nearest-neighbours on the scaled training data, predict the test rows.
model_KNN = KNeighborsClassifier().fit(X_train1, y_train)
y_KNN = model_KNN.predict(X_test1)

# Record the weighted F-score and display it as the cell output.
F.update(K_Neighbors_Classifier=f1_score(y_test, y_KNN, average="weighted"))
F['K_Neighbors_Classifier']

0.9933169548355198

Decision Tree Classifier

In [10]:
# Spare, never-fitted classifier instance; nothing in this cell uses it.
DTC = DecisionTreeClassifier()

# Fresh scaler fitted on the training split, then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train1 = scaler.transform(X_train)
X_test1 = scaler.transform(X_test)

# Fit the decision tree on the scaled training data, predict the test rows.
model_DTC = DecisionTreeClassifier().fit(X_train1, y_train)
y_DTC = model_DTC.predict(X_test1)

# Record the weighted F-score and display it as the cell output.
tree_key = 'Decision_Tree_Classifier'
F[tree_key] = f1_score(y_test, y_DTC, average="weighted")
F[tree_key]

0.9950060396893874

Random Forest Classifier

In [11]:
# Scaler + random forest chained into one estimator.
RFC = RandomForestClassifier()
scaler = StandardScaler()

steps_RFC = [('normalizer', scaler), ('RFC', RFC)]
pipe_RFC = Pipeline(steps_RFC)

# Previously the pipeline above was built but never used: a second,
# separate forest was fitted directly on the raw features. Fit the pipeline
# itself so the declared preprocessing is actually applied.
# model_RFC is therefore the fitted Pipeline and accepts raw feature rows;
# the forest itself stays reachable as `RFC` / model_RFC.named_steps['RFC'].
model_RFC = pipe_RFC.fit(X_train, y_train)
y_RFC = model_RFC.predict(X_test)

# Record the weighted F-score and display it as the cell output.
F['Random_Forest_Classifier'] = f1_score(y_test, y_RFC, average = "weighted")
F['Random_Forest_Classifier']

0.9983312974530304

Выбираем лучшую модель

In [12]:
# Pick the model name with the highest recorded F-score.
# max() returns the first key among equal maxima, matching the previous
# strict ">" loop, but it always binds model_max: the old loop started from
# maximum = 0 and left model_max undefined when no score exceeded 0.
model_max = max(F, key=F.get)
maximum = F[model_max]
print('Наибольшее значение F-меры:', maximum, '.  Модель:', model_max)

Наибольшее значение F-меры: 0.9983353465631291 .  Модель: Logistic_Regression


Сохраняем наилучшую модель

In [13]:
# model_max holds the *name* of the winning model (a dict key of F), so
# dumping it directly stored a plain string instead of an estimator.
# Map each name back to its fitted estimator and persist that object.
fitted_models = {
    'Logistic_Regression': model_LR,
    'K_Neighbors_Classifier': model_KNN,
    'Decision_Tree_Classifier': model_DTC,
    'Random_Forest_Classifier': model_RFC,
}
best_model = fitted_models[model_max]

# These three models were trained on StandardScaler-transformed features,
# so on their own they would mis-predict on raw feature rows. Bundle each
# with a scaler fitted on the same training split (identical statistics to
# the one used during training) so the saved artefact takes raw input.
# An estimator that is already a Pipeline is saved as-is.
models_trained_on_scaled = {
    'Logistic_Regression',
    'K_Neighbors_Classifier',
    'Decision_Tree_Classifier',
}
if model_max in models_trained_on_scaled and not isinstance(best_model, Pipeline):
    best_model = Pipeline([
        ('normalizer', StandardScaler().fit(X_train)),
        ('model', best_model),
    ])

joblib.dump(best_model, 'model_' + model_max + '.joblib')

['model_Logistic_Regression.joblib']