# Лабораторная работа №1
## Базовый контест
### Выполнил Мухин Артем, группа 6233

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
import pickle
from typing import List, Tuple

# Подготовка и работа с данными

In [3]:
# Training data: one CSV with the feature table, one with the class labels.
filename_features = "features_train.csv"
filename_labels = "classes_train.csv"

In [4]:
# Load both training tables into DataFrames (labels first, then features).
labels_df, features_df = map(pd.read_csv, (filename_labels, filename_features))

In [5]:
# Preview the first five label rows.
labels_df.head(n=5)

Unnamed: 0,is_bot
0,0
1,0
2,0
3,0
4,0


Задача: бинарная классификация

In [6]:
# Distinct values present in the label table.
np.unique(np.asarray(labels_df))

array([0, 1], dtype=int64)

In [7]:
# Preview the first five feature rows.
features_df.head(n=5)

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,is_default_profile,is_profile_use_background_image,is_verified,user_age,tweets_freq,followers_growth_rate,friends_growth_rate,favourites_growth_rate,listed_growth_rate,followers_friends_ratio,screen_name_length,num_digits_in_screen_name,length_of_name,num_digits_in_name,description_length
0,5124,6641,3074,13768,126,0,0,1,2711,1.890077,2.44965,1.133899,5.078569,0.046477,2.160377,12,0,14,0,73
1,10672,8465,1575,4561,191,0,1,1,4424,2.412297,1.913427,0.356013,1.030967,0.043174,5.374603,11,0,23,0,142
2,704,3108,454,2400,77,0,0,0,1358,0.518409,2.28866,0.334315,1.767305,0.056701,6.845815,12,0,18,0,90
3,6091,304528,526,599,1515,0,1,1,4214,1.44542,72.265781,0.124822,0.142145,0.359516,578.95057,10,0,11,0,51
4,12967,13693,204,1177,452,0,1,1,3200,4.052187,4.279063,0.06375,0.367812,0.14125,67.122549,13,0,14,0,106


Итого имеем 20 признаков, три из которых бинарные.

In [10]:
# dtype of the feature table as a NumPy array, and its (rows, columns) shape.
features_df.values.dtype, features_df.shape

(dtype('float64'), (2400, 20))

Для обучения будем использовать следующий пайплайн

Pipeline definition:
 * Extract data from CSV
 * Split data
 * Data normalization
 * Train
 * Evaluate

## CSV Data extractor

In [13]:
class Extractor:
    """Load the label and feature CSV files and return them as NumPy arrays.

    The class keeps no state; calling an instance is shorthand for
    ``extract_data``.
    """

    def extract_data(
        self, filename_labels: str, filename_features: str
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Read both CSV files and convert them to arrays.

        Args:
            filename_labels: Path to the CSV file with the class labels.
            filename_features: Path to the CSV file with the feature table.

        Returns:
            A ``(labels, features)`` tuple, as produced by
            ``extract_labels`` and ``extract_features``.

        Raises:
            RuntimeError: If either array has an unexpected number of
                dimensions.
        """
        labels_df = pd.read_csv(filename_labels)
        features_df = pd.read_csv(filename_features)

        labels = self.extract_labels(labels_df)
        features = self.extract_features(features_df)
        return labels, features

    def extract_labels(self, labels_df: pd.DataFrame) -> np.ndarray:
        """Convert the label table to a one-dimensional ``uint8`` array.

        Args:
            labels_df: Label table; after dropping singleton axes it must
                be one-dimensional.

        Returns:
            The labels as a 1-D ``np.uint8`` array.

        Raises:
            RuntimeError: If the labels are not one-dimensional.
        """
        # squeeze() drops the singleton column axis; atleast_1d() keeps a
        # single-row table 1-D instead of collapsing it to a 0-D scalar.
        labels = np.atleast_1d(np.squeeze(labels_df.values))
        labels = labels.astype(np.uint8, copy=False)

        if labels.ndim != 1:
            raise RuntimeError(
                f"Labels array must have only one dimension, "
                f"but it has {labels.ndim} with shape {labels.shape}"
            )

        return labels

    def extract_features(self, features_df: pd.DataFrame) -> np.ndarray:
        """Convert the feature table to a two-dimensional ``float64`` array.

        Args:
            features_df: Feature table.

        Returns:
            The features as a 2-D ``np.float64`` array.

        Raises:
            RuntimeError: If the features are not two-dimensional.
        """
        features = features_df.values
        features = features.astype(np.float64, copy=False)

        if features.ndim != 2:
            raise RuntimeError(
                f"Features tensor must have only two dimensions, "
                f"but it has {features.ndim} with shape {features.shape}"
            )

        return features

    def __call__(
        self, filename_labels: str, filename_features: str
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Shorthand for ``extract_data``.

        The parameter order matches ``extract_data`` (labels first), so
        positional and keyword calls now resolve to the same files.
        """
        return self.extract_data(
            filename_labels=filename_labels,
            filename_features=filename_features,
        )


## Прочитаем и нормируем данные

In [30]:
# Read the training arrays, then standardize every feature column.
labels, features = Extractor().extract_data(filename_labels, filename_features)
# NOTE(review): the scaler is fit on all samples before cross-validation, so
# every validation fold contributes to the normalization statistics.
scaler = StandardScaler().fit(features)
features = scaler.transform(features)

## Подберем параметры для полносвязной нейронной сети

In [31]:
# Fully connected classifier with scikit-learn defaults; the grid search
# configured afterwards overrides hidden_layer_sizes and max_iter.
# NOTE(review): random_state is not set, so repeated runs may give different scores.
model = MLPClassifier()

In [32]:
# Search space: seven single-layer widths plus two two-layer topologies,
# each combined with six iteration budgets.
parameters = dict(
    hidden_layer_sizes=[(width,) for width in (10, 20, 30, 50, 80, 130, 210)]
    + [(10, 10), (20, 20)],
    max_iter=[50, 100, 150, 200, 300, 500],
)
# Candidates are ranked by F1 score.
clf = GridSearchCV(estimator=model, param_grid=parameters, scoring='f1')

In [33]:
# Run the grid search on the standardized features.
clf.fit(X=features, y=labels)



## Результаты

Итого, по результатам эксперимента видно, что обычная полносвязная нейронная сеть с небольшим числом параметров более чем хорошо справляется с задачей.

Средний F1 лучшей модели, согласно кросс-валидации, равен 0.987.

In [39]:
# Ten best parameter combinations, ordered by mean cross-validated score.
(
    pd.DataFrame(clf.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_hidden_layer_sizes,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,0.242342,0.003883,0.001,7.478899e-07,"(20,)",100,"{'hidden_layer_sizes': (20,), 'max_iter': 100}",0.979592,0.991667,0.987448,0.983471,0.995816,0.987599,0.005749,1
3,0.199179,0.014468,0.000827,0.0004164137,"(10,)",200,"{'hidden_layer_sizes': (10,), 'max_iter': 200}",0.97541,0.991736,0.987552,0.983471,0.995816,0.986797,0.007029,2
17,0.340866,0.031721,0.0006,0.0004897459,"(30,)",500,"{'hidden_layer_sizes': (30,), 'max_iter': 500}",0.97561,0.991597,0.987552,0.983471,0.991597,0.985965,0.00599,3
42,0.086085,0.001473,0.000901,0.0001987948,"(10, 10)",50,"{'hidden_layer_sizes': (10, 10), 'max_iter': 50}",0.983607,0.991667,0.979079,0.983471,0.991597,0.985884,0.004968,4
47,0.218456,0.035214,0.000794,0.0003973571,"(10, 10)",500,"{'hidden_layer_sizes': (10, 10), 'max_iter': 500}",0.97541,0.995816,0.983333,0.983471,0.987342,0.985074,0.006626,5
2,0.182246,0.003989,0.001,5.917394e-07,"(10,)",150,"{'hidden_layer_sizes': (10,), 'max_iter': 150}",0.97166,0.991667,0.983333,0.983471,0.991597,0.984346,0.007333,6
9,0.313887,0.029302,0.0008,0.0003997806,"(20,)",200,"{'hidden_layer_sizes': (20,), 'max_iter': 200}",0.971429,0.991667,0.983333,0.983471,0.991597,0.984299,0.007414,7
13,0.291689,0.020944,0.000801,0.0004002571,"(30,)",100,"{'hidden_layer_sizes': (30,), 'max_iter': 100}",0.979592,0.991736,0.979079,0.983471,0.987342,0.984244,0.004788,8
32,0.665082,0.094366,0.001,0.000633315,"(130,)",150,"{'hidden_layer_sizes': (130,), 'max_iter': 150}",0.97541,0.995816,0.978903,0.983471,0.987342,0.984188,0.00708,9
1,0.127414,0.003525,0.000876,0.0002503949,"(10,)",100,"{'hidden_layer_sizes': (10,), 'max_iter': 100}",0.967742,0.991597,0.991736,0.983471,0.983193,0.983548,0.008738,10


In [40]:
# Full parameter set of the best estimator found by the grid search.
clf.best_estimator_.get_params(deep=True)

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (20,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 100,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

# Сохраним данную модель

In [48]:
import os

# open() does not create missing directories, so make sure the target exists.
os.makedirs('models', exist_ok=True)

# Persist the best network found by the grid search.
with open('models/model.pkl', 'wb') as f:
    pickle.dump(clf.best_estimator_, f)

# The network was trained on standardized features, so the fitted scaler is
# stored next to it; new data needs the same transformation before predict().
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)