In [2]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
DATA_PATH = "../data/train_preprocessed.pkl"
RANDOM_STATE = 42

# Чтение исходного датасета

In [5]:
df = pd.read_pickle(DATA_PATH)
df.columns

Index(['text', 'class', 'spacy', 'lemmatized', 'stemmed',
       'lemmatized_stopwords', 'stemmed_stopwords'],
      dtype='object')

# Предобработка текста

In [6]:
tfidf = TfidfVectorizer(sublinear_tf=True)
text_vectorized = tfidf.fit_transform(df['lemmatized'])
text_vectorized

<15449x15692 sparse matrix of type '<class 'numpy.float64'>'
	with 351600 stored elements in Compressed Sparse Row format>

# Разделим выборку на обучающую и валидационную

In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    text_vectorized, df['class'], test_size=0.2, random_state=RANDOM_STATE
)

# Обучение моделей

## Логистическая регрессия

In [8]:
X_train.shape

(12359, 15692)

In [9]:
%%time
logistic_clf = LogisticRegression(random_state=RANDOM_STATE)
logistic_clf.fit(X_train, y_train)

CPU times: total: 56.3 s
Wall time: 48.7 s


LogisticRegression(random_state=42)

In [10]:
logistic_clf.score(X_val, y_val)

0.6951456310679611

In [11]:
logistic_clf.score(X_train, y_train)

0.8315397685896917

In [12]:
print(metrics.precision_score(y_val, logistic_clf.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, logistic_clf.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, logistic_clf.predict(X_val), average='weighted'))

0.7202948866430333
0.6951456310679611
0.6768765006852916


## LinearSVC

In [13]:
%%time
linear_svc = LinearSVC(random_state=RANDOM_STATE)
linear_svc.fit(X_train, y_train)

CPU times: total: 3.28 s
Wall time: 3.28 s


LinearSVC(random_state=42)

In [14]:
linear_svc.score(X_val, y_val)

0.7462783171521036

In [15]:
linear_svc.score(X_train, y_train)

0.988186746500526

In [16]:
print(metrics.precision_score(y_val, linear_svc.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, linear_svc.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, linear_svc.predict(X_val), average='weighted'))

0.7435887219962359
0.7462783171521036
0.7353587789260342


In [17]:
%%time

svc = LinearSVC(random_state=RANDOM_STATE)
parameters = {'C': np.arange(0.4, 0.6, 0.01)}
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)

0.7325026292042711
{'C': 0.5200000000000001}
CPU times: total: 3min 26s
Wall time: 3min 27s


In [18]:
print(metrics.precision_score(y_val, clf.best_estimator_.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, clf.best_estimator_.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, clf.best_estimator_.predict(X_val), average='weighted'))

0.7526116237464396
0.7517799352750809
0.7384425907600496


## Random Forest

In [19]:
%%time
random_forest = RandomForestClassifier(random_state=RANDOM_STATE,
                                       n_estimators=60,
                                       max_depth=150,
                                       n_jobs=-1)
random_forest.fit(X_train, y_train)

CPU times: total: 37.3 s
Wall time: 6.31 s


RandomForestClassifier(max_depth=150, n_estimators=60, n_jobs=-1,
                       random_state=42)

In [20]:
random_forest.score(X_val, y_val)

0.6592233009708738

In [21]:
random_forest.score(X_train, y_train)

0.9988672222671737

In [22]:
print(metrics.precision_score(y_val, random_forest.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, random_forest.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, random_forest.predict(X_val), average='weighted'))

0.686763749178562
0.6592233009708738
0.6357131052668307


## K-Neighbours Classifier

In [23]:
%%time
neighbours = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
neighbours.fit(X_train, y_train)

CPU times: total: 46.9 ms
Wall time: 40 ms


KNeighborsClassifier(n_jobs=-1, n_neighbors=1)

In [24]:
neighbours.score(X_val, y_val)

0.5436893203883495

In [25]:
neighbours.score(X_train, y_train)

0.9987863095719719

In [26]:
print(metrics.precision_score(y_val, neighbours.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, neighbours.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, neighbours.predict(X_val), average='weighted'))

0.5421431434895635
0.5436893203883495
0.5356209747269443


## XGBoost Classifier

In [27]:
%%time
xgb = XGBClassifier(random_state=RANDOM_STATE, n_jobs=-1)
xgb.fit(X_train, y_train)

CPU times: total: 15min 13s
Wall time: 2min 27s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
xgb.score(X_val, y_val)

0.6880258899676376

In [29]:
xgb.score(X_train, y_train)

0.9988672222671737

In [30]:
print(metrics.precision_score(y_val, xgb.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, xgb.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, xgb.predict(X_val), average='weighted'))

0.6901739061253217
0.6880258899676376
0.6820635644929403


## LGBMClassifier

In [31]:
%%time
lgbm = LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1)
lgbm.fit(X_train, y_train)

CPU times: total: 10min 20s
Wall time: 1min 47s


LGBMClassifier(random_state=42)

In [32]:
lgbm.score(X_val, y_val)

0.7029126213592233

In [33]:
lgbm.score(X_train, y_train)

0.9988672222671737

In [34]:
print(metrics.precision_score(y_val, lgbm.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, lgbm.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, lgbm.predict(X_val), average='weighted'))

0.7116871072761117
0.7029126213592233
0.6959707154566489
