In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
DATA_PATH = "../data/train_preprocessed.pkl"
RANDOM_STATE = 42

# Чтение исходного датасета

In [4]:
df = pd.read_pickle(DATA_PATH)
df.columns

Index(['text', 'class', 'spacy', 'lemmatized', 'stemmed',
       'lemmatized_stopwords', 'stemmed_stopwords'],
      dtype='object')

# Предобработка текста

In [5]:
tfidf = TfidfVectorizer(sublinear_tf=True)
text_vectorized = tfidf.fit_transform(df['lemmatized_stopwords'])
text_vectorized

<15449x18878 sparse matrix of type '<class 'numpy.float64'>'
	with 275209 stored elements in Compressed Sparse Row format>

# Разделим выборку на обучающую и валидационную

In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    text_vectorized, df['class'], test_size=0.2, random_state=RANDOM_STATE
)

# Обучение моделей

## 1. Логистическая регрессия

In [7]:
X_train.shape

(12359, 18878)

In [8]:
%%time
logistic_clf = LogisticRegression(random_state=RANDOM_STATE)
logistic_clf.fit(X_train, y_train)

CPU times: total: 59.5 s
Wall time: 47.2 s


LogisticRegression(random_state=42)

In [9]:
logistic_clf.score(X_val, y_val)

0.6737864077669903

In [10]:
logistic_clf.score(X_train, y_train)

0.8113115947892224

In [11]:
print(metrics.precision_score(y_val, logistic_clf.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, logistic_clf.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, logistic_clf.predict(X_val), average='weighted'))

0.7074223877849981
0.6737864077669903
0.6555612653578783


# 2. LinearSVC

In [12]:
%%time
linear_svc = LinearSVC(random_state=RANDOM_STATE)
linear_svc.fit(X_train, y_train)

CPU times: total: 2.16 s
Wall time: 2.17 s


LinearSVC(random_state=42)

In [13]:
linear_svc.score(X_val, y_val)

0.7291262135922331

In [14]:
linear_svc.score(X_train, y_train)

0.9857593656444696

In [15]:
print(metrics.precision_score(y_val, linear_svc.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, linear_svc.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, linear_svc.predict(X_val), average='weighted'))

0.7286112676917453
0.7291262135922331
0.7182029784161587


# 3. Random Forest

In [16]:
%%time
random_forest = RandomForestClassifier(random_state=RANDOM_STATE,
                                       n_estimators=60,
                                       max_depth=150,
                                       n_jobs=-1)
random_forest.fit(X_train, y_train)

CPU times: total: 40.7 s
Wall time: 7.16 s


RandomForestClassifier(max_depth=150, n_estimators=60, n_jobs=-1,
                       random_state=42)

In [17]:
random_forest.score(X_val, y_val)

0.6566343042071198

In [18]:
random_forest.score(X_train, y_train)

0.9976535318391455

In [19]:
print(metrics.precision_score(y_val, random_forest.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, random_forest.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, random_forest.predict(X_val), average='weighted'))

0.7000315681125266
0.6566343042071198
0.6376598324138238


# 5. K-Neighbours Classifier

In [20]:
%%time
neighbours = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
neighbours.fit(X_train, y_train)

CPU times: total: 31.2 ms
Wall time: 36 ms


KNeighborsClassifier(n_jobs=-1, n_neighbors=1)

In [21]:
neighbours.score(X_val, y_val)

0.5271844660194175

In [22]:
neighbours.score(X_train, y_train)

0.99870539687677

In [23]:
print(metrics.precision_score(y_val, neighbours.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, neighbours.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, neighbours.predict(X_val), average='weighted'))

0.5293421157208686
0.5271844660194175
0.5197316613133615


# 6. XGBoost Classifier

In [24]:
%%time
xgb = XGBClassifier(random_state=RANDOM_STATE, n_jobs=-1)
xgb.fit(X_train, y_train)

CPU times: total: 12min 54s
Wall time: 2min 14s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
xgb.score(X_val, y_val)

0.6711974110032363

In [26]:
xgb.score(X_train, y_train)

0.9951452382878874

In [27]:
print(metrics.precision_score(y_val, xgb.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, xgb.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, xgb.predict(X_val), average='weighted'))

0.6762649694132405
0.6711974110032363
0.6644826723315318


# LGBMClassifier


In [28]:
%%time
lgbm = LGBMClassifier(random_state=RANDOM_STATE, n_estimators=25, n_jobs=-1)
lgbm.fit(X_train, y_train)

CPU times: total: 2min 15s
Wall time: 20.6 s


LGBMClassifier(n_estimators=25, random_state=42)

In [29]:
lgbm.score(X_val, y_val)

0.6453074433656958

In [30]:
lgbm.score(X_train, y_train)

0.9889149607573429

In [31]:
print(metrics.precision_score(y_val, lgbm.predict(X_val), average='weighted'))
print(metrics.recall_score(y_val, lgbm.predict(X_val), average='weighted'))
print(metrics.f1_score(y_val, lgbm.predict(X_val), average='weighted'))


0.651278118847996
0.6453074433656958
0.6378618865202134
