In [1]:
import pickle
import warnings

import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from utils import TextTransformer

In [2]:
# Suppress warnings of every category for the rest of the kernel session
# (no category/module filter is given, so the "ignore" action applies to all).
# NOTE(review): this also hides any warnings the estimators fitted below may
# raise during training - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Seed handed to every estimator in this notebook that accepts `random_state`.
RANDOM_STATE: int = 42

# Location of the training CSV, relative to the notebook's working directory.
DATA_PATH: str = "../data/train.csv"

# Чтение исходного датасета

In [4]:
# Read the training CSV from DATA_PATH into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message if the columns that every pipeline below
# is fitted on ('text' as input, 'class' as target) are not present.
missing_columns = {'text', 'class'} - set(df.columns)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"

# Last expression of the cell: display the column index.
df.columns

Index(['text', 'class'], dtype='object')

# Обучение моделей

## Логистическая регрессия

In [5]:
%%time

# Three chained steps: the project's TextTransformer (from utils), a TF-IDF
# vectorizer with sublinear term-frequency scaling, and a seeded logistic
# regression as the final estimator.
logistic_clf = Pipeline(
    steps=[
        ("text_transformer", TextTransformer()),
        ("tf_idf", TfidfVectorizer(sublinear_tf=True)),
        ("classifier", LogisticRegression(random_state=RANDOM_STATE)),
    ]
)

# Fit on the 'text' column against the 'class' labels; `fit` is the cell's
# last expression, so the fitted pipeline's repr is displayed.
logistic_clf.fit(X=df["text"], y=df["class"])

100%|██████████| 15449/15449 [06:43<00:00, 38.33it/s]
100%|██████████| 15449/15449 [00:00<00:00, 23350.17it/s]
100%|██████████| 15449/15449 [00:00<00:00, 112404.66it/s]


CPU times: total: 7min 13s
Wall time: 7min 13s


Pipeline(steps=[('text_transformer',
                 <utils.TextTransformer object at 0x000001D92C6C22E0>),
                ('tf_idf', TfidfVectorizer(sublinear_tf=True)),
                ('classifier', LogisticRegression(random_state=42))])

## LinearSVC

In [7]:
%%time

# Same two leading steps as the other models (utils.TextTransformer, then
# TF-IDF with sublinear tf scaling), ending in a seeded LinearSVC.
linear_svc = Pipeline(
    steps=[
        ("text_transformer", TextTransformer()),
        ("tf_idf", TfidfVectorizer(sublinear_tf=True)),
        ("classifier", LinearSVC(random_state=RANDOM_STATE)),
    ]
)

# Train on the full frame: 'text' is the input, 'class' the target.
# The fitted pipeline is returned by `fit` and shown as the cell output.
linear_svc.fit(X=df["text"], y=df["class"])

100%|██████████| 15449/15449 [06:12<00:00, 41.48it/s]
100%|██████████| 15449/15449 [00:00<00:00, 26925.45it/s]
100%|██████████| 15449/15449 [00:00<00:00, 124826.00it/s]


CPU times: total: 6min 11s
Wall time: 6min 16s


Pipeline(steps=[('text_transformer',
                 <utils.TextTransformer object at 0x000001D956FAE460>),
                ('tf_idf', TfidfVectorizer(sublinear_tf=True)),
                ('classifier', LinearSVC(random_state=42))])

## Random Forest

In [8]:
%%time

# utils.TextTransformer -> TF-IDF (sublinear tf) -> random forest.
# The forest uses 60 trees, a depth cap of 150, a fixed seed, and n_jobs=-1.
random_forest = Pipeline(
    steps=[
        ("text_transformer", TextTransformer()),
        ("tf_idf", TfidfVectorizer(sublinear_tf=True)),
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=60,
                max_depth=150,
                n_jobs=-1,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

# Fit on 'text' -> 'class'; the fitted pipeline is the displayed cell output.
random_forest.fit(X=df["text"], y=df["class"])

100%|██████████| 15449/15449 [06:06<00:00, 42.11it/s]
100%|██████████| 15449/15449 [00:00<00:00, 25845.94it/s]
100%|██████████| 15449/15449 [00:00<00:00, 122604.22it/s]


CPU times: total: 6min 33s
Wall time: 6min 13s


Pipeline(steps=[('text_transformer',
                 <utils.TextTransformer object at 0x000001D95ACE0DC0>),
                ('tf_idf', TfidfVectorizer(sublinear_tf=True)),
                ('classifier',
                 RandomForestClassifier(max_depth=150, n_estimators=60,
                                        n_jobs=-1, random_state=42))])

## K-Nearest Neighbours Classifier

In [9]:
%%time

# utils.TextTransformer -> TF-IDF (sublinear tf) -> k-neighbours classifier
# configured with a single neighbour (n_neighbors=1) and n_jobs=-1.
neighbours = Pipeline(
    steps=[
        ("text_transformer", TextTransformer()),
        ("tf_idf", TfidfVectorizer(sublinear_tf=True)),
        ("classifier", KNeighborsClassifier(n_jobs=-1, n_neighbors=1)),
    ]
)

# Fit on 'text' -> 'class'; the fitted pipeline is the displayed cell output.
neighbours.fit(X=df["text"], y=df["class"])

100%|██████████| 15449/15449 [06:16<00:00, 41.00it/s]
100%|██████████| 15449/15449 [00:00<00:00, 20584.07it/s]
100%|██████████| 15449/15449 [00:00<00:00, 109698.68it/s]


CPU times: total: 6min 11s
Wall time: 6min 18s


Pipeline(steps=[('text_transformer',
                 <utils.TextTransformer object at 0x000001D955F99490>),
                ('tf_idf', TfidfVectorizer(sublinear_tf=True)),
                ('classifier', KNeighborsClassifier(n_jobs=-1, n_neighbors=1))])

## XGBoost Classifier

In [10]:
%%time

# utils.TextTransformer -> TF-IDF (sublinear tf) -> XGBoost classifier.
# Only the seed and n_jobs=-1 are set; every other XGBClassifier parameter
# is left at the library default.
xgb = Pipeline(
    steps=[
        ("text_transformer", TextTransformer()),
        ("tf_idf", TfidfVectorizer(sublinear_tf=True)),
        ("classifier", XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
    ]
)

# Fit on 'text' -> 'class'; the fitted pipeline is the displayed cell output.
xgb.fit(X=df["text"], y=df["class"])

100%|██████████| 15449/15449 [06:07<00:00, 42.05it/s]
100%|██████████| 15449/15449 [00:00<00:00, 24410.76it/s]
100%|██████████| 15449/15449 [00:00<00:00, 66425.15it/s]


CPU times: total: 22min 30s
Wall time: 8min 24s


Pipeline(steps=[('text_transformer',
                 <utils.TextTransformer object at 0x000001D9575CDB50>),
                ('tf_idf', TfidfVectorizer(sublinear_tf=True)),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=-1, num_parallel_tree=1,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                         

## LGBMClassifier

In [11]:
%%time

# utils.TextTransformer -> TF-IDF (sublinear tf) -> LightGBM classifier.
# Only the seed and n_jobs=-1 are set; every other LGBMClassifier parameter
# is left at the library default.
lgbm = Pipeline(
    steps=[
        ("text_transformer", TextTransformer()),
        ("tf_idf", TfidfVectorizer(sublinear_tf=True)),
        ("classifier", LGBMClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
    ]
)

# Fit on 'text' -> 'class'; the fitted pipeline is the displayed cell output.
lgbm.fit(X=df["text"], y=df["class"])

100%|██████████| 15449/15449 [06:04<00:00, 42.33it/s]
100%|██████████| 15449/15449 [00:00<00:00, 21943.96it/s]
100%|██████████| 15449/15449 [00:00<00:00, 67657.76it/s]


CPU times: total: 19min 15s
Wall time: 7min 59s


Pipeline(steps=[('text_transformer',
                 <utils.TextTransformer object at 0x000001D957692D30>),
                ('tf_idf', TfidfVectorizer(sublinear_tf=True)),
                ('classifier', LGBMClassifier(random_state=42))])

# Сохранение моделей

In [12]:
# Serialize each fitted pipeline to its own pickle file in the current
# working directory. The mapping keeps the file name next to the object it
# stores, so adding or removing a model is a one-line change.
# NOTE: each pipeline contains a utils.TextTransformer instance, so `utils`
# must be importable wherever these files are unpickled; only load pickle
# files that come from a trusted source.
fitted_models = {
    'logr.pkl': logistic_clf,
    'svm.pkl': linear_svc,
    'xgb.pkl': xgb,
    'rf.pkl': random_forest,
    'knn.pkl': neighbours,
    'lgbm.pkl': lgbm,
}

# Dicts preserve insertion order, so files are written in the order listed.
for filename, model in fitted_models.items():
    with open(filename, 'wb') as file:
        pickle.dump(model, file)