In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import joblib
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Punctuation characters that are turned into a space before tokenising.
symb = {ch: ' ' for ch in ',/\\-().|!?:"\''}
_symb_table = str.maketrans(symb)


def clean_query(query):
    """Normalise one raw search query for vectorisation.

    Every character listed in ``symb`` is replaced by a space, the text is
    split on whitespace, and tokens of length 1 are dropped.

    Parameters
    ----------
    query : str or missing value
        Raw query text. Missing values (NaN/None) are treated as an empty
        query instead of raising; other non-string values are stringified.

    Returns
    -------
    str
        The kept tokens, each followed by a single space. The trailing space
        is kept on purpose: the char-level TF-IDF sees it, so stripping it
        would change the features compared with earlier runs.
    """
    if not isinstance(query, str):
        # pandas reads empty CSV cells as NaN floats, which have no string
        # methods; map them to an empty query rather than crashing.
        query = '' if pd.isna(query) else str(query)
    tokens = query.translate(_symb_table).split()
    return ''.join(tok + ' ' for tok in tokens if len(tok) > 1)


# Iterate over values rather than index labels, so this works for any index.
tr_train = [clean_query(q) for q in train['Query']]
tr_test = [clean_query(q) for q in test['Query']]
train['Query'] = tr_train
test['Query'] = tr_test

X_train = train['Query'].astype(str)
y_train = train['CategoryID'].astype(int)
X_test = test['Query'].astype(str)

# Word-level features: unigrams and bigrams with sublinear (1 + log tf) scaling.
word_vec = TfidfVectorizer(
    analyzer='word', ngram_range=(1, 2), min_df=1, sublinear_tf=True, lowercase=True
)
# Character-level features: 3- to 5-grams taken over the whole string
# (analyzer='char', so n-grams may span word boundaries).
char_vec = TfidfVectorizer(
    analyzer='char', ngram_range=(3, 5), min_df=1, sublinear_tf=True, lowercase=True
)
# Concatenate both sparse feature blocks side by side.
vectoriser = FeatureUnion([
    ('word', word_vec),
    ('char', char_vec)
])

# `n_jobs` is intentionally not passed: the liblinear solver ignores it and
# only emits a warning, so `n_jobs=-1` gave a false impression of parallelism.
# NOTE(review): recent scikit-learn releases deprecate liblinear for
# multiclass targets — confirm against the installed version before upgrading.
clf = LogisticRegression(max_iter=400, class_weight='balanced', solver='liblinear', random_state=42)

# Vectorisation lives inside the pipeline so it is refit on whatever rows
# `pipe.fit` receives.
pipe: Pipeline = Pipeline([
    ('vect', vectoriser),
    ('clf', clf)
])

# 5-fold stratified cross-validation; the whole pipeline (TF-IDF included) is
# refit on the training part of every fold and scored with macro-F1 on the rest.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
for fold, (fit_rows, holdout_rows) in enumerate(skf.split(X_train, y_train), start=1):
    X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

    pipe.fit(X_fit, y_fit)
    score = f1_score(y_holdout, pipe.predict(X_holdout), average='macro')

    f1_scores.append(score)
    print(f'  Fold {fold}: {score:.4f}')

# Mean and standard deviation of the per-fold scores.
print(f'oF1: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}')

# Refit the pipeline on every labelled row before predicting the test set.
print('Training on full data…')
pipe.fit(X_train, y_train)

print('Predicting test set…')
test_pred = pipe.predict(X_test)

# Use the sample submission as a template and overwrite its CategoryID column.
# NOTE(review): presumably its rows are in the same order as test.csv — confirm.
submission = pd.read_csv('sample_submission.csv').assign(CategoryID=test_pred)
submission.to_csv('submission.csv', index=False)

  Fold 1: 0.9177
  Fold 2: 0.9126
  Fold 3: 0.9025
  Fold 4: 0.9297
  Fold 5: 0.8942
oF1: 0.9113 ± 0.0123
Training on full data…
Predicting test set…


In [17]:
# Raise the display limit (kernel-wide) so up to 1000 rows render untruncated.
pd.set_option('display.max_rows', 1000)

# Show the first 1000 rows of the training frame as the cell output.
train.head(n=1000)

Unnamed: 0,ID,CategoryID,Query
0,1,1,edifier
1,2,2,умная колонка яндекс станция лайт
2,3,3,чехол iphone 14 pro
3,4,4,zte
4,5,5,ноутбук huawei matebook b3 520 53012kfg
5,6,1,мультимедиа акустика sven ms 2050bl
6,7,6,ssd samsung 870 evo 500gb
7,8,7,кабели переходники baseus
8,9,8,переходник сетевой robiton travelenergy
9,10,9,держатели для смартфонов авто
