In [1]:
from copy import deepcopy
from tqdm import tqdm
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import plotly.express as px

In [2]:
# Load the raw dataset. Later cells merge extra columns onto it by 'item_id'.
df = pd.read_csv('data.csv')

In [3]:
# Seed passed as random_state to the models for reproducible fits.
RANDOM_STATE = 42

# NOTE(review): presumably the train / validation / test fractions (they sum to 1.0).
# The cells shown here take the split from 'splitting_sample.csv' instead — confirm
# these constants are still used.
TRAIN_SIZE = 0.6
VAL_SIZE = 0.15
TEST_SIZE = 0.25

# Name of the label column used as the prediction target.
TARGET = 'attr_value_name'

In [55]:
def calc_metrics(
        model: LogisticRegression,
        vect: CountVectorizer | TfidfVectorizer,
        sample_part: str = 'test',
        multi_class: str = 'ovr'
):
    """Print ROC AUC (macro and micro) and accuracy for one part of the sample.

    Reads the notebook-level ``df``: rows whose ``sample_part`` column equals
    ``sample_part`` are selected, their ``description`` texts are vectorized
    with ``vect`` and scored with ``model`` against the ``TARGET`` column.

    Args:
        model: Fitted classifier; its ``predict``, ``predict_proba`` and
            ``classes_`` are used.
        vect: Fitted vectorizer used to transform the descriptions.
        sample_part: Value of ``df['sample_part']`` selecting the rows to score.
        multi_class: Multiclass strategy forwarded to ``roc_auc_score``.

    Returns:
        None. The metrics are printed.
    """
    # Build the row mask once so features and labels come from the same rows.
    part_rows = df['sample_part'] == sample_part
    features = vect.transform(df.loc[part_rows, 'description'])
    target = df.loc[part_rows, TARGET]

    proba = model.predict_proba(features)
    prediction = model.predict(features)

    # The columns of `proba` follow model.classes_. Passing that order explicitly
    # keeps the scores aligned with the labels even when the evaluated part does
    # not contain exactly the same label set as the training data; a mismatch
    # then raises instead of being scored against the wrong columns.
    auc_macro = roc_auc_score(
        y_score=proba,
        y_true=target,
        average='macro',
        multi_class=multi_class,
        labels=model.classes_
    )
    auc_micro = roc_auc_score(
        y_score=proba,
        y_true=target,
        average='micro',
        multi_class=multi_class,
        labels=model.classes_
    )

    accuracy = accuracy_score(
        y_pred=prediction,
        y_true=target
    )

    print(f'Metrics on {sample_part}:\n'
          f'\tAUC (macro): {auc_macro:.3f}\n'
          f'\tAUC (micro): {auc_micro:.3f}\n'
          f'\tAccuracy: {accuracy:.3f}\n')

Разбиение выборки

In [9]:
# Attach the precomputed sample split to every item
# (presumably this file provides the 'sample_part' column used below — confirm).
df_splitting = pd.read_csv('splitting_sample.csv')
# validate='many_to_one' makes the merge fail loudly if the split file lists an
# item_id more than once, instead of silently duplicating rows of df.
df = df.merge(df_splitting, on='item_id', how='left', validate='many_to_one')

Подгружаем разметку, полученную бейзлайном

In [10]:
# Attach the baseline predictions to every item.
df_baseline = pd.read_csv('baseline_prediction.csv')
# validate='many_to_one' makes the merge fail loudly if the baseline file lists an
# item_id more than once, instead of silently duplicating rows of df.
df = df.merge(df_baseline, on='item_id', how='left', validate='many_to_one')

# BoW (без предобработки текста)

In [25]:
# NLTK's Russian stop-word list; passed as stop_words to the vectorizers below.
russian_stopwords = stopwords.words("russian")
# Preview the first ten entries.
russian_stopwords[:10]

['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со']

In [26]:
# Bag-of-words over the raw descriptions: word 1- to 3-grams with Russian
# stop words removed and a document-frequency floor of 100.
bow_vect = CountVectorizer(
    analyzer='word',
    token_pattern='(?u)\\b\\w\\w+\\b',
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words=russian_stopwords,
    ngram_range=(1, 3),
    min_df=100,
    max_df=1.0,
    max_features=None,
)

# Learn the vocabulary on the training part only.
bow_vect.fit(df.loc[df['sample_part'].eq('train'), 'description'])

In [31]:
# Number of terms in the fitted bag-of-words vocabulary.
len(bow_vect.vocabulary_)

17449

Обучаем логистическую регрессию

In [37]:
# One-vs-rest logistic regression on the bag-of-words features.
lr_bow = LogisticRegression(
    multi_class='ovr',
    max_iter=500,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Fit on the training part: vectorized descriptions against the target labels.
lr_bow.fit(
    bow_vect.transform(df.loc[df['sample_part'].eq('train'), 'description']),
    df.loc[df['sample_part'].eq('train'), TARGET],
)

In [48]:
# Score the bag-of-words model on the held-out test part.
calc_metrics(model=lr_bow, vect=bow_vect, sample_part='test')

Metrics on test:
	AUC (macro): 0.818
	AUC (micro): 0.856
	Accuracy: 0.632



Неплохо. Во всяком случае, бейзлайн побит.

# TF-IDF (без предобработки текста)

In [51]:
# TF-IDF over the raw descriptions, configured like the bag-of-words vectorizer
# (word 1- to 3-grams, Russian stop words removed, min_df=100) with L2-normalised rows.
tfidf_vect = TfidfVectorizer(
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words=russian_stopwords,
    token_pattern='(?u)\\b\\w\\w+\\b',
    ngram_range=(1, 3),
    analyzer='word',
    max_df=1.0,
    min_df=100,
    max_features=None,
    norm='l2'
)

# Learn the vocabulary and IDF weights on the training part only.
tfidf_vect.fit(df.loc[df['sample_part'] == 'train', 'description'])
# Report the size of the TF-IDF vocabulary (the cell previously inspected bow_vect).
len(tfidf_vect.vocabulary_)

17449

In [52]:
# One-vs-rest logistic regression on the TF-IDF features.
lr_tfidf = LogisticRegression(
    multi_class='ovr',
    max_iter=500,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Fit on the training part: vectorized descriptions against the target labels.
lr_tfidf.fit(
    tfidf_vect.transform(df.loc[df['sample_part'].eq('train'), 'description']),
    df.loc[df['sample_part'].eq('train'), TARGET],
)

In [53]:
# Score the TF-IDF model on the held-out test part.
calc_metrics(model=lr_tfidf, vect=tfidf_vect, sample_part='test')

Metrics on test:
	AUC (macro): 0.855
	AUC (micro): 0.883
	Accuracy: 0.668



Есть улучшения по сравнению с BoW: AUC (macro) 0.818 → 0.855, AUC (micro) 0.856 → 0.883, accuracy 0.632 → 0.668.

# TF-IDF (со стеммингом)

In [None]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose terms are reduced to Russian Snowball stems.

    The parent analyzer runs first, so stemming is applied to the terms it
    emits. Every word of an n-gram is stemmed, not just the n-gram as a
    single string.
    """

    # One stemmer instance shared by all vectorizers of this class.
    ru_stemmer = SnowballStemmer('russian')

    def build_analyzer(self):
        """Return the parent analyzer wrapped so that each emitted term is stemmed word by word."""
        analyzer = super().build_analyzer()
        stem = StemmedTfidfVectorizer.ru_stemmer.stem

        def stemmed_analyzer(doc):
            # The word analyzer emits n-grams as space-joined tokens. Stemming the
            # joined string would only touch its tail, so split the term, stem
            # every word and join it back. Unigrams are stemmed as before.
            return (
                ' '.join(stem(word) for word in term.split(' '))
                for term in analyzer(doc)
            )

        return stemmed_analyzer

In [None]:
# Stemming TF-IDF vectorizer: word uni- and bigrams, Russian stop words removed,
# no document-frequency filtering (min_df=1, max_df=1.0).
tfidf = StemmedTfidfVectorizer(
    analyzer='word',
    token_pattern='(?u)\\b\\w\\w+\\b',
    encoding='utf-8',
    lowercase=True,
    stop_words=russian_stopwords,
    ngram_range=(1, 2),
    min_df=1,
    max_df=1.0,
)

In [None]:
# Helper vectorizer; in this cell it is only used to obtain a tokenizer, it is not fitted here.
# NOTE(review): build_tokenizer() presumably applies only token_pattern, so
# lowercase / stop_words / ngram_range would have no effect on the tokens below — confirm.
count_vect = CountVectorizer(
    encoding='utf-8',
    lowercase=True,
    stop_words=russian_stopwords,
    token_pattern='(?u)\\b\\w\\w+\\b',
    ngram_range=(1, 2),
    analyzer='word',
    max_df=1.0,
    min_df=1
)
tokenizer = count_vect.build_tokenizer()

stemmer = SnowballStemmer('russian') 

# First pass: split every description (all rows of df, not only train) into tokens.
text_tokens = [tokenizer(text) for text in tqdm(df['description'].values)]
# Second pass: stem each token, keeping one list of stems per description.
text_tokens_stem = [
    [stemmer.stem(word) for word in text] for text in tqdm(text_tokens)
    ]
# Store the per-row stem lists alongside the original text.
df['description_tokens_stem'] = text_tokens_stem