In [1]:
!pip install pymorphy2
!wget -O pymorphy2-dicts-ru.tar.gz https://files.pythonhosted.org/packages/b2/b4/732ff6eeac8c9ea22e7e1c7a321b21b6f3ba19d5e0a8925f35da9c8ebbb2/pymorphy2-dicts-ru-2.4.404381.4453942.tar.gz

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 911 kB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 2.6 MB/s 
[?25hInstalling collected packages: dawg-python, pymorphy2-dicts-ru, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
--2024-08-27 08:01:15--  https://files.pythonhosted.org/packages/b2/b4/732ff6eeac8c9ea22e7e1c7a321b21b6f3ba19d5e0a8925f35da9c8ebbb2/pymorphy2-dicts-ru-2.4.404381.4453942.tar.gz
Resolving files.pythonhosted.org (files.pythonhosted.org)... 199.232.96.223, 2a04:4e42:2000::223, 2a04:4e42:3000::2

In [2]:
import pickle
import pandas as pd
import tarfile
import gc
import ast
import re

from tqdm import tqdm
from multiprocessing import Pool

from string import punctuation

import fasttext
import pymorphy2

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
# Mode switch for the notebook; the two stages further down are mutually exclusive.
#   TRAIN_MODEL = True  -> load already-preprocessed texts from a pickle and train fastText.
#   TRAIN_MODEL = False -> run the tokenise/lemmatise pass and pickle its result instead.
TRAIN_MODEL = True
PREPROCESS_DATA = not TRAIN_MODEL

In [4]:
# Lookup tables consulted for every token in `tokenize`. Sets are used because they
# are only ever tested for membership, which is O(1) on a set and O(n) on a list.
stop = set(stopwords.words('russian'))
# ASCII punctuation plus the quote tokens listed here explicitly.
punkt = set(punctuation) | {"`", "``", "''", "'"}

# Unpack the dictionary archive downloaded in the first cell into the working directory.
# The context manager closes the archive even if extraction fails.
# NOTE(review): extractall() trusts the member paths stored in the archive; only use it
# on archives from a trusted source.
with tarfile.open('./pymorphy2-dicts-ru.tar.gz') as rudict:
    rudict.extractall()

# Morphological analyser backed by the dictionary files extracted above.
lemmatizer = pymorphy2.MorphAnalyzer(
    path='./pymorphy2-dicts-ru-2.4.404381.4453942/pymorphy2_dicts_ru/data',
    lang='ru'
)

In [5]:
def _load_parquet(path, columns=None):
    """Read ``path`` with the pyarrow engine, keeping only ``columns`` when given."""
    return pd.read_parquet(path, columns=columns, engine='pyarrow')


# Product texts: only the title and the description are loaded.
text_and_bert = _load_parquet(
    '/kaggle/input/extracted_data/text_and_bert.parquet',
    columns=['name', 'description'],
)

# Product ids together with their raw category payload.
attrs = _load_parquet(
    '/kaggle/input/extracted_data/attributes.parquet',
    columns=['variantid', 'categories'],
)

# Labelled product pairs (all columns).
train_pairs = _load_parquet('/kaggle/input/extracted_data/train.parquet')

In [6]:
# Put the two tables side by side. concat(axis=1) pairs rows by index, not by
# `variantid`, and `text_and_bert` was loaded with only `name` and `description`,
# so the result is correct only if both parquet files list products in the same order.
# NOTE(review): confirm that ordering. If text_and_bert.parquet also carries a
# `variantid` column, merging on it would be safer — TODO verify.
data = pd.concat([attrs, text_and_bert], axis=1)

# The source frames are not used again; drop the names and trigger a collection.
del text_and_bert, attrs
gc.collect()

28

In [7]:
def extract_categories(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``categories`` column with one ``category_level_N`` column per key.

    Every cell of ``df['categories']`` is parsed with ``ast.literal_eval`` and the
    resulting dicts are flattened with ``pd.json_normalize``. The flattened columns
    are renamed by position to ``category_level_1``, ``category_level_2``, ...

    Args:
        df: frame containing a ``categories`` column of dict literals stored as text.

    Returns:
        A new frame with ``categories`` dropped and the level columns appended.
        ``df`` itself is not modified.

    Raises:
        KeyError: if ``df`` has no ``categories`` column.
        ValueError / SyntaxError: if a cell is not a valid Python literal.
    """
    parsed = df['categories'].apply(ast.literal_eval).tolist()
    categories = pd.json_normalize(parsed)
    categories.columns = [f'category_level_{i+1}' for i in range(categories.shape[1])]
    # json_normalize builds a fresh 0..n-1 index from the list. Re-attach the caller's
    # index so the join below pairs each row with its own categories even when `df`
    # does not have a default RangeIndex (for example after filtering or sorting).
    categories.index = df.index
    return df.drop(columns=['categories']).join(categories)

# Swap the raw `categories` column for flat category_level_N columns.
# Single-shot: after this line `data` no longer has a `categories` column, so running
# the cell again raises a KeyError inside extract_categories.
data = extract_categories(data)

In [8]:
# Bring the pair id columns in line with the `_1` / `_2` suffix scheme used below.
pair_id_columns = {
    'variantid1': 'variantid_1',
    'variantid2': 'variantid_2',
}
train_pairs.rename(columns=pair_id_columns, inplace=True)

# Attach the product table twice: once for each side of the pair. Every column of
# `data` is suffixed, so the id column becomes the merge key for that side.
train_df = train_pairs
for suffix, key in (('_1', 'variantid_1'), ('_2', 'variantid_2')):
    train_df = train_df.merge(data.add_suffix(suffix), on=key)

# The pair-level second category is taken from the first product.
train_df['category_level_2'] = train_df['category_level_2_1']

In [9]:
# Marker placed between the three parts of the combined text.
separator = "  __SEP__ "

# Name, third and fourth category levels, and the first 1000 characters of the
# description, for each side of the pair.
first_item = (
    "Первый товар: " + train_df['name_1'].astype(str) + ", " +
    train_df['category_level_3_1'].astype(str) + ", " +
    train_df['category_level_4_1'].astype(str) + ", " +
    train_df['description_1'].astype(str).str.slice(stop=1000)
)
second_item = (
    "Второй товар: " + train_df['name_2'].astype(str) + ", " +
    train_df['category_level_3_2'].astype(str) + ", " +
    train_df['category_level_4_2'].astype(str) + ", " +
    train_df['description_2'].astype(str).str.slice(stop=1000)
)

# Layout: <category level 2> SEP <first product> SEP <second product>
train_df['text'] = (
    train_df['category_level_2'].astype(str) + separator +
    first_item + separator +
    second_item
)

# The intermediate pieces are only needed to assemble the column above.
del separator, first_item, second_item

text = train_df['text']
target = train_df['target']

In [10]:
# Only `text` and `target` are used from here on; drop the merged frame's name and
# ask the garbage collector to reclaim what it can.
del train_df
gc.collect()

516

In [11]:
# Both patterns are compiled once here rather than on every call.
# Shortest run of characters between '<' and '>' ('.' does not cross a newline).
_HTML_TAG_RE = re.compile('<.*?>')
# One or more characters from four fixed ranges:
# U+1F600-1F64F, U+1F300-1F5FF, U+1F680-1F6FF, U+1F1E0-1F1FF.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"
                       u"\U0001F300-\U0001F5FF"
                       u"\U0001F680-\U0001F6FF"
                       u"\U0001F1E0-\U0001F1FF"
                       "]+", flags=re.UNICODE)


def remove_html_tags_and_emoji(text):
    """Strip ``<...>`` tags, turn newlines into spaces and drop emoji characters.

    The steps run in that order, so a tag split across two lines is not removed.

    Args:
        text: string to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    without_tags = _HTML_TAG_RE.sub('', text)
    single_line = without_tags.replace('\n', ' ')
    return _EMOJI_RE.sub(r'', single_line)

# Clean every combined text; `text` changes from a pandas Series to a plain list here.
text = [remove_html_tags_and_emoji(t) for t in text]
# Labels as a plain list. This makes the cell single-shot: on a second run `target`
# is already a list, which has no `.tolist()` method.
target = target.tolist()

In [12]:
def tokenize(sent):
    """Split ``sent`` into tokens and drop stop words and punctuation tokens.

    Args:
        sent: text to tokenise.

    Returns:
        List of tokens that are in neither ``stop`` nor ``punkt``. An empty list is
        returned when the tokenizer fails on the input (for example when ``sent``
        is not a string).

    Raises:
        LookupError: propagated from the tokenizer. NLTK raises it when a required
            data resource is missing; swallowing it would silently turn every
            sentence into an empty list.
    """
    try:
        words = word_tokenize(sent)
    except LookupError:
        # A missing resource is a setup problem, not a bad input: fail loudly.
        raise
    except Exception:
        # Best effort for unusable inputs. Unlike a bare `except`, this lets
        # KeyboardInterrupt and SystemExit through.
        return []
    return [word for word in words if word not in stop and word not in punkt]

def lemmatize(sent):
    """Join the first normal form of every token in ``sent`` with single spaces.

    Args:
        sent: iterable of tokens, e.g. the output of ``tokenize``.

    Returns:
        The space-joined normal forms. If anything fails for any token, the whole
        result is a single space ``' '``.
    """
    try:
        return ' '.join(lemmatizer.normal_forms(word)[0] for word in sent)
    except Exception:
        # Best-effort fallback. `except Exception` rather than a bare `except`, so
        # KeyboardInterrupt / SystemExit still stop a long preprocessing run.
        return ' '

def preprocess_sent(sent):
    """Tokenise ``sent`` and return its lemmas as one space-joined string."""
    tokens = tokenize(sent)
    return lemmatize(tokens)

In [13]:
if PREPROCESS_DATA:
    # Tokenise and lemmatise every text in 16 worker processes. imap yields results
    # in input order, so titles_preprocessed[i] still corresponds to text[i].
    # chunksize batches the work sent to each worker; the default of 1 sends one
    # text at a time, which is slow on a long iterable. Results are unchanged.
    with Pool(16) as p:
        titles_preprocessed = list(
            tqdm(
                p.imap(
                    preprocess_sent,
                    text,
                    chunksize=256
                ),
                total=len(text)
            )
        )

    # Persist the result so the training stage can load it without redoing the pass.
    with open('titles_preprocessed.pkl', 'wb') as write_titles:
        pickle.dump(titles_preprocessed, write_titles)

In [14]:
if TRAIN_MODEL:
    # NOTE(review): pickle.load can execute arbitrary code from the file; only load
    # pickles produced by a trusted run of the preprocessing stage.
    with open('/kaggle/input/fasttext-ozon/titles_preprocessed.pkl', 'rb') as file:
        titles_preprocessed = pickle.load(file)

    # The texts come from an earlier run while the labels come from this one; they
    # are paired by position, so a length mismatch means the pairing is wrong.
    assert len(titles_preprocessed) == len(target), (
        f"{len(titles_preprocessed)} preprocessed texts but {len(target)} labels"
    )

    # Row positions held out for evaluation.
    test_indices = pd.read_csv('/kaggle/input/test-indices/index.csv')['0'].tolist()
    # Set copy for the membership tests below: checking against the list would scan
    # it once per row. The list is kept to preserve the order of the test split.
    test_index_set = set(test_indices)

    title_train = [title for i, title in enumerate(titles_preprocessed) if i not in test_index_set]
    y_train = [y for i, y in enumerate(target) if i not in test_index_set]

    title_test = [titles_preprocessed[i] for i in test_indices]
    y_test = [target[i] for i in test_indices]

    del titles_preprocessed, test_indices, test_index_set, target
    gc.collect()

    # fastText supervised format: one example per line, "__label__<y> <text>".
    with open('train_data_titles.txt', 'w', encoding='utf-8') as tr:
        for label, title in zip(y_train, title_train):
            tr.write('__label__' + str(label) + ' ' + title + '\n')

    with open('test_data_titles.txt', 'w', encoding='utf-8') as te:
        for label, title in zip(y_test, title_test):
            te.write('__label__' + str(label) + ' ' + title + '\n')

    # TODO: use the test file for optimisation (translated from the original note;
    # presumably hyper-parameter tuning — confirm intent).
    ft_model = fasttext.train_supervised(input='train_data_titles.txt')
    ft_model.save_model('fast_ozon.model')

In [15]:
# Prefix written in front of every label in the fastText training files.
_LABEL_PREFIX = '__label__'


def _label_to_int(label):
    """Return the integer that follows the ``__label__`` prefix of a fastText label."""
    if label.startswith(_LABEL_PREFIX):
        label = label[len(_LABEL_PREFIX):]
    return int(label)


def get_prediction_w_preprocess(text):
    """Preprocess raw ``text`` with ``preprocess_sent`` and return the predicted label.

    Args:
        text: raw, unprocessed text.

    Returns:
        The top predicted label as an int.
    """
    return get_prediction(preprocess_sent(text))


def get_prediction(text):
    """Return the top label ``ft_model`` predicts for already-preprocessed ``text``.

    The whole label after the prefix is parsed, not just its last character, so
    labels with more than one digit are decoded correctly.

    Args:
        text: preprocessed text, passed to ``ft_model.predict`` unchanged.

    Returns:
        The top predicted label as an int.

    Raises:
        ValueError: if the label text after the prefix is not an integer.
    """
    top_labels = ft_model.predict(text)[0]
    return _label_to_int(top_labels[0])

In [16]:
# Predicted integer label for every held-out text. A comprehension is used so the
# loop variable does not overwrite the module-level `text` list built earlier.
test_preds = [get_prediction(title) for title in tqdm(title_test)]

100%|██████████| 50000/50000 [00:04<00:00, 11427.60it/s]


In [17]:
from sklearn.metrics import precision_recall_curve, auc

# NOTE(review): test_preds holds the integer labels returned by get_prediction, not
# scores or probabilities, so the curve has at most one threshold per distinct label
# and the area below reflects a single operating point. Passing the model's
# probability for the positive label would give a real PR curve — TODO confirm
# which metric is intended.
precision, recall, thresholds = precision_recall_curve(y_test, test_preds)
# Area under the precision-recall curve, with recall on the x axis.
pr_auc = auc(recall, precision)

print(f"PR AUC: {pr_auc}")

PR AUC: 0.7922310662860605


In [18]:
# from concurrent.futures import ThreadPoolExecutor

# test_preds_parallel = []
# with ThreadPoolExecutor() as executor:
#     futures = [executor.submit(get_prediction, text) for text in title_test]
#     for future in tqdm(futures):
#         test_preds_parallel.append(future.result())