In [1]:
!pip install razdel
!pip install pymorphy3
!pip install -U pymorphy2-dicts-ru

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)
Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0
Collecting pymorphy3
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.3-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[?25hIn

In [2]:
import os
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

import pickle
import tarfile
from functools import cache

import re
import nltk
import pymorphy3 as pm
from nltk.corpus import stopwords
from razdel import tokenize
from string import punctuation

import fasttext

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import precision_recall_curve, auc

In [3]:
# Register tqdm with pandas so that `progress_apply` (used by the
# preprocessing and inference cells below) is available on DataFrame/Series.
tqdm.pandas()

In [4]:
# 1st stage: preprocess the original (non-swapped) train pairs -> titles_preprocessed_fasttext.pkl

# PREPROCESS_DATA = True
# DO_SWAP = False
# TRAIN_MODEL = False

# GET_REV_OOFS = False

# DO_INFER = False
# DO_INFER_REV = False

# DESC_N = None

In [5]:
# 2nd stage: preprocess the swapped (item 1 <-> item 2) train pairs -> titles_preprocessed_fasttext_rev.pkl

# PREPROCESS_DATA = True
# DO_SWAP = True
# TRAIN_MODEL = False

# GET_REV_OOFS = False

# DO_INFER = False
# DO_INFER_REV = False

# DESC_N = None

In [6]:
# 3rd stage: train the fold models on the default data

# PREPROCESS_DATA = False
# DO_SWAP = False
# TRAIN_MODEL = True # use default data as train data

# GET_REV_OOFS = False

# DO_INFER = False
# DO_INFER_REV = False

# DESC_N = None

In [7]:
# 4th stage: train the fold models on the reversed data

# PREPROCESS_DATA = False
# DO_SWAP = False
# TRAIN_MODEL = True # use reversed data as train data

# GET_REV_OOFS = False

# DO_INFER = False
# DO_INFER_REV = False

# DESC_N = None

In [8]:
# 5th stage (active configuration): inference on the test set, both in the
# original column order and with the two items swapped.

# Build the fastText input lines from the train parquet and pickle them.
PREPROCESS_DATA = False
# When preprocessing, swap the *_1 / *_2 text columns and write the "_rev" pickle.
DO_SWAP = False
# Train one fastText model per CV fold and dump out-of-fold predictions.
TRAIN_MODEL = False

# Score the "_rev" texts with the already saved fold models (oof_preds_rev.pkl).
GET_REV_OOFS = False

# Predict the test set with the five saved fold models (test_preds.pkl).
DO_INFER = True
# Same as DO_INFER, but with the two items of every pair swapped (test_preds_rev.pkl).
DO_INFER_REV = True

# NOTE(review): DESC_N is not referenced by any cell visible here -- presumably
# a leftover limit for the description length; confirm or remove.
DESC_N = None

In [9]:
# Fetch NLTK's stop-word corpus, then build the two filter lists that
# tokenize_() checks every token against.
nltk.download('stopwords')
stop = stopwords.words('russian')
# Every character of string.punctuation plus a few quote-like tokens.
punkt = list(punctuation) + ["`", "``", "''", "'"]

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Shared pymorphy3 analyzer for Russian; normalize() uses it to look up lemmas.
morph = pm.MorphAnalyzer(lang='ru')

In [11]:
# @cache
def tokenize_(sent):
    """Tokenize ``sent`` with razdel and drop stop words and punctuation.

    Tokens are compared to the module-level ``stop`` and ``punkt`` lists
    exactly as they appear in the text (no case folding is applied).

    Returns:
        list[str]: the surviving token strings, in their original order.
    """
    kept = []
    for token in tokenize(sent):
        surface = token.text
        if surface in stop or surface in punkt:
            continue
        kept.append(surface)
    return kept

@cache
def normalize(word):
    """Return the first normal form (lemma) the analyzer proposes for ``word``.

    Results are memoised with ``functools.cache``, so every distinct token is
    analysed only once per kernel session.

    Args:
        word: a single token; it must be hashable because of the cache.

    Returns:
        The lemma string, or ``''`` when the analyzer raises or returns no
        normal forms.
    """
    try:
        return morph.normal_forms(word)[0]
    except Exception:
        # Best-effort fallback: a token that cannot be analysed becomes ''.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate; previously an interrupt
        # was swallowed here and the token was then cached as ''.
        return ''

def lemmatize(sent):
    """Lemmatise every token of ``sent`` and join the results with spaces.

    Args:
        sent: an iterable of token strings.

    Returns:
        str: the space-separated output of ``normalize`` for each token.
    """
    lemmas = (normalize(token) for token in sent)
    return ' '.join(lemmas)

# @cache
def preprocess_sent(sent):
    """Tokenize ``sent``, filter it and return the lemmas as one string.

    This is ``tokenize_`` followed by ``lemmatize``.
    """
    tokens = tokenize_(sent)
    return lemmatize(tokens)

def prepare(row):
    """Build the single fastText input line for one product pair.

    Twelve fields are produced in a fixed order and joined with single
    spaces. For item 1: the preprocessed name with every token prefixed by
    ``_``, the four category levels with every token prefixed by ``~``,
    ``!``, ``@`` and ``#`` respectively (an empty string when the value is
    falsy), and the preprocessed description with newlines replaced by
    spaces. The same six fields follow for item 2, upper-cased.

    Args:
        row: a mapping / DataFrame row holding the ``name_*``,
            ``category_level_*`` and ``description_*`` values of both items.

    Returns:
        str: the combined text for the pair.
    """
    # (upper-case?, name column, (category column, token prefix)..., description column)
    layout = (
        (
            False,
            'name_1',
            (
                ('category_level_1_1', '~'),
                ('category_level_2_1', '!'),
                ('category_level_3_1', '@'),
                ('category_level_4_1', '#'),
            ),
            'description_1',
        ),
        (
            True,
            'name_2',
            (
                ('category_level_1_2', '~'),
                ('category_level_2_2', '!'),
                ('category_level_3_2', '@'),
                ('category_level_4_2', '#'),
            ),
            'description_2',
        ),
    )

    pieces = []
    for shout, name_col, category_cols, desc_col in layout:
        title = preprocess_sent(row[name_col])
        if shout:
            title = title.upper()
        # The leading '' makes the separator appear in front of every token.
        pieces.append(' _'.join([''] + title.split()))

        for col, marker in category_cols:
            raw = row[col]
            if not raw:
                pieces.append('')
                continue
            if shout:
                raw = raw.upper()
            pieces.append(marker.join([''] + raw.split()))

        body = preprocess_sent(row[desc_col].replace('\n', ' '))
        pieces.append(body.upper() if shout else body)

    return ' '.join(pieces)

In [12]:
def remove_html_tags_and_emoji(text):
    """Strip ``<...>`` tags, newlines and a fixed set of emoji from ``text``.

    Steps, in order: remove every shortest ``<...>`` span that sits on one
    line, replace newlines with spaces, then delete runs of characters from
    four emoji code-point ranges.

    Args:
        text: the string to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    emoji_ranges = (
        u"\U0001F600-\U0001F64F",
        u"\U0001F300-\U0001F5FF",
        u"\U0001F680-\U0001F6FF",
        u"\U0001F1E0-\U0001F1FF",
    )

    # Tags are removed before newlines are flattened, so a "<" and ">" that
    # are separated by a newline are not treated as a tag.
    without_tags = re.sub('<.*?>', '', text)
    single_line = without_tags.replace('\n', ' ')

    emoji_re = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_re.sub(r'', single_line)

In [None]:
# Preprocessing stage: turn every train pair into one fastText input line and
# pickle the resulting list. With DO_SWAP the two items of each pair trade
# places first and the output goes to the "_rev" pickle.
if PREPROCESS_DATA:
    train_df1 = pd.read_parquet('../data/preprocessed/train_texts.parquet')

    if DO_SWAP:
        cols_to_swap = [
            # The ids are not swapped, so the sort below orders the rows the
            # same way as for the non-swapped data.
            # ('variantid_1', 'variantid_2'),
            ('name_1', 'name_2'),
            ('description_1', 'description_2'),
            ('category_level_1_1', 'category_level_1_2'),
            ('category_level_2_1', 'category_level_2_2'),
            ('category_level_3_1', 'category_level_3_2'),
            ('category_level_4_1', 'category_level_4_2'),
            ('characteristic_attributes_mapping_1', 'characteristic_attributes_mapping_2')
        ]
    
        # Map each column name to its counterpart, in both directions.
        rename_map = {}
        for col1, col2 in cols_to_swap:
            rename_map[col1] = col2
            rename_map[col2] = col1
    
        train_df2 = train_df1.copy()
        train_df2 = train_df2.rename(columns=rename_map)
        # Restore the original column order after the rename.
        train_df2 = train_df2[train_df1.columns]
    
        train_df = train_df2
        name_f = 'titles_preprocessed_fasttext_rev.pkl'

        # Only the extra names are dropped; `train_df` still refers to the swapped frame.
        del train_df1, train_df2
        gc.collect()
    else:
        train_df = train_df1
        name_f = 'titles_preprocessed_fasttext.pkl'
        
        del train_df1
        gc.collect()

    # Fixed row order: sort by the pair ids, then shuffle with a fixed seed.
    # The training cell repeats these two lines so the pickled texts line up
    # with its rows.
    train_df = train_df.sort_values(by=['variantid_1', 'variantid_2'])
    train_df = train_df.sample(len(train_df), random_state=42).reset_index(drop=True)
    
    text = train_df.progress_apply(prepare, axis=1)
    # The tag/emoji cleanup runs on the already tokenized and lemmatised lines.
    text = [remove_html_tags_and_emoji(t) for t in tqdm(text)]

    with open(name_f, 'wb') as write_titles:
        pickle.dump(text, write_titles)

In [None]:
# Training stage: 5-fold StratifiedGroupKFold over the preprocessed texts.
# For every fold a supervised fastText model is trained and saved, and its
# predictions on the held-out part are stored as out-of-fold scores.
if TRAIN_MODEL:
    train_df = pd.read_parquet('../data/preprocessed/train_texts.parquet')
    
    # NOTE(review): this always loads the "_rev" pickle, while the stage
    # comments above describe one run on the default data and one on the
    # reversed data -- presumably the file name is edited by hand between
    # runs; confirm which texts the saved models were trained on.
    with open('titles_preprocessed_fasttext_rev.pkl', 'rb') as f:
        text = pickle.load(f)

    # Same sort + seeded shuffle as in the preprocessing cell, so that the
    # i-th pickled text belongs to the i-th row.
    train_df = train_df.sort_values(by=['variantid_1', 'variantid_2'])
    train_df = train_df.sample(len(train_df), random_state=42).reset_index(drop=True)
    train_df['text'] = text

    text = train_df['text']
    target = train_df['is_double']
    groups = train_df['group_id']

    del train_df
    gc.collect()

    gkf = StratifiedGroupKFold(n_splits=5)
    oof_preds = np.zeros(len(text))
    
    _it = tqdm(enumerate(gkf.split(text, target, groups)), total=5)
    
    for fold, (train_idx, val_idx) in _it:
        # The index was reset above, so label lookup `text[i]` matches position i.
        text_train = [text[i] for i in train_idx]
        y_train = [target[i] for i in train_idx]
        text_val = [text[i] for i in val_idx]
        y_val = [target[i] for i in val_idx]
    
        train_file = f'train_data_fold{fold}.txt'
        # val_file = f'val_data_fold{fold}.txt'
        
        # fastText reads its training data from a file: one example per line,
        # written as "__label__<target> <text>".
        _it.set_description('writing train to file')
        with open(train_file, 'w+', encoding='utf-8') as tr:
            for idx in range(len(text_train)):
                tr.write('__label__' + str(y_train[idx]) + ' ' + text_train[idx] + '\n')
    
        # _it.set_description('writing val to file')
        # with open(val_file, 'w+', encoding='utf-8') as valf:
        #     for idx in range(len(text_val)):
        #         valf.write('__label__' + str(y_val[idx]) + ' ' + text_val[idx] + '\n')
        
        _it.set_description('training')
        ft_model = fasttext.train_supervised(
            input=train_file,
            dim=300,
        )
        _it.set_description('saving model')
        ft_model.save_model(f'fast_avito_fold{fold}.model')
    
        _it.set_description('predicting')
        val_preds = []
        for text_ in text_val:
            pred = ft_model.predict(text_)
            # pred[0][0] is the predicted label and pred[1][0] its score. If
            # the label ends in '1' the score is used as is, otherwise it is
            # flipped to 1 - score.
            # NOTE(review): assumes `is_double` stringifies to '0'/'1' -- confirm.
            val_preds.append(pred[1][0] if pred[0][0][-1] == '1' else 1-pred[1][0])
        oof_preds[val_idx] = np.array(val_preds)
    
        # Per-fold area under the precision-recall curve.
        precision, recall, thresholds = precision_recall_curve(y_val, val_preds)
        oof_pr_auc = auc(recall, precision)
        print(f'for {fold}\'th fold {oof_pr_auc=}')
    
        # The temporary training file is no longer needed once the model is saved.
        os.remove(train_file)
        # os.remove(val_file)

        del text_train, y_train, text_val, y_val, ft_model, val_preds
        gc.collect()
        
    _it.close()

    with open('oof_preds.pkl', 'wb') as w:
        pickle.dump(oof_preds, w)

    # PR-AUC over all out-of-fold predictions.
    precision, recall, thresholds = precision_recall_curve(target, oof_preds)
    oof_pr_auc = auc(recall, precision)
    print(f'{oof_pr_auc=}')

In [None]:
# Reverse out-of-fold stage: score the swapped ("_rev") train texts with the
# fold models saved by the training cell and store the scores in
# 'oof_preds_rev.pkl'. Nothing is trained here.
if GET_REV_OOFS:
    train_df = pd.read_parquet('../data/preprocessed/train_texts.parquet')

    with open('titles_preprocessed_fasttext_rev.pkl', 'rb') as f:
        text = pickle.load(f)

    # Same sort + seeded shuffle as in the preprocessing cell, so that the
    # i-th pickled text belongs to the i-th row.
    train_df = train_df.sort_values(by=['variantid_1', 'variantid_2'])
    train_df = train_df.sample(len(train_df), random_state=42).reset_index(drop=True)
    train_df['text'] = text

    text = train_df['text']
    target = train_df['is_double']
    groups = train_df['group_id']

    del train_df
    gc.collect()

    # NOTE(review): presumably this reproduces the folds used for training
    # (same splitter settings, same target/group order), so each model only
    # scores rows it has not seen -- verify.
    gkf = StratifiedGroupKFold(n_splits=5)
    oof_preds_rev = np.zeros(len(text))

    _it = tqdm(enumerate(gkf.split(text, target, groups)), total=5)

    for fold, (train_idx, val_idx) in _it:
        # The index was reset above, so label lookup `text[i]` matches position i.
        text_val = [text[i] for i in val_idx]
        y_val = [target[i] for i in val_idx]

        _it.set_description('loading model')
        ft_model = fasttext.load_model(f'fast_avito_fold{fold}.model')

        _it.set_description('predicting')
        val_preds = []
        for text_ in text_val:
            pred = ft_model.predict(text_)
            # pred[0][0] is the predicted label and pred[1][0] its score. If
            # the label ends in '1' the score is used as is, otherwise it is
            # flipped to 1 - score.
            val_preds.append(pred[1][0] if pred[0][0][-1] == '1' else 1-pred[1][0])
        oof_preds_rev[val_idx] = np.array(val_preds)

        # Per-fold area under the precision-recall curve.
        precision, recall, thresholds = precision_recall_curve(y_val, val_preds)
        oof_pr_auc = auc(recall, precision)
        print(f'for {fold}\'th fold {oof_pr_auc=}')

        # No temporary training file is written in this cell, so there is
        # nothing to remove here (the former `os.remove(train_file)` referred
        # to a name this cell never defines and failed on the first fold).

        del text_val, y_val, ft_model, val_preds
        gc.collect()

    _it.close()

    with open('oof_preds_rev.pkl', 'wb') as w:
        pickle.dump(oof_preds_rev, w)

    # PR-AUC over all reverse out-of-fold predictions.
    precision, recall, thresholds = precision_recall_curve(target, oof_preds_rev)
    oof_pr_auc = auc(recall, precision)
    print(f'{oof_pr_auc=}')

In [16]:
def get_batch_predictions(texts, model_paths):
    """Average the positive-class scores of several saved fastText models.

    The models are loaded one after another (each is released before the
    next one is loaded) and every text is scored individually.

    Args:
        texts: a sized iterable of input lines.
        model_paths: paths of the saved fastText models.

    Returns:
        numpy.ndarray: one value per text, the mean over all models of the
        score for the label ending in ``'1'``.
    """
    def _positive_score(prediction):
        # prediction[0][0] is the predicted label, prediction[1][0] its score;
        # the score is flipped when the label does not end in '1'.
        if prediction[0][0][-1] == '1':
            return prediction[1][0]
        return 1 - prediction[1][0]

    scores = np.zeros((len(texts), len(model_paths)))

    for column, model_path in enumerate(tqdm(model_paths)):
        fold_model = fasttext.load_model(model_path)
        for row, line in enumerate(texts):
            scores[row, column] = _positive_score(fold_model.predict(line))
        # Release the model before the next one is loaded.
        del fold_model
        gc.collect()

    return np.mean(scores, axis=1)

In [None]:
# Inference stage: build the fastText input line for every test pair, score
# it with the five saved fold models and pickle the averaged predictions.
if DO_INFER:
    test_df = pd.read_parquet('../data/preprocessed/test_texts.parquet')
    # Fixed row order; the swapped inference cell sorts the same way, so both
    # prediction arrays follow the same pair order.
    test_df = test_df.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)
    
    test_texts = test_df.progress_apply(prepare, axis=1)
    # The tag/emoji cleanup runs on the already tokenized and lemmatised lines.
    test_texts_no_emj = test_texts.progress_apply(remove_html_tags_and_emoji)

    del test_texts, test_df
    gc.collect()

    # Model files written by the training cell, one per fold.
    model_paths = [f'fast_avito_fold{i}.model' for i in range(5)]

    test_preds = get_batch_predictions(test_texts_no_emj, model_paths)
    
    with open('test_preds.pkl', 'wb') as w:
        pickle.dump(test_preds, w)

100%|██████████| 500000/500000 [24:31<00:00, 339.86it/s]
100%|██████████| 500000/500000 [00:25<00:00, 19266.40it/s]
100%|██████████| 5/5 [07:16<00:00, 87.20s/it]


In [None]:
# Swapped inference stage: same as the plain inference cell, but the two
# items of every test pair trade places before the input line is built.
if DO_INFER_REV:
    test_df_rev = pd.read_parquet('../data/preprocessed/test_texts.parquet')
    # Sorting happens before the columns are renamed, so the row order is the
    # same as in the non-swapped inference cell.
    test_df_rev = test_df_rev.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)

    # Unlike the train-time swap, the variant ids are swapped here as well.
    cols_to_swap = [
        ('variantid_1', 'variantid_2'),
        ('name_1', 'name_2'),
        ('description_1', 'description_2'),
        ('category_level_1_1', 'category_level_1_2'),
        ('category_level_2_1', 'category_level_2_2'),
        ('category_level_3_1', 'category_level_3_2'),
        ('category_level_4_1', 'category_level_4_2'),
        ('characteristic_attributes_mapping_1', 'characteristic_attributes_mapping_2')
    ]

    # Map each column name to its counterpart, in both directions.
    rename_map = {}
    for col1, col2 in cols_to_swap:
        rename_map[col1] = col2
        rename_map[col2] = col1

    test_df_rev = test_df_rev.rename(columns=rename_map)
    test_texts_rev = test_df_rev.progress_apply(prepare, axis=1)
    # The tag/emoji cleanup runs on the already tokenized and lemmatised lines.
    test_texts_no_emj_rev = test_texts_rev.progress_apply(remove_html_tags_and_emoji)

    del test_texts_rev, test_df_rev
    gc.collect()

    # The same five fold models as in the non-swapped inference cell.
    model_paths = [f'fast_avito_fold{i}.model' for i in range(5)]

    test_preds_rev = get_batch_predictions(test_texts_no_emj_rev, model_paths)
    
    with open('test_preds_rev.pkl', 'wb') as w:
        pickle.dump(test_preds_rev, w)

100%|██████████| 500000/500000 [21:20<00:00, 390.58it/s]
100%|██████████| 500000/500000 [00:26<00:00, 18572.17it/s]
100%|██████████| 5/5 [07:44<00:00, 93.00s/it]
