In [None]:
import gc
import os
from collections import defaultdict
from pathlib import Path

import sys
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from lightautoml.automl.presets.tabular_presets import (TabularAutoML,
                                                        TabularUtilizedAutoML)
from lightautoml.report.report_deco import ReportDeco
from lightautoml.tasks import Task
from sklearn.metrics import auc, precision_recall_curve
from sklearn.model_selection import (GroupKFold, GroupShuffleSplit,
                                     StratifiedGroupKFold)
from tqdm.notebook import tqdm

In [2]:
ROOT_DIR_PATH = ''

MAIN_PATH = Path(f'{ROOT_DIR_PATH}data/avito-merged-dataset')
ECOM_PRETRAIN = Path(f'{ROOT_DIR_PATH}data/clip-marqofashionsiglip-marqoecom-top2kaggle')
RESNET_PATH = Path(f'{ROOT_DIR_PATH}data/resnet-cossim/')
KAGGLE_TOP5 = Path(f'{ROOT_DIR_PATH}data/top5-kaggle')
BERTA_PATH = Path(f'{ROOT_DIR_PATH}data/berta-pretrained-cossims')
RUBERT_TINY_OOF_PATH = Path(f'{ROOT_DIR_PATH}data/rubert-folds')
RUBERT_TINY_PREDS_PATH = Path(f'{ROOT_DIR_PATH}data/rubert-test-preds')
E5LARGE_OOF_PATH = Path(f'{ROOT_DIR_PATH}data/avito-e5-large-pretrain')
E5LARGE_PREDS_PATH = Path(f'{ROOT_DIR_PATH}data/avito-e5-large-test')
REV_RUBERT_TINY_OOF_PATH = Path(f'{ROOT_DIR_PATH}data/name_desc_bert_oof_rev') 
REV_RUBERT_TINY_PREDS_PATH = Path(f'{ROOT_DIR_PATH}data/name-desc-bert-preds-rev')
USERBGE_COSSIMS_PATH = Path(f'{ROOT_DIR_PATH}data/userbge-cossims') 
RUBERT_BASE_TEST_PREDS_PATH = Path(f'{ROOT_DIR_PATH}data/rubert-fixed-test-preds') 
RUBERT_BASE_TEST_PREDS_REV_PATH = Path(f'{ROOT_DIR_PATH}data/rubert-fixed-test-preds-rev')
FT_PREDS_PATH = Path(f'{ROOT_DIR_PATH}data/ft-preds2')
RUBERT_BASE_OOF_PATH = Path(f'{ROOT_DIR_PATH}data/trained-rubert-base-preds')
RUBERT_BASE_OOF_REV_PATH = Path(f'{ROOT_DIR_PATH}data/trained-rubert-base-preds-rev')
ROUGE_PATH = Path(f'{ROOT_DIR_PATH}data/rouge-avito')

USE_MEAN_BASE_AND_REV = False

In [3]:
assert os.path.exists(MAIN_PATH)
assert os.path.exists(ECOM_PRETRAIN)
assert os.path.exists(RESNET_PATH)
assert os.path.exists(KAGGLE_TOP5)
assert os.path.exists(BERTA_PATH)
assert os.path.exists(RUBERT_TINY_OOF_PATH)
assert os.path.exists(RUBERT_TINY_PREDS_PATH)
assert os.path.exists(E5LARGE_OOF_PATH)
assert os.path.exists(E5LARGE_PREDS_PATH)
assert os.path.exists(REV_RUBERT_TINY_OOF_PATH)
assert os.path.exists(REV_RUBERT_TINY_PREDS_PATH)
assert os.path.exists(USERBGE_COSSIMS_PATH)
assert os.path.exists(RUBERT_BASE_TEST_PREDS_PATH)
assert os.path.exists(RUBERT_BASE_TEST_PREDS_REV_PATH)
assert os.path.exists(FT_PREDS_PATH)
assert os.path.exists(RUBERT_BASE_OOF_PATH)
assert os.path.exists(RUBERT_BASE_OOF_REV_PATH)

In [4]:
cat_features = [
    'is_same_location',
    'is_same_region',
    'category_level_1_match',
    'category_level_2_match',
    'category_level_3_match',
    'category_level_4_match',
    'category_level_3_fillness',
    'category_level_4_fillness',
    'n_images_fillness',
    'unique_cat_1',
    'unique_cat_2',
    'unique_cat_3',
    'unique_cat_4',
]

In [5]:
train = pd.read_parquet(MAIN_PATH / 'train_df.parquet')
test = pd.read_parquet(MAIN_PATH / 'test_df.parquet')

In [6]:
to_drop = [
    'category_level_1_1', 'category_level_1_2',
    'category_level_2_1', 'category_level_2_2',
    'category_level_3_1', 'category_level_3_2',
    'category_level_4_1', 'category_level_4_2',
]

train[to_drop] = train[to_drop].fillna('none')

train['unique_cat_1'] = train['category_level_1_1'] + '_' + train['category_level_1_2']
train['unique_cat_2'] = train['category_level_2_1'] + '_' + train['category_level_2_2']
train['unique_cat_3'] = train['category_level_3_1'] + '_' + train['category_level_3_2']
train['unique_cat_4'] = train['category_level_4_1'] + '_' + train['category_level_4_2']

test[to_drop] = test[to_drop].fillna('none')

test['unique_cat_1'] = test['category_level_1_1'] + '_' + test['category_level_1_2']
test['unique_cat_2'] = test['category_level_2_1'] + '_' + test['category_level_2_2']
test['unique_cat_3'] = test['category_level_3_1'] + '_' + test['category_level_3_2']
test['unique_cat_4'] = test['category_level_4_1'] + '_' + test['category_level_4_2']

train.drop(columns=to_drop, axis=1, inplace=True)
gc.collect()

test.drop(columns=to_drop, axis=1, inplace=True)
gc.collect()

0

In [7]:
numerical_features = train.select_dtypes(
    include=['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64']
).columns.to_list()

cols_with_inf = train[numerical_features].columns[np.isinf(train[numerical_features]).any()].tolist()

print("Колонки с inf:", cols_with_inf)

train.replace([np.inf, -np.inf], np.nan, inplace=True)
test.replace([np.inf, -np.inf], np.nan, inplace=True)

Колонки с inf: ['name_tanimoto', 'name_norm_tanimoto', 'name_en_tanimoto', 'name_mix_tanimoto', 'description_en_tanimoto', 'description_mix_tanimoto', 'name_tokens_w_digits_tanimoto', 'description_tokens_w_digits_tanimoto']


In [8]:
train[cat_features] = train[cat_features].astype(str)
test[cat_features] = test[cat_features].astype(str)

In [9]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                    gc.collect()
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                    gc.collect()
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                    gc.collect()
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
                    gc.collect()
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                    gc.collect()
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                    gc.collect()
                else:
                    df[col] = df[col].astype(np.float64)
                    gc.collect()
        else:
            if df[col].nunique() == 2:
                df[col] = df[col].astype('bool')
            gc.collect()

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    gc.collect()
    
    return df

In [10]:
train = reduce_mem_usage(train)

Memory usage of dataframe is 6546.15 MB
Memory usage after optimization is: 1803.24 MB
Decreased by 72.5%


In [11]:
test = reduce_mem_usage(test)

Memory usage of dataframe is 1741.41 MB
Memory usage after optimization is: 472.55 MB
Decreased by 72.9%


In [12]:
train = train.sort_values(by=['variantid_1', 'variantid_2'])
test = test.sort_values(by=['variantid_1', 'variantid_2'])

In [13]:
# --- IMG FEATURES ---

# pretrain clip
train_clip = pd.read_parquet(ECOM_PRETRAIN / 'cossim_final_embeddings_train_CLIP.parquet')
test_clip = pd.read_parquet(ECOM_PRETRAIN / 'cossim_final_embeddings_test_CLIP.parquet')

train_clip = train_clip.sort_values(by=['variantid_1', 'variantid_2'])
test_clip = test_clip.sort_values(by=['variantid_1', 'variantid_2'])
train['clip_cosine_sim'] = train_clip['cosine_sim']
test['clip_cosine_sim'] = test_clip['cosine_sim']
del train_clip, test_clip

# pretrain fashion siglip
train_fashionsiglip = pd.read_parquet(ECOM_PRETRAIN / 'cossim_final_embeddings_fashion_clip_train.parquet')
test_fashionsiglip = pd.read_parquet(ECOM_PRETRAIN / 'cossim_final_embeddings_fashion_clip_test.parquet')

train_fashionsiglip = train_fashionsiglip.sort_values(by=['variantid_1', 'variantid_2'])
test_fashionsiglip = test_fashionsiglip.sort_values(by=['variantid_1', 'variantid_2'])
train['fashionsiglip_cosine_sim'] = train_fashionsiglip['cosine_sim']
test['fashionsiglip_cosine_sim'] = test_fashionsiglip['cosine_sim']
del train_fashionsiglip, test_fashionsiglip

# pretrain marqo ecom
train_ecom = pd.read_parquet(ECOM_PRETRAIN / 'cossim_final_embeddings_ecomm_train.parquet')
test_ecom = pd.read_parquet(ECOM_PRETRAIN / 'cossim_final_embeddings_ecomm_test.parquet')

train_ecom = train_ecom.sort_values(by=['variantid_1', 'variantid_2'])
test_ecom = test_ecom.sort_values(by=['variantid_1', 'variantid_2'])
train['ecom_cosine_sim'] = train_ecom['cosine_sim']
test['ecom_cosine_sim'] = test_ecom['cosine_sim']
del train_ecom, test_ecom

# kaggle top2 model, bugged
# train_top2kaggle = pd.read_parquet(ECOM_PRETRAIN + 'cossim_final_top2_kaggle_train.parquet')
# test_top2kaggle = pd.read_parquet(ECOM_PRETRAIN + 'cossim_final_top2_kaggle_test.parquet')

# train_top2kaggle = train_top2kaggle.sort_values(by=['variantid_1', 'variantid_2'])
# test_top2kaggle = test_top2kaggle.sort_values(by=['variantid_1', 'variantid_2'])
# train['top2kaggle_cosine_sim'] = train_top2kaggle['cosine_sim']
# test['top2kaggle_cosine_sim'] = test_top2kaggle['cosine_sim']
# del train_top2kaggle, test_top2kaggle

# kaggle top5 model
train_top5kaggle = pd.read_parquet(KAGGLE_TOP5 / 'cossim_final_concat_train.parquet')
test_top5kaggle = pd.read_parquet(KAGGLE_TOP5 / 'cossim_final_concat_test.parquet')

train_top5kaggle = train_top5kaggle.sort_values(by=['variantid_1', 'variantid_2'])
test_top5kaggle = test_top5kaggle.sort_values(by=['variantid_1', 'variantid_2'])
train['top5kaggle_cosine_sim'] = train_top5kaggle['cosine_sim']
test['top5kaggle_cosine_sim'] = test_top5kaggle['cosine_sim']
del train_top5kaggle, test_top5kaggle

# trained resnet
train_resnet = pd.read_parquet(RESNET_PATH / 'train_resnet_cossim.parquet')
test_resnet = pd.read_parquet(RESNET_PATH / 'test_resnet_cossim.parquet')

train_resnet = train_resnet.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)
test_resnet = test_resnet.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)
train['resnet_cosine_sim'] = train_resnet['cossims_resnet']
test['resnet_cosine_sim'] = test_resnet['cossims_resnet']
del train_resnet, test_resnet

gc.collect()

# --- TEXT FEATURES ---

# pretrain berta
berta_cossims_train_part1 = pd.read_parquet(BERTA_PATH / 'berta_cossims_train_part1.parquet')
berta_cossims_train_part2 = pd.read_parquet(BERTA_PATH / 'berta_cossims_train_part2.parquet')
berta_cossims_train = pd.concat([berta_cossims_train_part1, berta_cossims_train_part2])
berta_cossims_test = pd.read_parquet(BERTA_PATH / 'berta_cossims_test.parquet')

berta_cossims_train = berta_cossims_train.sort_values(by=['variantid_1', 'variantid_2'])
berta_cossims_test = berta_cossims_test.sort_values(by=['variantid_1', 'variantid_2'])
train['berta_cossim'] = berta_cossims_train['berta_cossim']
test['berta_cossim'] = berta_cossims_test['berta_cossim']

del berta_cossims_train_part1, berta_cossims_train_part2, berta_cossims_train, berta_cossims_test
gc.collect()

# trained rubert (cherez zhopu rukamu obuchen)
rubert_oof_fold0 = pd.read_parquet(RUBERT_TINY_OOF_PATH / 'name_desc_bert_fold0.parquet')
rubert_oof_fold1 = pd.read_parquet(RUBERT_TINY_OOF_PATH / 'name_desc_bert_fold1.parquet')
rubert_oof_fold2 = pd.read_parquet(RUBERT_TINY_OOF_PATH / 'name_desc_bert_fold2.parquet')
rubert_oof_fold3 = pd.read_parquet(RUBERT_TINY_OOF_PATH / 'name_desc_bert_fold3.parquet')
rubert_oof_fold4 = pd.read_parquet(RUBERT_TINY_OOF_PATH / 'name_desc_bert_fold4.parquet')

rubert_test_pred_fold0 = pd.read_parquet(RUBERT_TINY_PREDS_PATH / 'name_desc_rubert_tiny_turbo_2048_wce_0.parquet')
rubert_test_pred_fold1 = pd.read_parquet(RUBERT_TINY_PREDS_PATH / 'name_desc_rubert_tiny_turbo_2048_wce_1.parquet')
rubert_test_pred_fold2 = pd.read_parquet(RUBERT_TINY_PREDS_PATH / 'name_desc_rubert_tiny_turbo_2048_wce_2.parquet')
rubert_test_pred_fold3 = pd.read_parquet(RUBERT_TINY_PREDS_PATH / 'name_desc_rubert_tiny_turbo_2048_wce_3.parquet')
rubert_test_pred_fold4 = pd.read_parquet(RUBERT_TINY_PREDS_PATH / 'name_desc_rubert_tiny_turbo_2048_wce_4.parquet')

rubert_oof_fold0.rename(columns={'name_desc_bert_oof1': 'name_desc_rubert_tiny_turbo_2048_wce'}, inplace=True)
rubert_oof_fold1.rename(columns={'name_desc_bert_oof4': 'name_desc_rubert_tiny_turbo_2048_wce'}, inplace=True)
rubert_oof_fold2.rename(columns={'name_desc_bert_oof4': 'name_desc_rubert_tiny_turbo_2048_wce'}, inplace=True)
rubert_oof_fold3.rename(columns={'name_desc_bert_oof4': 'name_desc_rubert_tiny_turbo_2048_wce'}, inplace=True)
rubert_oof_fold4.rename(columns={'name_desc_bert_oof4': 'name_desc_rubert_tiny_turbo_2048_wce'}, inplace=True)

rubert_oof_fold0 = rubert_oof_fold0.sort_values(by=['variantid_1', 'variantid_2'])
rubert_oof_fold1 = rubert_oof_fold1.sort_values(by=['variantid_1', 'variantid_2'])
rubert_oof_fold2 = rubert_oof_fold2.sort_values(by=['variantid_1', 'variantid_2'])
rubert_oof_fold3 = rubert_oof_fold3.sort_values(by=['variantid_1', 'variantid_2'])
rubert_oof_fold4 = rubert_oof_fold4.sort_values(by=['variantid_1', 'variantid_2'])

rubert_oof = rubert_oof_fold0['name_desc_rubert_tiny_turbo_2048_wce'] + \
    rubert_oof_fold1['name_desc_rubert_tiny_turbo_2048_wce'] + \
    rubert_oof_fold2['name_desc_rubert_tiny_turbo_2048_wce'] + \
    rubert_oof_fold3['name_desc_rubert_tiny_turbo_2048_wce'] + \
    rubert_oof_fold4['name_desc_rubert_tiny_turbo_2048_wce']

rubert_test_pred_fold0 = rubert_test_pred_fold0.sort_values(by=['variantid_1', 'variantid_2'])
rubert_test_pred_fold1 = rubert_test_pred_fold1.sort_values(by=['variantid_1', 'variantid_2'])
rubert_test_pred_fold2 = rubert_test_pred_fold2.sort_values(by=['variantid_1', 'variantid_2'])
rubert_test_pred_fold3 = rubert_test_pred_fold3.sort_values(by=['variantid_1', 'variantid_2'])
rubert_test_pred_fold4 = rubert_test_pred_fold4.sort_values(by=['variantid_1', 'variantid_2'])

rubert_preds = (
    rubert_test_pred_fold0['name_desc_rubert_tiny_turbo_2048_wce_0'] + 
    rubert_test_pred_fold1['name_desc_rubert_tiny_turbo_2048_wce_1'] + 
    rubert_test_pred_fold2['name_desc_rubert_tiny_turbo_2048_wce_2'] + 
    rubert_test_pred_fold3['name_desc_rubert_tiny_turbo_2048_wce_3'] + 
    rubert_test_pred_fold4['name_desc_rubert_tiny_turbo_2048_wce_4']
    ) / 5

train['name_desc_rubert_tiny_turbo_2048_wce'] = rubert_oof
test['name_desc_rubert_tiny_turbo_2048_wce'] = rubert_preds

del rubert_oof_fold0, rubert_oof_fold1, rubert_oof_fold2, rubert_oof_fold3, rubert_oof_fold4
del rubert_oof
del rubert_test_pred_fold0, rubert_test_pred_fold1, rubert_test_pred_fold2, rubert_test_pred_fold3, rubert_test_pred_fold4
del rubert_preds
gc.collect()

# pretrain e5large
e5large_cossims_train_part1 = pd.read_parquet(E5LARGE_OOF_PATH / 'e5large_cossims_fold0.parquet')
e5large_cossims_train_part2 = pd.read_parquet(E5LARGE_OOF_PATH / 'e5large_cossims_fold1.parquet')
e5large_cossims_train_part3 = pd.read_parquet(E5LARGE_OOF_PATH / 'e5large_cossims_fold2.parquet')
e5large_cossims_train_part4 = pd.read_parquet(E5LARGE_OOF_PATH / 'e5large_cossims_fold3.parquet')
e5large_cossims_train_part5 = pd.read_parquet(E5LARGE_OOF_PATH / 'e5large_cossims_fold4.parquet')

e5large_cossims_train = pd.concat([
    e5large_cossims_train_part1,
    e5large_cossims_train_part2,
    e5large_cossims_train_part3,
    e5large_cossims_train_part4,
    e5large_cossims_train_part5
])
e5large_cossims_train = e5large_cossims_train.sort_values(by=['variantid_1', 'variantid_2'])

del e5large_cossims_train_part1, e5large_cossims_train_part2, e5large_cossims_train_part3, e5large_cossims_train_part4, e5large_cossims_train_part5
gc.collect()

e5large_cossims_test_part1 = pd.read_parquet(E5LARGE_PREDS_PATH / 'e5large_cossims_part1.parquet')
e5large_cossims_test_part2 = pd.read_parquet(E5LARGE_PREDS_PATH / 'e5large_cossims_part2.parquet')

e5large_cossims_test = pd.concat([
    e5large_cossims_test_part1,
    e5large_cossims_test_part2,
])
e5large_cossims_test = e5large_cossims_test.sort_values(by=['variantid_1', 'variantid_2'])

del e5large_cossims_test_part1, e5large_cossims_test_part2
gc.collect()

train['e5large_cossim'] = e5large_cossims_train['e5large_cossim']
test['e5large_cossim'] = e5large_cossims_test['e5large_cossim']

del e5large_cossims_train, e5large_cossims_test
gc.collect()

# rev trained rubert (cherez zhopu rukamu obuchen)
rubert_oof_rev = pd.read_parquet(REV_RUBERT_TINY_OOF_PATH / 'name_desc_bert_oof_rev.parquet')
rubert_preds_rev = pd.read_parquet(REV_RUBERT_TINY_PREDS_PATH / 'name_desc_bert_preds_rev.parquet')

rubert_oof_rev = rubert_oof_rev.sort_values(by=['variantid_1', 'variantid_2'])
rubert_preds_rev = rubert_preds_rev.sort_values(by=['variantid_1', 'variantid_2'])
train['name_desc_rubert_tiny_turbo_2048_wce_rev'] = rubert_oof_rev['name_desc_bert_oof_rev']
test['name_desc_rubert_tiny_turbo_2048_wce_rev'] = rubert_preds_rev['name_desc_bert_preds_rev']
del rubert_oof_rev, rubert_preds_rev

gc.collect()

# rubert tta
if USE_MEAN_BASE_AND_REV:
    train['name_desc_rubert_tiny_turbo_2048_wce_tta'] = (train['name_desc_rubert_tiny_turbo_2048_wce'] / train['name_desc_rubert_tiny_turbo_2048_wce_rev']) / 2
    test['name_desc_rubert_tiny_turbo_2048_wce_tta'] = (test['name_desc_rubert_tiny_turbo_2048_wce'] / test['name_desc_rubert_tiny_turbo_2048_wce_rev']) / 2
    del train['name_desc_rubert_tiny_turbo_2048_wce'], train['name_desc_rubert_tiny_turbo_2048_wce_rev']
    del test['name_desc_rubert_tiny_turbo_2048_wce'], test['name_desc_rubert_tiny_turbo_2048_wce_rev']
    gc.collect()

# pretrain userbge
userbge_cossims_train = pd.DataFrame()
userbge_cossims_test = pd.DataFrame()

for i in range(1, 16):
    curr_df = pd.read_parquet(USERBGE_COSSIMS_PATH / f'userbge_cossims_train_part{i}.parquet')
    userbge_cossims_train = pd.concat([userbge_cossims_train, curr_df])
    print(f'{userbge_cossims_train.shape=}')
userbge_cossims_train = userbge_cossims_train.sort_values(by=['variantid_1', 'variantid_2'])

for i in range(1, 5):
    curr_df = pd.read_parquet(USERBGE_COSSIMS_PATH / f'userbge_cossims_test_part{i}.parquet')
    userbge_cossims_test = pd.concat([userbge_cossims_test, curr_df])
    print(f'{userbge_cossims_test.shape=}')
userbge_cossims_test = userbge_cossims_test.sort_values(by=['variantid_1', 'variantid_2'])

del curr_df
gc.collect()

train['userbge_cossim'] = userbge_cossims_train['userbge_cossim']
test['userbge_cossim'] = userbge_cossims_test['userbge_cossim']

del userbge_cossims_train, userbge_cossims_test
gc.collect()

# --- тут именно до сэмпла! ---

# trained rubert-base test preds
rubert_base_pred = pd.read_csv(
    RUBERT_BASE_TEST_PREDS_PATH / 'rubert_ZAEBAL_SUKA.csv'
).rename(columns={'base_id': 'variantid_1', 'cand_id': 'variantid_2'})
rubert_base_pred = rubert_base_pred.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)
test['rubert_base_trained'] = rubert_base_pred['probability']

rubert_base_pred_rev = pd.read_csv(
    RUBERT_BASE_TEST_PREDS_REV_PATH / 'rubert_ZAEBAL_SUKA_REV.csv'
).rename(columns={'base_id': 'variantid_1', 'cand_id': 'variantid_2'})
rubert_base_pred_rev = rubert_base_pred_rev.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)
test['rubert_base_trained_rev'] = rubert_base_pred_rev['probability']

del rubert_base_pred, rubert_base_pred_rev
gc.collect()

# rubert base test tta
if USE_MEAN_BASE_AND_REV:
    test['rubert_base_trained_tta'] = (test['rubert_base_trained'] + test['rubert_base_trained_rev']) / 2
    del test['rubert_base_trained'],  test['rubert_base_trained_rev']
    gc.collect()

# --- sample!!! ---
train = train.sample(len(train), random_state=42)

# trained ft (именно после сэмпла! тут уже все карты в нужном порядке разложены)
fasttext_train = joblib.load(FT_PREDS_PATH / 'oof_preds.pkl')
fasttext_test = joblib.load(FT_PREDS_PATH / 'test_preds.pkl')
fasttext_train_rev = joblib.load(FT_PREDS_PATH / 'oof_preds_rev.pkl')
fasttext_test_rev = joblib.load(FT_PREDS_PATH / 'test_preds_rev.pkl')

train['fasttext'] = fasttext_train
test['fasttext'] = fasttext_test
train['fasttext_rev'] = fasttext_train_rev
test['fasttext_rev'] = fasttext_test_rev

del fasttext_train, fasttext_test, fasttext_train_rev, fasttext_test_rev
gc.collect()

# fasttext tta
if USE_MEAN_BASE_AND_REV:
    train['fasttext_tta'] = (train['fasttext'] + train['fasttext_rev']) / 2
    test['fasttext_tta'] = (test['fasttext'] + test['fasttext_rev']) / 2
    del train['fasttext'], train['fasttext_rev']
    del test['fasttext'], test['fasttext_rev']
    gc.collect()

# trained rubert-base train (именно после сэмпла! тут уже все карты в нужном порядке разложены)
rubert_base_oof = joblib.load(RUBERT_BASE_OOF_PATH / 'oof_preds_rubert_base.joblib')
train['rubert_base_trained'] = rubert_base_oof

rubert_base_oof_rev = joblib.load(RUBERT_BASE_OOF_REV_PATH / 'oof_preds_rubert_base_rev.joblib')
train['rubert_base_trained_rev'] = rubert_base_oof_rev

del rubert_base_oof, rubert_base_oof_rev
gc.collect()

# rubert train test tta
if USE_MEAN_BASE_AND_REV:
    train['rubert_base_trained_tta'] = (train['rubert_base_trained'] + train['rubert_base_trained_rev']) / 2
    del train['rubert_base_trained'],  train['rubert_base_trained_rev']
    gc.collect()

# add rouge features (w/ tta)

train_rouge = pd.read_csv(ROUGE_PATH / 'train_rouge.csv')
test_rouge = pd.read_csv(ROUGE_PATH / 'test_rouge.csv')

train_rouge = train_rouge.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)
test_rouge = test_rouge.sort_values(by=['variantid_1', 'variantid_2']).reset_index(drop=True)

train['rouge_1'] = train_rouge['rouge_1']
train['rouge_2'] = train_rouge['rouge_2']
train['rouge_3'] = train_rouge['rouge_3']
train['rouge_4'] = train_rouge['rouge_4']
train['rouge_s4'] = train_rouge['rouge_s4']
train['rouge_su4'] = train_rouge['rouge_su4']

test['rouge_1'] = test_rouge['rouge_1']
test['rouge_2'] = test_rouge['rouge_2']
test['rouge_3'] = test_rouge['rouge_3']
test['rouge_4'] = test_rouge['rouge_4']
test['rouge_s4'] = test_rouge['rouge_s4']
test['rouge_su4'] = test_rouge['rouge_su4']

userbge_cossims_train.shape=(125303, 3)
userbge_cossims_train.shape=(250606, 3)
userbge_cossims_train.shape=(375909, 3)
userbge_cossims_train.shape=(501212, 3)
userbge_cossims_train.shape=(626515, 3)
userbge_cossims_train.shape=(751818, 3)
userbge_cossims_train.shape=(877121, 3)
userbge_cossims_train.shape=(1002424, 3)
userbge_cossims_train.shape=(1127727, 3)
userbge_cossims_train.shape=(1253030, 3)
userbge_cossims_train.shape=(1378333, 3)
userbge_cossims_train.shape=(1503636, 3)
userbge_cossims_train.shape=(1628939, 3)
userbge_cossims_train.shape=(1754242, 3)
userbge_cossims_train.shape=(1879555, 3)
userbge_cossims_test.shape=(125000, 3)
userbge_cossims_test.shape=(250000, 3)
userbge_cossims_test.shape=(375000, 3)
userbge_cossims_test.shape=(500000, 3)


In [14]:
train.drop(columns=['variantid_1', 'variantid_2', 'base_title_image', 'cand_title_image'], axis=1, inplace=True)
test.drop(columns=['variantid_1', 'variantid_2', 'base_title_image', 'cand_title_image'], axis=1, inplace=True)
train.drop(columns=['action_date'], inplace=True)

In [15]:
features = [col for col in train.columns if col not in ['group_id', 'is_double']]
target = 'is_double'

In [16]:
def pr_auc(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [17]:
def prauc_metric(y_true, y_pred, sample_weight, **kwargs):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [None]:
N_THREADS = 6
N_FOLDS = 10
RANDOM_STATE = 42
TIMEOUT = 4 * 24 * 3600
TARGET_NAME = 'is_double'
GROUP = 'group_id'

MEMORY_LIMIT = 32

In [19]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [20]:
task = Task('binary', metric=prauc_metric, greater_is_better=True)

In [21]:
roles = {
    'target': TARGET_NAME,
    'drop': [GROUP],
    'category': cat_features,
    'group': [GROUP]
}

In [22]:
automl = TabularUtilizedAutoML(
    task=task, 
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    memory_limit=MEMORY_LIMIT,
    tuning_params={'max_tuning_time': 3600 * 8}, # 8ч на тюн каждой залупы
    reader_params = {
        'n_jobs': N_THREADS, 
        'cv': N_FOLDS, 
        'random_state': RANDOM_STATE
    },
    selection_params={'mode': 0},
    general_params = {
        'use_algos': [[
            'lgb', 'lgb_tuned', # таргет энкодинг не надо! все и так ок
            'cb', 'cb_tuned', 
            'xgb', 'xgb_tuned'
    ]]
    },
    gpu_ids='0'
)

In [None]:
%%time 
oof_pred = automl.fit_predict(train, roles=roles, verbose=3, log_file='automl_train.log')

In [24]:
print(f'OOF score: {pr_auc(train[TARGET_NAME].values, oof_pred.data[:, 0])}')

OOF score: 0.6441786282375546


In [26]:
test_pred = automl.predict(test)

In [27]:
joblib.dump(test_pred, 'test_pred.joblib')

['test_pred.joblib']

In [25]:
print(automl.create_model_str_desc())

Final prediction for new objects = 
	0.35189 * 1 averaged models with config = "C:\Program Files\Python311\Lib\site-packages\lightautoml\automl\presets\tabular_configs\conf_0_sel_type_0.yml" and different CV random_states. Their structures: 

	    Model #0.
		Final prediction for new objects (level 0) = 
			 0.06644 * (10 averaged models Lvl_0_Pipe_0_Mod_0_LightGBM) +
			 0.26423 * (10 averaged models Lvl_0_Pipe_0_Mod_1_Tuned_LightGBM) +
			 0.29731 * (10 averaged models Lvl_0_Pipe_0_Mod_3_Tuned_CatBoost) +
			 0.06856 * (10 averaged models Lvl_0_Pipe_0_Mod_4_XGBoost) +
			 0.30345 * (10 averaged models Lvl_0_Pipe_0_Mod_5_Tuned_XGBoost) 


	+ 0.22403 * 1 averaged models with config = "C:\Program Files\Python311\Lib\site-packages\lightautoml\automl\presets\tabular_configs\conf_1_sel_type_1.yml" and different CV random_states. Their structures: 

	    Model #0.
		Final prediction for new objects (level 0) = 
			 0.06293 * (10 averaged models Lvl_0_Pipe_0_Mod_0_LightGBM) +
			 0.20055 * (

In [28]:
joblib.dump(automl, 'automl_27052025_0.6441786282375546.joblib')

['automl_27052025_0.6441786282375546.joblib']