In [None]:
import os
import random
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import StratifiedGroupKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import torch
import gc
from tqdm import tqdm

tqdm.pandas()

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed) 
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
seed_everything(42)

In [None]:
train = pd.read_parquet('avito-for-dl-train-rev.parquet')

train = train.sort_values(by=['variantid_1', 'variantid_2'])
train = train.sample(len(train), random_state=42)

In [5]:
target_col = 'is_double'
group_col = 'group_id'
text_col = 'product_row'

In [None]:
sgkf = StratifiedGroupKFold(n_splits=5)

batch_size = 256
oof = np.zeros(len(train))

model_name = 'sergeyzh/rubert-tiny-turbo'
pretrain_pathes = {
    '0': 'name_desc_bert_fold0_epoch1_prauc0.36452.pth',
    '1': 'name_desc_bert_fold1_epoch1_prauc0.30395.pth',
    '2': 'name_desc_bert_fold2_epoch1_prauc0.29470.pth',
    '3': 'name_desc_bert_fold3_epoch1_prauc0.29884.pth',
    '4': 'name_desc_bert_fold4_epoch1_prauc0.40106.pth'
}

max_length = 2048

for ifold, (tr, va) in enumerate(sgkf.split(train, train[target_col], groups=train[group_col])):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).cuda()
    model.load_state_dict(torch.load(pretrain_pathes[str(ifold)]))

    eval_targets = []
    eval_scores = []
    with torch.no_grad():
        for s_idx in tqdm(range(0, len(va), batch_size), desc='validation', leave=False):
            e_idx = min(s_idx + batch_size, len(va))
            batch_idxs = va[s_idx:e_idx]

            if len(batch_idxs) == 0:
                continue

            products = train.iloc[batch_idxs][text_col].tolist()
            targets = train.iloc[batch_idxs][target_col].tolist()

            tks = tokenizer(
                products,
                max_length=max_length,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )

            input_ids = tks['input_ids'].to(model.device)
            attention_mask = tks['attention_mask'].to(model.device)
            
            token_type_ids = tks.get('token_type_ids')
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(model.device)

            logits = model(
                input_ids,
                attention_mask=attention_mask,
                **({'token_type_ids': token_type_ids} if token_type_ids is not None else {})
            ).logits

            scores = logits[:, 1].cpu().numpy()

            eval_targets.extend(targets)
            eval_scores.extend(scores.tolist())

            if len(oof[batch_idxs]) == len(scores):
                 oof[batch_idxs] = scores
            else:
                print('пиздец')

    precision, recall, _ = precision_recall_curve(eval_targets, eval_scores)
    oof_prauc = auc(recall, precision)
    print('fold', ifold, 'pr-auc', round(oof_prauc, 5))

    del model
    gc.collect()

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

2025-05-08 17:10:32.574789: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746724232.738177      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746724232.783508      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(pretrain_pathes[str(ifold)]))
                                                               

fold 0 pr-auc 0.36368


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(pretrain_pathes[str(ifold)]))
                                                               

fold 1 pr-auc 0.30403


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(pretrain_pathes[str(ifold)]))
                                                               

fold 2 pr-auc 0.29433


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(pretrain_pathes[str(ifold)]))
                                                               

fold 3 pr-auc 0.29927


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(pretrain_pathes[str(ifold)]))
                                                               

fold 4 pr-auc 0.40159


In [7]:
train_for_oofs = train[['variantid_1', 'variantid_2', 'is_double']]
train_for_oofs[f'name_desc_bert_oof_rev'] = oof
train_for_oofs.to_parquet(f'name_desc_bert_oof_rev.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_for_oofs[f'name_desc_bert_oof_rev'] = oof
