In [None]:
import os
import random
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import torch
import gc
from tqdm import tqdm

# Register tqdm's pandas integration (adds `progress_apply` & co. to pandas objects).
# NOTE(review): none of the cells visible here call `progress_apply` — confirm this is still needed.
tqdm.pandas()

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only inherited by child processes: hash randomisation of the interpreter
    # that is already running is fixed at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than just the current one; this is a
    # silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
# Fix all RNG seeds (random / NumPy / PyTorch) before any model is instantiated.
seed_everything(42)

In [None]:
# Load the test set from a parquet file given by a relative path (resolved against the
# current working directory). Later cells read the columns 'product_row',
# 'variantid_1' and 'variantid_2' from this frame.
test = pd.read_parquet('avito-for-dl-test-rev.parquet')

In [5]:
# Column-name constants.
# NOTE(review): `group_col` is not referenced by any cell visible here — presumably a
# leftover from the training notebook; confirm before removing.
group_col = 'group_id'
# Name of the column in `test` whose text is tokenized and fed to the model.
text_col = 'product_row'

In [None]:
# Fold-ensemble inference: run every fine-tuned fold checkpoint over the whole test
# set and accumulate the class-1 logit of each row into `preds`.
batch_size = 256
max_length = 2048  # tokenizer truncation limit, in tokens

model_name = 'sergeyzh/rubert-tiny-turbo'
# Fold index (as a string) -> fine-tuned checkpoint file, relative to the working directory.
pretrain_pathes = {
    '0': 'name_desc_bert_fold0_epoch1_prauc0.36452.pth',
    '1': 'name_desc_bert_fold1_epoch1_prauc0.30395.pth',
    '2': 'name_desc_bert_fold2_epoch1_prauc0.29470.pth',
    '3': 'name_desc_bert_fold3_epoch1_prauc0.29884.pth',
    '4': 'name_desc_bert_fold4_epoch1_prauc0.40106.pth'
}

# Running SUM over folds of the class-1 logit per test row; the cell that saves the
# predictions divides it by the number of folds to obtain the ensemble mean.
preds = np.zeros(len(test))

# The tokenizer is identical for every fold, so it is loaded once instead of per fold.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Derive the fold count from the checkpoint table so the two cannot drift apart.
for i in range(len(pretrain_pathes)):
    # The classification head created here is randomly initialised (hence the
    # "newly initialized" warning) and is overwritten by the checkpoint right below.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # map_location='cpu' makes loading independent of the device the checkpoint was
    # saved from; weights_only=True restricts unpickling to tensors/containers and
    # silences the torch.load FutureWarning.
    # NOTE(review): assumes each .pth file holds a plain state_dict — confirm.
    state_dict = torch.load(pretrain_pathes[str(i)], map_location='cpu', weights_only=True)
    model.load_state_dict(state_dict)
    del state_dict

    # Explicit eval() so that disabling dropout does not depend on a library default.
    model = model.cuda().eval()

    with torch.no_grad():
        for s_idx in tqdm(range(0, len(test), batch_size), desc='validation', leave=False):
            e_idx = min(s_idx + batch_size, len(test))

            products = test.iloc[s_idx:e_idx][text_col].tolist()

            # Pad to the longest sequence in the batch, truncate at `max_length`.
            tks = tokenizer(
                products,
                max_length=max_length,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )

            # Forward only the inputs the tokenizer actually produced
            # (token_type_ids is optional), moved to the model's device.
            model_inputs = {
                key: tks[key].to(model.device)
                for key in ('input_ids', 'attention_mask', 'token_type_ids')
                if key in tks
            }

            logits = model(**model_inputs).logits

            # Column 1 is the raw (pre-softmax) logit of class 1.
            scores = logits[:, 1].cpu().numpy()
            preds[s_idx:e_idx] += scores

    # Drop the fold's model and hand its cached CUDA blocks back to the driver
    # before the next fold is loaded.
    del model
    gc.collect()
    torch.cuda.empty_cache()

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

2025-05-08 17:20:31.944046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746724832.111761      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746724832.162463      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(pretrain_pathes[str(i)]))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceCl

In [7]:
# Build the submission frame: the pair identifiers plus the fold-averaged class-1 logit.
# `.assign` returns a new DataFrame, so the column is not written onto a slice of
# `test` (which is what triggers pandas' SettingWithCopyWarning).
# `preds` holds the sum over folds; divide by the number of checkpoints to get the mean.
test_for_preds = test[['variantid_1', 'variantid_2']].assign(
    name_desc_bert_preds_rev=preds / len(pretrain_pathes)
)
test_for_preds.to_parquet('name_desc_bert_preds_rev.parquet')