In [None]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import torch
from tqdm import tqdm

tqdm.pandas()

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed) 
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
seed_everything(42)

In [None]:
train = pd.read_parquet('avito-for-dl-train.parquet')

train = train.sort_values(by=['variantid_1', 'variantid_2'])
train = train.sample(len(train), random_state=42)

In [5]:
target_col = 'is_double'
group_col = 'group_id'
text_col = 'product_row'

In [6]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train[target_col]),
    y=train[target_col]
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).cuda()

In [None]:
sgkf = StratifiedGroupKFold(n_splits=5)

batch_size = 64
oof = np.zeros(len(train))

model_name = 'sergeyzh/rubert-tiny-turbo'
# ozon_pretrain_path = '3epoch_1024_name_desc_bert_full.pth'

epochs = 1
max_length = 2048

for ifold, (tr, va) in enumerate(sgkf.split(train, train[target_col], groups=train[group_col])):
    # if ifold == 1:
    #     break
        
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).cuda()
    # model.load_state_dict(torch.load(ozon_pretrain_path))
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    total_steps = (len(tr) // batch_size + (1 if len(tr) % batch_size != 0 else 0)) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

    for ep in range(epochs):
        model.train()
        train_losses = []
        curr_tr = tr.copy()
        np.random.shuffle(curr_tr)
        pbar = tqdm(range(0, len(curr_tr), batch_size), desc=f'fold {ifold} epoch {ep} loss 0.000', leave=False)
        for s_idx in pbar:
            e_idx = min(s_idx + batch_size, len(curr_tr))
            batch_idxs = curr_tr[s_idx:e_idx]

            products = train.iloc[batch_idxs][text_col].tolist()
            targets = train.iloc[batch_idxs][target_col].tolist()

            tks = tokenizer(
                products,
                max_length=max_length,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )

            input_ids = tks['input_ids'].to(model.device)
            attention_mask = tks['attention_mask'].to(model.device)

            token_type_ids = tks.get('token_type_ids')
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(model.device)

            target_tensor = torch.tensor(targets, dtype=torch.long).to(model.device)

            optimizer.zero_grad() 

            outputs = model(
                input_ids,
                attention_mask=attention_mask,
                **({'token_type_ids': token_type_ids} if token_type_ids is not None else {})
            )

            loss = loss_fn(outputs.logits, target_tensor)

            loss.backward()
            optimizer.step()
            scheduler.step()

            current_loss = loss.item()
            train_losses.append(current_loss)
            pbar.set_description(f'fold {ifold} epoch {ep} loss {current_loss:.4f}')

        avg_train_loss = np.mean(train_losses)
        print(f'avg train loss: {avg_train_loss:.4f}')

        model.eval()
        eval_targets = []
        eval_scores = []
        val_batch_size = batch_size * 2

        with torch.no_grad():
            for s_idx in tqdm(range(0, len(va), val_batch_size), desc='validation', leave=False):
                e_idx = min(s_idx + val_batch_size, len(va))
                batch_idxs = va[s_idx:e_idx]

                if len(batch_idxs) == 0:
                    continue

                products = train.iloc[batch_idxs][text_col].tolist()
                targets = train.iloc[batch_idxs][target_col].tolist()

                tks = tokenizer(
                    products,
                    max_length=max_length,
                    padding=True,
                    truncation=True,
                    return_tensors='pt'
                )

                input_ids = tks['input_ids'].to(model.device)
                attention_mask = tks['attention_mask'].to(model.device)
                
                token_type_ids = tks.get('token_type_ids')
                if token_type_ids is not None:
                    token_type_ids = token_type_ids.to(model.device)

                logits = model(
                    input_ids,
                    attention_mask=attention_mask,
                    **({'token_type_ids': token_type_ids} if token_type_ids is not None else {})
                ).logits

                scores = logits[:, 1].cpu().numpy()

                eval_targets.extend(targets)
                eval_scores.extend(scores.tolist())

                if len(oof[batch_idxs]) == len(scores):
                     oof[batch_idxs] = scores
                else:
                    print('пиздец')

        precision, recall, _ = precision_recall_curve(eval_targets, eval_scores)
        oof_prauc = auc(recall, precision)
        print('fold', ifold, 'epoch', ep, 'pr-auc', round(oof_prauc, 5))

        model_save_path = f'name_desc_bert_fold{ifold}_epoch{ep+1}_prauc{oof_prauc:.5f}.pth'
        torch.save(model.state_dict(), model_save_path)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

2025-05-03 13:30:51.595251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746279051.799105      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746279051.855922      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                   

avg train loss: 0.3787


                                                               

fold 0 epoch 0 pr-auc 0.36452


In [None]:
train_for_oofs = train[['variantid_1', 'variantid_2', 'is_double']]
train_for_oofs[f'name_desc_bert_oof'] = oof
train_for_oofs.to_parquet(f'name_desc_bert_oof.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_for_oofs[f'name_desc_bert_oof{ifold}'] = oof
