In [1]:
import os
import pandas as pd

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import precision_recall_curve, auc

from dataclasses import dataclass
from datasets import Dataset

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

2025-05-14 12:10:36.307533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747224636.492344      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747224636.546735      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import numpy as np
import _codecs

# Allow-list the NumPy / codecs globals for torch's restricted ("weights_only")
# unpickler so that pickled objects referencing them can be loaded.
# NOTE(review): presumably needed to restore pickled RNG/trainer state when
# resuming from the checkpoint passed to `trainer.train(...)` below — confirm.
torch.serialization.add_safe_globals([
    np.core.multiarray.scalar, 
    np.dtype, 
    np.dtypes.Float64DType, 
    np.dtypes.UInt32DType,
    np.core.multiarray._reconstruct,
    np.ndarray,
    _codecs.encode, 
])

In [None]:
# Run identifiers: VER tags the output directory, N_FOLD picks the CV fold.
VER = 1
N_FOLD = 0

# Weights & Biases project, notes and run name all share one fold-specific tag.
os.environ.update({
    'WANDB_PROJECT': f'fixed-clf-bert-fold{N_FOLD}',
    'WANDB_NOTES': f'fixed-clf-bert-fold{N_FOLD}',
    'WANDB_NAME': f'fixed-clf-bert-fold{N_FOLD}',
})

In [None]:
# Only the pair ids, the group id and the target are needed to build the split.
df = pd.read_parquet(
    'train_texts.parquet',
    columns=['variantid_1', 'variantid_2', 'group_id', 'is_double'],
)

# Sort by the pair ids first, then shuffle all rows with a fixed seed.
df = df.sort_values(by=['variantid_1', 'variantid_2'])
df = df.sample(len(df), random_state=42)

sgkf = StratifiedGroupKFold(n_splits=5)

# Maps the fold number (as a string) to the train / validation row indices
# produced by the splitter: stratified on `is_double`, grouped by `group_id`.
# NOTE(review): these are positions within the shuffled `df`; they are later
# applied to `dataset`, which is read from a different parquet file. That only
# works if both files share the same row order — confirm.
fold_mapping = {
    str(fold): {'train_idxs': train_idx, 'val_idxs': val_idx}
    for fold, (train_idx, val_idx) in enumerate(
        sgkf.split(df, df['is_double'], groups=df['group_id'])
    )
}

In [None]:
# Load the modelling table and expose the target column under the name `label`.
dataset = (
    Dataset
    .from_parquet('avito-for-dl-train-rev.parquet')
    .rename_column('is_double', 'label')
)

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Materialise the train / validation subsets for the selected fold.
train_dataset, eval_dataset = (
    dataset.select(fold_mapping[str(N_FOLD)][split_key].tolist())
    for split_key in ('train_idxs', 'val_idxs')
)

In [7]:
# Sanity check: number of rows in the train and validation subsets.
len(train_dataset), len(eval_dataset)

(1503646, 375909)

In [8]:
# Inspect one training row to see the fields that `collate_fn` will read.
train_dataset[0]

{'variantid_1': '0000102c4b265b3c6346920adc5970d50d4a9ec1ad27096ed6a9879aed919ac4',
 'variantid_2': '1a395516c4f953404b82c568803bc98c880053731cbcfa6855678776f177a91c',
 'product_row': 'NAME1: Подсвечник давленный 5; NAME2: Угловая полка доя икон\nCATEGORY1: Для дома и дачи->Мебель и интерьер->Предметы интерьера, искусство->Ароматы для дома и свечи; CATEGORY2: Для дома и дачи->Мебель и интерьер->Другое->none\nDESCRIPTION1: Подсвечник давленный номер 5 высота  22-23 см . Без доставки . Самовывоз; DESCRIPTION2: 35 x35x50.  Забрать можно сегодня до 12.00',
 'group_id': 134041,
 'label': 0}

In [9]:
@dataclass
class Config:
    """Hyper-parameters and paths for the fine-tuning run.

    Every field has a default, so `Config()` yields the settings used here
    and single values can be overridden by keyword, e.g. `Config(lr=1e-5)`.
    """

    # Base directory name for exported artifacts.
    output_dir: str = 'output'
    # Hugging Face Hub id of the pretrained encoder.
    checkpoint: str = 'DeepPavlov/rubert-base-cased'
    # Size of the classification head.
    num_labels: int = 2
    # Maximum token length passed to the tokenizer (longer inputs are truncated).
    max_length: int = 512
    # Optimizer name forwarded to TrainingArguments(optim=...).
    optim_type: str = 'adamw_torch'
    per_device_train_batch_size: int = 32
    gradient_accumulation_steps: int = 1
    per_device_eval_batch_size: int = 32
    n_epochs: int = 1
    lr: float = 2e-5
    # A fraction, so it must be annotated float; it was previously
    # mis-annotated as int.
    warmup_ratio: float = 0.03

config = Config()

In [10]:
# Training configuration for the Hugging Face Trainer.
# Note: the output directory is derived from VER, not from `config.output_dir`.
training_args = TrainingArguments(
    output_dir=f'output-{VER}',
    overwrite_output_dir=True,
    report_to='wandb',
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    # Evaluation batch size is not set here; it is assigned on `training_args`
    # later, right before prediction.
    # per_device_eval_batch_size=config.per_device_eval_batch_size,
    # Log the training loss at every step.
    logging_steps=1,
    # No evaluation during training; `eval_strategy` is likewise assigned later.
    # eval_strategy='epoch',
    # Checkpoint every 500 steps and keep at most 4 checkpoints on disk.
    save_strategy='steps',
    save_steps=500,
    save_total_limit=4,
    optim=config.optim_type,
    learning_rate=config.lr,
    warmup_ratio=config.warmup_ratio,
    # Keep every dataset column so `collate_fn` can read the raw `product_row` text.
    remove_unused_columns=False,
    lr_scheduler_type='cosine',  # 'cosine' or 'linear' or 'constant' (default is 'linear')
)

In [11]:
# Load the tokenizer that belongs to the pretrained checkpoint named in `config`.
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
# Load the pretrained encoder with a new `num_labels`-way classification head
# directly onto the first GPU.
# `trust_remote_code=True` was removed: the load message shows the checkpoint
# resolves to the built-in BertForSequenceClassification, so there is no need
# to permit execution of code shipped in the model repository.
model = AutoModelForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=config.num_labels,
    device_map='cuda:0',
)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from sklearn.utils.class_weight import compute_class_weight

# Per-class loss weights using scikit-learn's 'balanced' heuristic, ordered like
# np.unique(df['is_double']).
# Note: computed over all rows of `df` (every fold), not only the training fold.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df['is_double']),
    y=df['is_double']
)

In [14]:
def collate_fn(batch):
    """Turn a list of dataset rows into one tokenized, padded model batch.

    Args:
        batch: sequence of mappings, each providing a 'product_row' text and
            a 'label' value.

    Returns:
        The tokenizer output (PyTorch tensors, padded to the longest item and
        truncated to `config.max_length`) with an extra "labels" tensor.
    """
    # Build the label tensor up front, before the texts are tokenized.
    label_tensor = torch.tensor([row['label'] for row in batch])

    encoded = tokenizer(
        [row['product_row'] for row in batch],
        padding=True,
        truncation=True,
        max_length=config.max_length,
        return_tensors="pt",
    )
    encoded["labels"] = label_tensor
    return encoded

In [15]:
class WeightedCETrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: optional sequence/array with one weight per class.
            When omitted, the loss falls back to unweighted cross-entropy.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Always define the attribute so compute_loss never hits an
        # AttributeError when no weights were supplied.
        self.class_weights = (
            None
            if class_weights is None
            else torch.tensor(class_weights, dtype=torch.float)
        )

    # NOTE(review): the keyword order differs from the upstream
    # `Trainer.compute_loss` signature; this works as long as the Trainer
    # passes these two arguments by keyword — confirm against the installed
    # transformers version.
    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        """Compute the (optionally class-weighted) cross-entropy loss.

        Returns the loss, or `(loss, outputs)` when `return_outputs` is true.
        """
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Put the weights on the same device as the logits they are combined with.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [16]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute the area under the precision-recall curve for binary labels.

    Args:
        eval_preds: predictions holding two-column logits (class 0, class 1)
            and the true labels in `label_ids`.

    Returns:
        `{'prauc': <area under the precision-recall curve>}`.
    """
    logits = eval_preds.predictions
    # Some models return a tuple of arrays; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    y_true = eval_preds.label_ids
    # Rank by the logit margin. For two classes P(class 1) = sigmoid(l1 - l0),
    # so the margin orders examples exactly like the softmax probability.
    # The raw class-1 logit alone does not, because it ignores the class-0 logit.
    y_score = logits[:, 1] - logits[:, 0]

    precision, recall, _ = precision_recall_curve(y_true, y_score)
    prauc = auc(recall, precision)

    return {
        'prauc': prauc
    }

In [17]:
import time
from transformers import TrainerCallback

class TimeLimitCallback(TrainerCallback):
    """Ask the Trainer to stop once a wall-time budget has been used up.

    Args:
        time_limit_hours: budget in hours, measured from `on_train_begin`.
    """

    def __init__(self, time_limit_hours):
        self.time_limit_seconds = time_limit_hours * 3600
        self.start_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        # Use a monotonic clock: unlike time.time() it cannot jump when the
        # system clock is adjusted, so the elapsed time stays reliable.
        self.start_time = time.monotonic()

    def on_step_end(self, args, state, control, **kwargs):
        # Defensive: if the begin hook was never called, start timing now
        # instead of failing on `None` arithmetic.
        if self.start_time is None:
            self.start_time = time.monotonic()
            return

        elapsed_time = time.monotonic() - self.start_time
        if elapsed_time >= self.time_limit_seconds:
            control.should_training_stop = True

In [None]:
# Build the trainer for (resumed) training. Evaluation inputs and the time-limit
# callback are intentionally left disabled here.
trainer = WeightedCETrainer(
    args=training_args,
    model=model,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    # compute_metrics=compute_metrics,
    data_collator=collate_fn,
    class_weights=class_weights,
    # callbacks=[TimeLimitCallback(time_limit_hours=11.5)]
)

  super().__init__(*args, **kwargs)


In [None]:
# Resume from a saved checkpoint. In the recorded output the run ends at
# global_step=23495 after about 2 s, i.e. the checkpoint was already at the end
# of the schedule and no further optimisation steps were taken.
trainer.train(
    resume_from_checkpoint='/kaggle/input/rubert-folds0/output-1/checkpoint-23495'
)

[34m[1mwandb[0m: Currently logged in as: [33mlightsource-[0m ([33mlightsource-unk[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Tracking run with wandb version 0.19.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250514_121313-45mqerii[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33moutput-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/lightsource-unk/fixed-clf-bert-fold2[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/lightsource-unk/fixed-clf-bert-fold2/runs/45mqerii[0m


Step,Training Loss


TrainOutput(global_step=23495, training_loss=0.0, metrics={'train_runtime': 2.0429, 'train_samples_per_second': 736048.762, 'train_steps_per_second': 11501.022, 'total_flos': 3.9562588594784256e+17, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
# Load the test pairs that will be scored.
test_df = pd.read_parquet('avito-for-dl-test-rev.parquet')
# NOTE(review): these are placeholder labels, not ground truth. `collate_fn`
# reads item['label'], and the two arbitrary rows set to 1 presumably exist only
# so that both classes are present when `compute_metrics` runs — confirm.
# Any 'prauc' computed on this set is therefore meaningless.
test_df['label'] = 0
test_df.loc[228, 'label'] = 1
test_df.loc[1337, 'label'] = 1
test_dataset = Dataset.from_pandas(test_df)

In [21]:
# Switch on the evaluation settings that were left out when the trainer was built.
# NOTE(review): the two `training_args` assignments only affect `trainer` if it
# still holds this same arguments object — confirm.
training_args.per_device_eval_batch_size = config.per_device_eval_batch_size
training_args.eval_strategy = 'epoch'
# Attach the test set and the metric function directly on the trainer instance.
trainer.eval_dataset = test_dataset
trainer.compute_metrics = compute_metrics

In [22]:
# Run inference on the test set; the result exposes `.predictions` and `.metrics`.
output = trainer.predict(test_dataset=test_dataset)



In [23]:
# Pull the raw model predictions and the metrics dict out of the predict() result.
predictions, metrics = output.predictions, output.metrics

In [24]:
import joblib

# Persist the test predictions and the metrics dict to the working directory.
joblib.dump(predictions, 'predictions_rev.joblib')
joblib.dump(metrics, 'metrics_rev.joblib')

['metrics_rev.joblib']

In [25]:
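# Disabled: export of the final model for this fold.
# NOTE: Transformers models are saved with `save_pretrained(...)`, not `save(...)`.
# final_output_dir = f'{config.output_dir}/final_fold{N_FOLD}'
# model.save_pretrained(final_output_dir)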