In [1]:
# Notebook-wide settings
import torch
SEED = 1488  # NOTE(review): not referenced in the visible cells — confirm whether seeding is still intended

# ---------MODEL----------
MODEL_NAME = 'MODEL_BACKBONE_NAME' # placeholder; forwarded to timm.create_model through TrunkModel
FC_DIM = 768  # output width of the embedding head (Linear + BatchNorm1d)

# ---------TRAINING---------
N_EPOCH = 1  # NOTE(review): the visible TrainingArguments read config.n_epochs, not this value
IMG_SIZE = 224  # side length used by the resize transforms and by the dataset's zero-image fallback
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CURRENT_FOLD = 1  # NOTE(review): the fold actually selected in the visible cells is N_FOLD — confirm which one is meant

# ---------DATA-----------
PATH_TO_TRAIN_PARQUETS = r'C:\avito\tables\train/'
PATH_TO_TRAIN_IMAGES = r'C:\avito\images\train\images/'

PATH_TO_TEST_PARQUETS = r'C:\avito\tables\test/'
PATH_TO_TEST_IMAGES = r'C:\avito\images\test\images/'

PATH_TO_TXT = './train_images_zip_paths.txt'

# Columns kept when the training parquet files are merged.
COLUMNS = ['base_item_id',
           'cand_item_id',
           'base_subcategory_name',
           'cand_subcategory_name',
           'group_id', 
           'action_date', 
           'base_title_image', 
           'cand_title_image', 
           'is_double']


In [2]:
import os
VER = 1  # suffix of the Trainer output directory (output-{VER})
N_FOLD = 0  # index of the cross-validation fold used for this run

main_name = f'trunk_{MODEL_NAME}_imgsz_{IMG_SIZE}_'

# The W&B project, notes and run name all carry the same label, so build it once.
wandb_run_label = f'{main_name}fold_{N_FOLD}-OnlineContrastive0.6'

# Do not clobber a real key that is already exported in the environment with
# the placeholder below; only fall back to it when nothing is set.
os.environ.setdefault('WANDB_API_KEY', 'YOUR_API_KEY_HERE')
os.environ['WANDB_PROJECT'] = wandb_run_label
os.environ['WANDB_NOTES'] = wandb_run_label
os.environ['WANDB_NAME'] = wandb_run_label

In [3]:
from dataclasses import dataclass

@dataclass
class Config:
    """Hyper-parameters for the training run.

    The visible TrainingArguments cell reads the batch sizes, gradient
    accumulation, epoch count, optimizer type, learning rate and warmup ratio
    from this object.
    """
    # NOTE(review): output_dir, num_labels and max_length are not read by the
    # visible cells — confirm whether they are still needed.
    output_dir: str = 'output'
    num_labels: int = 2
    max_length: int = 1024
    optim_type: str = 'adamw_torch'
    per_device_train_batch_size: int = 16
    gradient_accumulation_steps: int = 1
    per_device_eval_batch_size: int = 128
    n_epochs: int = 1
    lr: float = 5e-5
    # Fraction of training steps used for warmup; a float, not an int.
    warmup_ratio: float = 0.0001

config = Config()

In [4]:
import torch
import timm
import cv2
import os

import numpy as np
import pandas as pd 
import torch.nn as nn

import torch.nn.functional as F

import albumentations as A
from albumentations.pytorch import ToTensorV2

from timm.scheduler import CosineLRScheduler

from torch.utils.data import Dataset, DataLoader

from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import precision_recall_curve, auc, average_precision_score

from IPython.display import display

from PIL import Image
from tqdm import tqdm

import glob
import random
from gc import collect as garbage_collector



INFO:albumentations.check_version:A new version of Albumentations is available: 2.0.7 (you have 1.4.12). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.


In [5]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)


INFO:datasets:PyTorch version 2.6.0+cu124 available.
INFO:datasets:TensorFlow version 2.18.0 available.


In [6]:
from safetensors.torch import load_file

In [7]:
class TrunkModel(nn.Module):
    """Backbone wrapper around a timm model.

    The backbone is created with ``pretrained=True`` and ``num_classes=0``
    and is printed once on construction. ``forward`` simply delegates to it.

    Args:
        model_name: name passed to ``timm.create_model``.
    """

    def __init__(self, model_name,):
        super().__init__()
        self.trunk_model = timm.create_model(
            model_name=model_name,
            pretrained=True,
            num_classes=0
        )

        #self.__delete_mlp_head()
        print('----TRUNK----')
        print(self.trunk_model)

    def __delete_mlp_head(self):
        """Remove the first ``nn.Linear`` found in the backbone.

        Stores the removed layer's ``in_features`` in
        ``self.mlp_module_in_features``. Currently not called from
        ``__init__`` (the call is commented out).

        Raises:
            ValueError: if the backbone contains no ``nn.Linear`` submodule,
                or if the backbone itself is the ``nn.Linear``.
        """
        # NOTE(review): this picks the FIRST Linear in traversal order, which
        # is not necessarily the classifier head — confirm before re-enabling.
        for name, module in self.trunk_model.named_modules():
            if isinstance(module, nn.Linear):
                break
        else:
            raise ValueError("No nn.Linear layer found in the trunk model")

        if not name:
            # named_modules() reports the root module under the empty name.
            raise ValueError("The trunk model itself is an nn.Linear; nothing to delete")

        self.mlp_module_in_features = module.in_features

        # Names of nested modules are dotted ("a.b.fc"); delattr only works on
        # the direct parent, so walk down to it first.
        *parent_path, child_name = name.split('.')
        parent = self.trunk_model
        for part in parent_path:
            parent = getattr(parent, part)
        delattr(parent, child_name)

        print(
            "MLP head was deleted"
        )


    def forward(self, x):
        """Run ``x`` through the backbone and return its output."""
        return self.trunk_model(x)

In [8]:
class HeadModel(nn.Module):
    """Projection head built from an iterable of layers.

    The layers are wrapped in an ``nn.Sequential``; the first two are then
    re-initialised (they must expose ``weight`` and ``bias``) and the
    resulting head is printed.

    Args:
        mlp_architecture: iterable of modules, unpacked into ``nn.Sequential``.
    """

    def __init__(self, mlp_architecture):
        super().__init__()
        self.head = nn.Sequential(*mlp_architecture)
        self._init_params()
        print('----HEAD----')
        print(self.head)

    def _init_params(self):
        """Xavier-normal weight / zero bias for layer 0; unit weight / zero bias for layer 1."""
        projection = self.head[0]
        nn.init.xavier_normal_(projection.weight)
        nn.init.zeros_(projection.bias)

        scaling = self.head[1]
        nn.init.ones_(scaling.weight)
        nn.init.zeros_(scaling.bias)

    def forward(self, x):
        """Apply the head to ``x``."""
        projected = self.head(x)
        return projected

In [9]:
class EmbeddingModel(nn.Module):
    """Chains a trunk and a head and returns normalised embeddings.

    Args:
        trunk: module applied first to the input.
        head: module applied to the trunk output.
    """

    def __init__(self, trunk, head):
        super().__init__()
        self.trunk = trunk
        self.head = head

    def forward(self, x):
        """Return ``F.normalize(head(trunk(x)))`` (default L2 norm along dim 1)."""
        features = self.trunk(x)
        embeddings = self.head(features)
        return F.normalize(embeddings)

In [10]:
class KakUchitToBlya(Dataset):
    """Dataset of image pairs with a duplicate / not-duplicate label.

    Each item is a dict with keys ``image1``, ``image2`` and ``labels``.
    When either image of a pair cannot be loaded, both images are replaced by
    zero tensors of shape ``(3, IMG_SIZE, IMG_SIZE)`` and the label is set to
    ``IGNORE_LABEL`` (255), the value that the loss and the metric in this
    notebook filter out.

    Args:
        dataframe: frame with ``base_title_image``, ``cand_title_image`` and
            ``is_double`` columns.
        transforms: callable invoked as ``transforms(image=img)['image']``;
            skipped when falsy.
        path_to_images: prefix prepended to ``<image name>.jpg``.
    """

    # Label assigned to pairs whose images could not be loaded.
    IGNORE_LABEL = 255

    def __init__(self, dataframe, transforms, path_to_images):
        self.data = self.create_image_names_pairs(dataframe)
        self.transforms = transforms
        self.path = path_to_images

    @staticmethod
    def create_image_names_pairs(df):
        """Return a list of ``(base_title_image, cand_title_image, is_double)`` tuples."""
        return list(zip(df.base_title_image, df.cand_title_image, df.is_double))

    def __len__(self):
        return len(self.data)

    def _load_image(self, image_name):
        """Read one image, convert BGR -> RGB and apply the transforms (if any)."""
        image_path = self.path + image_name + '.jpg'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable / missing file by returning None.
            raise FileNotFoundError(f'Cannot read image: {image_path}')

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transforms:
            image = self.transforms(image=image)['image']
        return image

    def __getitem__(self, idx):
        img1, img2, target = self.data[idx]

        try:
            image1 = self._load_image(img1)
            image2 = self._load_image(img2)
        except Exception:
            # Deliberate best-effort: a broken pair becomes an ignorable sample
            # instead of aborting the epoch. `Exception` (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            image1 = torch.zeros((3, IMG_SIZE, IMG_SIZE))
            image2 = torch.zeros((3, IMG_SIZE, IMG_SIZE))
            target = self.IGNORE_LABEL

        return {
            "image1": image1,
            "image2": image2,
            "labels": torch.tensor(target, dtype=torch.long)
        }

In [11]:
class OnlineContrastiveLoss(nn.Module):
    """Contrastive loss on cosine distance with three selectable strategies.

    Rows whose label equals ``IGNORE_LABEL`` (255) are dropped before the loss
    is computed.

    Strategies:
        * ``'online'``   - only "hard" pairs contribute: negatives closer than
          the farthest positive and positives farther than the closest
          negative.
        * ``'common'``   - plain contrastive loss over every remaining pair.
        * ``'combined'`` - average of the size-averaged online and common losses.

    Args:
        margin: distance below which negative pairs are penalised.
        strategy: one of ``'online'``, ``'common'``, ``'combined'``.

    Raises:
        ValueError: if ``strategy`` is not one of the supported names.
    """

    IGNORE_LABEL = 255
    STRATEGIES = ('online', 'common', 'combined')

    def __init__(
        self, margin: float = 0.5, strategy='online'
    ) -> None:

        super().__init__()
        if strategy not in self.STRATEGIES:
            # Previously an unknown name made forward() silently return None.
            raise ValueError(
                f"Unknown strategy {strategy!r}; expected one of {self.STRATEGIES}"
            )
        self.margin = margin
        self.distance_metric = self._cosine_distance
        self.strategy = strategy

    @staticmethod
    def _cosine_distance(x, y):
        """Cosine distance ``1 - cos(x, y)`` between matching rows."""
        return 1 - F.cosine_similarity(x, y)

    def _online_loss(self, distances, labels, size_average):
        """Contrastive loss restricted to hard positive / hard negative pairs."""
        negs = distances[labels == 0]
        poss = distances[labels == 1]

        negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())]
        positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())]

        positive_loss = positive_pairs.pow(2).sum()
        negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum()
        loss = positive_loss + negative_loss
        if size_average:
            # The epsilon keeps the division finite when no hard pair was selected.
            loss = loss / (len(negative_pairs) + len(positive_pairs) + 1e-8)
        return loss

    def _common_loss(self, distances, labels, size_average):
        """Contrastive loss over every pair."""
        losses = 0.5 * (
            labels.float() * distances.pow(2)
            + (1 - labels).float() * F.relu(self.margin - distances).pow(2)
        )
        if losses.numel() == 0:
            # mean() of an empty tensor is NaN; an all-ignored batch should
            # contribute a zero loss instead.
            return losses.sum()
        return losses.mean() if size_average else losses.sum()

    def forward(self, embedding1, embedding2, labels, size_average=True):
        """Compute the loss for row-aligned embedding pairs.

        Args:
            embedding1, embedding2: embeddings compared row by row.
            labels: 1 for positive pairs, 0 for negative pairs,
                ``IGNORE_LABEL`` for rows to skip.
            size_average: average instead of sum (ignored by ``'combined'``,
                which always averages).
        """
        mask = labels != self.IGNORE_LABEL
        labels = labels[mask]
        embedding1 = embedding1[mask]
        embedding2 = embedding2[mask]

        distance_matrix = self.distance_metric(embedding1, embedding2)

        if self.strategy == 'online':
            return self._online_loss(distance_matrix, labels, size_average)
        elif self.strategy == 'common':
            return self._common_loss(distance_matrix, labels, size_average)
        elif self.strategy == 'combined':
            online = self._online_loss(distance_matrix, labels, True)
            common = self._common_loss(distance_matrix, labels, True)
            return (online + common) * 0.5
        # Only reachable if `strategy` was reassigned after construction.
        raise ValueError(f"Unknown strategy {self.strategy!r}")

In [12]:
class OnlineContrastiveTrainer(Trainer):
    """Trainer that embeds both images of every pair in one forward pass.

    The two image batches are concatenated along dim 0, sent through the
    model once, and the output is split back into two halves.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): the W&B run label says "OnlineContrastive", but the
        # strategy selected here is 'common' — confirm which one is intended.
        self.loss_fn = OnlineContrastiveLoss(strategy='common', margin=0.6)

    @staticmethod
    def _embed_pairs(model, inputs):
        """Pop the pair tensors from ``inputs`` and embed them together.

        Returns ``(outputs, emb1, emb2, labels)`` where ``outputs`` is the raw
        model output for the stacked batch.
        """
        first_images = inputs.pop('image1')
        second_images = inputs.pop('image2')
        labels = inputs.pop('labels')

        stacked = torch.cat([first_images, second_images], dim=0)

        assert stacked.shape[0] % 2 == 0

        outputs = model(stacked)
        first_emb, second_emb = torch.chunk(outputs, 2)
        return outputs, first_emb, second_emb, labels

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        """Return the contrastive loss, optionally together with the raw outputs."""
        outputs, emb1, emb2, labels = self._embed_pairs(model, inputs)

        loss = self.loss_fn(emb1, emb2, labels)
        if return_outputs:
            return (loss, outputs)
        return loss

    def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
        """Return ``(None, (emb1, emb2), labels)``; no loss is computed here."""
        model.eval()
        with torch.no_grad():
            _, emb1, emb2, labels = self._embed_pairs(model, inputs)

        return None, (emb1, emb2), labels
            


In [13]:
def get_merged_df_from_path(path, columns_for_filter=None):
    """Read every ``*.parquet`` file under ``path`` and concatenate them.

    Args:
        path: prefix to which ``'*.parquet'`` is appended for globbing, so a
            directory must be given with its trailing separator.
        columns_for_filter: optional list of columns to keep; when falsy the
            full frame is returned.

    Returns:
        A single DataFrame with a fresh integer index.

    Raises:
        FileNotFoundError: if no parquet file matches the pattern.
    """
    pattern = path + '*.parquet'
    # glob returns files in arbitrary order; sort so the row order is reproducible.
    parquet_files = sorted(glob.glob(pattern))
    if not parquet_files:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise FileNotFoundError(f'No parquet files match pattern: {pattern}')

    all_data_df = pd.concat(
        [pd.read_parquet(parquet, engine='pyarrow') for parquet in parquet_files],
        ignore_index=True,
    )

    return all_data_df[columns_for_filter] if columns_for_filter else all_data_df

In [14]:
# Merge the training parquet files, keep only COLUMNS, and rename the two
# item-id columns to variantid_1 / variantid_2.
df = (
    get_merged_df_from_path(PATH_TO_TRAIN_PARQUETS, COLUMNS)
    .rename(
        columns={
            'base_item_id': 'variantid_1',
            'cand_item_id': 'variantid_2',
        }
    )
)

In [18]:
# An earlier version of this cell read a pre-built parquet with the columns
# variantid_1, variantid_2, group_id and is_double instead of using `df`.

# Fix the row order first (sort by the pair ids), then shuffle with a fixed seed.
df = (
    df
    .sort_values(by=['variantid_1', 'variantid_2'])
    .sample(len(df), random_state=42)
)

sgkf = StratifiedGroupKFold(n_splits=5)

# One entry per fold, keyed by the fold number as a string.
fold_mapping = {
    str(fold_number): {'train_idxs': [], 'val_idxs': []}
    for fold_number in range(5)
}

# Stratify on is_double while keeping every group_id inside a single fold.
fold_splits = sgkf.split(df, df['is_double'], groups=df['group_id'])
for fold, (train_idx, val_idx) in enumerate(fold_splits):
    fold_mapping[str(fold)]['train_idxs'] = train_idx
    fold_mapping[str(fold)]['val_idxs'] = val_idx

In [19]:
# Positional row indices of the fold chosen by N_FOLD.
train_idx, val_idx = (
    fold_mapping[str(N_FOLD)][split_key]
    for split_key in ('train_idxs', 'val_idxs')
)

In [20]:
# Training pipeline: resize to IMG_SIZE x IMG_SIZE, normalise, convert to tensor.
# Horizontal flip, vertical flip and rotation were tried here and are
# currently disabled, so it matches the evaluation pipeline.
train_transforms = A.Compose([
    A.Resize(height=IMG_SIZE, width=IMG_SIZE),
    A.Normalize(),
    ToTensorV2(),
])

# Evaluation pipeline: deterministic resize + normalise + tensor conversion.
test_transforms = A.Compose([
    A.Resize(height=IMG_SIZE, width=IMG_SIZE),
    A.Normalize(),
    ToTensorV2(),
])


In [21]:
# Both splits come from the training frame and read from the same image folder;
# only the rows and the transform pipeline differ.
train_dataset = KakUchitToBlya(
    dataframe=df.iloc[train_idx],
    transforms=train_transforms,
    path_to_images=PATH_TO_TRAIN_IMAGES,
)
val_dataset = KakUchitToBlya(
    dataframe=df.iloc[val_idx],
    transforms=test_transforms,
    path_to_images=PATH_TO_TRAIN_IMAGES,
)

In [22]:
training_args = TrainingArguments(
    # --- output / logging ---
    output_dir=f'output-{VER}',
    overwrite_output_dir=True,
    report_to='wandb',
    logging_steps=10,
    # --- checkpoints ---
    save_strategy='steps',
    save_steps=1000,
    save_total_limit=4,
    # --- schedule and batch sizes (taken from `config`) ---
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    # --- optimiser ---
    optim=config.optim_type,
    learning_rate=config.lr,
    warmup_ratio=config.warmup_ratio,
    lr_scheduler_type='cosine',
    # --- data handling / evaluation ---
    # The dataset yields image1 / image2 / labels, so unused-column pruning is switched off.
    remove_unused_columns=False,
    # NOTE(review): eval_strategy is left unset here (an 'epoch' setting was
    # disabled) — confirm whether evaluation during training is wanted.
    do_eval=True,
)

In [23]:
trunk_model = TrunkModel(MODEL_NAME)

# Head: linear projection from the backbone feature width to FC_DIM, then batch norm.
head_model_architecture = nn.Sequential(
    nn.Linear(
        in_features=trunk_model.trunk_model.num_features,
        out_features=FC_DIM,
    ),
    nn.BatchNorm1d(num_features=FC_DIM),
)
head_model = HeadModel(head_model_architecture)

# Full embedding network, moved to the selected device.
model = EmbeddingModel(trunk_model, head_model)
model = model.to(DEVICE)

INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/tf_efficientnet_b5.in1k)
INFO:timm.models._hub:[timm/tf_efficientnet_b5.in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


----TRUNK----
EfficientNet(
  (conv_stem): Conv2dSame(3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNormAct2d(
    48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
        (bn1): BatchNormAct2d(
          48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (aa): Identity()
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(48, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        

In [28]:
def compute_metrics(eval_preds):
    """Area under the precision-recall curve of pairwise cosine similarity.

    Args:
        eval_preds: object whose ``predictions`` is an ``(emb1, emb2)`` pair of
            numpy arrays and whose ``label_ids`` holds the pair labels.

    Returns:
        ``{'prauc': <float>}``; rows labelled 255 are excluded.
    """
    first_embeddings, second_embeddings = eval_preds.predictions
    all_labels = eval_preds.label_ids

    similarities = F.cosine_similarity(
        torch.from_numpy(first_embeddings),
        torch.from_numpy(second_embeddings),
    ).numpy()

    # 255 marks pairs whose images could not be loaded.
    keep = all_labels != 255
    precision, recall, _ = precision_recall_curve(all_labels[keep], similarities[keep])

    return {'prauc': auc(recall, precision)}

In [25]:
# Wire the model, both datasets and the PR-AUC metric into the custom trainer.
trainer = OnlineContrastiveTrainer(
    args=training_args, 
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset, 
    compute_metrics=compute_metrics
    
)

In [None]:
# Start training with the trainer built in the previous cell.
trainer.train()

In [29]:
# Evaluate on the validation fold; the returned dict is shown as the cell output.
trainer.evaluate()

{'eval_prauc': 0.18988509233024065}