<a href="https://colab.research.google.com/github/juliawol/WB_Embedder/blob/main/Fine_tuned_Embedder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers torch

In [3]:
from datasets import load_dataset
import pandas as pd

# Hugging Face Hub repository ids of the three datasets used in this notebook
CARDS_DATASET = "JuliaWolken/WB_CARDS"
TRIPLETS_DATASET = "JuliaWolken/WB_TRIPLETS"
BRANDS_DATASET = "JuliaWolken/WB_BRANDS"

# Download each repository, take its "train" split and keep a pandas copy of it
print("Loading main dataset (cards)...")
cards_splits = load_dataset(CARDS_DATASET)
data_sampled = cards_splits["train"]
data_sampled_df = data_sampled.to_pandas()

print("Loading triplet dataset...")
triplet_splits = load_dataset(TRIPLETS_DATASET)
triplet_candidates = triplet_splits["train"]
triplet_candidates_df = triplet_candidates.to_pandas()

print("Loading brand dataset...")
brand_splits = load_dataset(BRANDS_DATASET)
brand_candidates = brand_splits["train"]
brand_candidates_df = brand_candidates.to_pandas()

# Quick sanity check: print a heading and the first rows of every frame
for heading, frame in (
    ("\nMain dataset (data_sampled_30.csv):", data_sampled_df),
    ("\nTriplet candidates (triplet_candidates.csv):", triplet_candidates_df),
    ("\nBrand candidates (brand_candidates.csv):", brand_candidates_df),
):
    print(heading)
    print(frame.head())



Loading main dataset (cards)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


data_sampled_30.csv:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/137814 [00:00<?, ? examples/s]

Loading triplet dataset...


triplet_candidates.csv:   0%|          | 0.00/843M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127892 [00:00<?, ? examples/s]

Loading brand dataset...


brand_candidates.csv:   0%|          | 0.00/612M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/22735953 [00:00<?, ? examples/s]


Main dataset (data_sampled_30.csv):
                             aggregated_charc_values  \
0  Материал изделия: ЛДСП\nВес с упаковкой (кг): ...   
1  Цвет: красный\nШирина упаковки: 10 см \nСовмес...   
2  Высота предмета: 200 см \nСтиль дизайна: Миним...   
3  Высота предмета: 200 см \nСтиль дизайна: Миним...   
4  Ставка НДС: Без НДС\nВес без упаковки (кг): 13...   

                                               title  \
0              Набор для увеличения кровати - белый    
1           Чехол-книжка Tecno Spark 9Pro Спарк 9Про   
2  Шкаф пенал двухдверный распашной серый витрина...   
3  Шкаф пенал двухдверный распашной серый витрина...   
4  Комплект барных стульев Loft со спинкой для ку...   

                                         description  \
0  Отличный вариант для тех, кто не хочет расстав...   
1  НА ФОТО ОБРАЗЕЦ ЧЕХЛА!!! ВАМ ПРИДЕТ ЧЕХОЛ В СО...   
2  Принцесса Мелания Шкаф-витрина  - это идеально...   
3  Принцесса Мелания Шкаф-витрина  - это идеально...   
4  ВНИМАН

In [7]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import pandas as pd

# Hyper-parameters and run settings shared by every cell below
MODEL_NAME = "DeepPavlov/rubert-base-cased"
MAX_LENGTH = 256
BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 1e-5
WARMUP_STEPS = 500
RANDOM_SEED = 42

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Seed Python's and torch's generators before any weights are created
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): MultiTaskModel is defined in a later cell of this notebook;
# that cell has to be executed before this line can run.
model = MultiTaskModel(MODEL_NAME)
model = model.to(DEVICE)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
class MultiTaskModel(nn.Module):
    """Shared transformer encoder with one linear head per task.

    Every head consumes the encoder's hidden state at the first token
    position of ``last_hidden_state``.

    Args:
        model_name: Identifier or path handed to ``AutoModel.from_pretrained``.
        num_categories: Number of outputs of the category head. Defaults to
            60, the value that was previously hard-coded.
    """

    def __init__(self, model_name, num_categories=60):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size
        # Heads are created in a fixed order so seeded initialisation is stable.
        self.category_head = nn.Linear(hidden_size, num_categories)
        self.sentiment_head = nn.Linear(hidden_size, 2)  # Binary classification
        self.ranking_head = nn.Linear(hidden_size, 1)  # Used for brand similarity

    def forward(self, input_ids, attention_mask, task="category"):
        """Encode the batch and apply the head selected by ``task``.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            task: One of ``"category"``, ``"sentiment"`` or ``"ranking"``.

        Returns:
            The output of the selected linear head.

        Raises:
            ValueError: If ``task`` is not one of the supported names.
        """
        # Pick the head first so an invalid task fails before the (expensive)
        # encoder forward pass instead of after it.
        if task == "category":
            head = self.category_head
        elif task == "sentiment":
            head = self.sentiment_head
        elif task == "ranking":
            head = self.ranking_head
        else:
            raise ValueError("Unknown task")

        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_emb = outputs.last_hidden_state[:, 0, :]
        return head(cls_emb)

def contrastive_loss(anchor_emb, positive_emb, negative_emb, margin=0.2):
    """Return the triplet margin loss of the three embedding batches.

    Delegates to ``F.triplet_margin_loss``; apart from ``margin`` all of that
    function's defaults are left untouched.
    """
    return F.triplet_margin_loss(
        anchor=anchor_emb,
        positive=positive_emb,
        negative=negative_emb,
        margin=margin,
    )


class TripletDataset(torch.utils.data.Dataset):
    """Tokenised (anchor, positive, negative) text triplets.

    ``data`` is indexed with ``.iloc`` and must provide the columns
    ``Anchor``, ``Positive`` and ``Negative``. Each item is a dict holding the
    ``input_ids`` and ``attention_mask`` of the three texts, with the leading
    batch dimension added by the tokenizer squeezed away.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        # Read all three columns up front, then tokenise them in a fixed order.
        texts = (
            ("anchor", record["Anchor"]),
            ("positive", record["Positive"]),
            ("negative", record["Negative"]),
        )

        sample = {}
        for role, text in texts:
            encoded = self.tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt",
            )
            sample[f"{role}_input_ids"] = encoded["input_ids"].squeeze(0)
            sample[f"{role}_attention_mask"] = encoded["attention_mask"].squeeze(0)
        return sample

def triplet_collate_fn(batch):
    """Stack triplet samples into batch tensors, ignoring ``None`` entries.

    Returns ``None`` when no sample is left after dropping the ``None`` ones.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None

    fields = (
        "anchor_input_ids",
        "anchor_attention_mask",
        "positive_input_ids",
        "positive_attention_mask",
        "negative_input_ids",
        "negative_attention_mask",
    )
    return {
        field: torch.stack([sample[field] for sample in samples])
        for field in fields
    }
# Batches of tokenised triplets for the first fine-tuning stage.
# `triplet_collate_fn` is passed explicitly so this loader follows the same
# collate contract as the brand and cards loaders (None samples dropped,
# an all-None batch collated to None) instead of relying on the default collate.
triplet_loader = DataLoader(
    TripletDataset(triplet_candidates_df, tokenizer, MAX_LENGTH),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=triplet_collate_fn,
)

class BrandDataset(torch.utils.data.Dataset):
    """Tokenised brand-name pairs with a numeric label.

    ``data`` is indexed with ``.iloc`` and must provide the columns
    ``Brand1``, ``Brand2`` and ``Label``. A row whose tokenisation raises is
    reported on stdout and yields ``None`` instead of a sample.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        first_name = record["Brand1"]
        second_name = record["Brand2"]
        target = record["Label"]

        tokenize_options = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        try:
            first_enc, second_enc = [
                self.tokenizer(name, **tokenize_options)
                for name in (first_name, second_name)
            ]
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            return None

        sample = {}
        for prefix, encoded in (("brand1", first_enc), ("brand2", second_enc)):
            sample[f"{prefix}_input_ids"] = encoded["input_ids"].squeeze(0)
            sample[f"{prefix}_attention_mask"] = encoded["attention_mask"].squeeze(0)
        sample["label"] = float(target)  # Ensure label is numeric
        return sample


def brand_collate_fn(batch):
    """Stack brand-pair samples into batch tensors, ignoring ``None`` entries.

    The per-sample ``label`` values are gathered into one tensor stored under
    ``labels``. Returns ``None`` when every entry of ``batch`` was ``None``.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None

    tensor_fields = (
        "brand1_input_ids",
        "brand1_attention_mask",
        "brand2_input_ids",
        "brand2_attention_mask",
    )
    collated = {
        field: torch.stack([sample[field] for sample in samples])
        for field in tensor_fields
    }
    collated["labels"] = torch.tensor([sample["label"] for sample in samples])
    return collated


# Shuffled batches of brand pairs; rows the dataset could not tokenise come
# back as None and are dropped by brand_collate_fn.
brand_dataset = BrandDataset(brand_candidates_df, tokenizer, MAX_LENGTH)
brand_loader = DataLoader(
    brand_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=brand_collate_fn,
    shuffle=True,
)

class CardsDataset(torch.utils.data.Dataset):
    """Product-card texts with a label for one of two classification tasks.

    Supported ``task`` values:

    * ``"category_classification"`` - text is ``title + " " + description``
      and the label comes from the ``parentname`` column.
    * ``"sentiment_classification"`` - text is ``description`` and the label
      is 1 when ``view_cl_cnt`` is greater than 5, else 0.

    ``data`` is indexed with ``.iloc``. A row that cannot be turned into a
    sample is reported on stdout and yields ``None``.

    Attributes set in ``__init__``:
        label2id: ``dict`` mapping each distinct non-missing ``parentname``
            value to a class id, or ``None`` when the column is numeric (or
            the task is not category classification), in which case labels
            are passed through unchanged.
    """

    def __init__(self, data, tokenizer, max_length, task):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.task = task

        # The collate function feeds labels to torch.tensor, which cannot
        # hold strings, so non-numeric category values are mapped to ids.
        # NOTE(review): assumes `parentname` may contain category names -
        # confirm against the dataset; numeric columns are left untouched.
        self.label2id = None
        if task == "category_classification" and "parentname" in getattr(data, "columns", ()):
            categories = data["parentname"]
            if not pd.api.types.is_numeric_dtype(categories):
                # Sort for a deterministic name -> id assignment across runs.
                names = sorted(categories.dropna().unique(), key=str)
                self.label2id = {name: index for index, name in enumerate(names)}

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _text_or_empty(value):
        """Return ``value``, or ``""`` when it is falsy or a missing marker."""
        # NaN is truthy, so a plain `value or ""` would let it through and the
        # following string concatenation would raise.
        is_missing = (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )
        return "" if is_missing else (value or "")

    def __getitem__(self, idx):
        """Return the tokenised sample at ``idx`` or ``None`` if it is unusable.

        Raises:
            ValueError: If ``self.task`` is not a supported task name.
        """
        row = self.data.iloc[idx]
        if self.task == "category_classification":
            text = self._text_or_empty(row["title"]) + " " + self._text_or_empty(row["description"])
            label = row["parentname"]
            if self.label2id is not None:
                if pd.isna(label):
                    # No class id exists for a missing name; skip the row the
                    # same way tokenisation failures are skipped.
                    print(f"Error processing row {idx}: missing category label")
                    return None
                label = self.label2id[label]
        elif self.task == "sentiment_classification":
            text = row["description"]
            label = 1 if row["view_cl_cnt"] > 5 else 0
        else:
            raise ValueError("Unknown task")

        try:
            text_enc = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            return None

        return {
            "input_ids": text_enc["input_ids"].squeeze(0),
            "attention_mask": text_enc["attention_mask"].squeeze(0),
            "label": label,
        }

def cards_collate_fn(batch):
    """Stack card samples into batch tensors, ignoring ``None`` entries.

    The per-sample ``label`` values are gathered into one tensor stored under
    ``labels``. Returns ``None`` when every entry of ``batch`` was ``None``.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        return None

    collated = {
        field: torch.stack([sample[field] for sample in samples])
        for field in ("input_ids", "attention_mask")
    }
    collated["labels"] = torch.tensor([sample["label"] for sample in samples])
    return collated

# Both card loaders read the same DataFrame and differ only in the task that
# the dataset uses to derive text and label.
cards_loader_options = dict(
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=cards_collate_fn,
)

category_dataset = CardsDataset(
    data_sampled_df, tokenizer, MAX_LENGTH, task="category_classification"
)
category_loader = DataLoader(category_dataset, **cards_loader_options)

sentiment_dataset = CardsDataset(
    data_sampled_df, tokenizer, MAX_LENGTH, task="sentiment_classification"
)
sentiment_loader = DataLoader(sentiment_dataset, **cards_loader_options)

# One loss per task name; "triplet" maps to the module-level contrastive_loss.
loss_fns = dict(
    category=nn.CrossEntropyLoss(),
    sentiment=nn.CrossEntropyLoss(),
    ranking=nn.MSELoss(),
    triplet=contrastive_loss,
)

# NOTE(review): `AdamW` here is the class imported from transformers, which is
# reported as deprecated in favour of torch.optim.AdamW - confirm against the
# installed transformers version.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The linear schedule spans one pass budget over all four loaders per epoch.
batches_per_epoch = sum(
    len(loader)
    for loader in (triplet_loader, brand_loader, category_loader, sentiment_loader)
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=batches_per_epoch * EPOCHS,
)


In [None]:
# Stage 1: fine-tune the shared encoder on (anchor, positive, negative) triplets

print("Starting fine-tuning for triplets...")
model.train()


def _normalized_first_token(token_ids, mask):
    """Return the L2-normalised first-token hidden state of ``model.encoder``."""
    hidden = model.encoder(input_ids=token_ids, attention_mask=mask).last_hidden_state
    return F.normalize(hidden[:, 0, :], p=2, dim=1)


for epoch in range(EPOCHS):
    for step, batch in enumerate(triplet_loader):
        if batch is None:
            continue

        # Move every tensor of the batch to the training device in one go
        tensors = {name: value.to(DEVICE) for name, value in batch.items()}

        # Encode the three texts in anchor / positive / negative order
        anchor_emb = _normalized_first_token(
            tensors['anchor_input_ids'], tensors['anchor_attention_mask']
        )
        positive_emb = _normalized_first_token(
            tensors['positive_input_ids'], tensors['positive_attention_mask']
        )
        negative_emb = _normalized_first_token(
            tensors['negative_input_ids'], tensors['negative_attention_mask']
        )

        loss = contrastive_loss(anchor_emb, positive_emb, negative_emb)

        # One optimisation step, then advance the learning-rate schedule
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        if not step % 10:
            print(f"Epoch {epoch+1}/{EPOCHS}, Step {step}, Loss: {loss.item():.4f}")

# Persist the encoder weights produced by this stage
triplets_dir = "fine_tuned_triplets"
os.makedirs(triplets_dir, exist_ok=True)
model.encoder.save_pretrained(triplets_dir)

print("Fine-tuning for triplets complete. Model saved.")


Epoch 2/3, Step 160, Loss: 0.0017


In [None]:
# Load Model from Step 1
# The saved weights are copied INTO the existing encoder rather than rebinding
# `model.encoder` to a freshly constructed module. `optimizer` was built from
# `model.parameters()` before this cell and only updates those exact parameter
# objects; a replacement encoder would receive gradients but never be stepped,
# so this stage would train nothing.
stage1_encoder = AutoModel.from_pretrained("fine_tuned_triplets")
model.encoder.load_state_dict(stage1_encoder.state_dict())
del stage1_encoder  # free the temporary copy

# Fine-Tuning for Brand Similarity
print("Starting fine-tuning for brand similarity...")
model.train()

for epoch in range(EPOCHS):
    for step, batch in enumerate(brand_loader):
        # brand_collate_fn returns None when every row of the batch was dropped
        if batch is None:
            continue

        brand1_input_ids = batch['brand1_input_ids'].to(DEVICE)
        brand1_attention_mask = batch['brand1_attention_mask'].to(DEVICE)
        brand2_input_ids = batch['brand2_input_ids'].to(DEVICE)
        brand2_attention_mask = batch['brand2_attention_mask'].to(DEVICE)
        labels = batch['labels'].float().to(DEVICE)

        # Encode embeddings (first-token hidden state of each brand name)
        brand1_emb = model.encoder(input_ids=brand1_input_ids, attention_mask=brand1_attention_mask).last_hidden_state[:, 0, :]
        brand2_emb = model.encoder(input_ids=brand2_input_ids, attention_mask=brand2_attention_mask).last_hidden_state[:, 0, :]

        # Normalize embeddings so the dot product below is a cosine similarity
        brand1_emb = F.normalize(brand1_emb, p=2, dim=1)
        brand2_emb = F.normalize(brand2_emb, p=2, dim=1)

        # Compute similarity scores and regress them onto the labels
        similarity_scores = (brand1_emb * brand2_emb).sum(dim=1)
        loss = F.mse_loss(similarity_scores, labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        if step % 10 == 0:
            print(f"Epoch {epoch+1}/{EPOCHS}, Step {step}, Loss: {loss.item():.4f}")

# Save Brand Similarity Model
os.makedirs("fine_tuned_brands", exist_ok=True)
model.encoder.save_pretrained("fine_tuned_brands")

print("Fine-tuning for brand similarity complete. Model saved.")


In [None]:
# Load Model from Step 2
# The saved weights are copied INTO the existing encoder rather than rebinding
# `model.encoder` to a freshly constructed module. `optimizer` was built from
# `model.parameters()` before this cell and only updates those exact parameter
# objects; with a replacement encoder only the task heads would be trained.
stage2_encoder = AutoModel.from_pretrained("fine_tuned_brands")
model.encoder.load_state_dict(stage2_encoder.state_dict())
del stage2_encoder  # free the temporary copy


# Fine-Tuning for Classification Tasks
print("Starting fine-tuning for classification tasks...")
model.train()

# The loss module is stateless, so it is built once instead of on every step
category_criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    for step, batch in enumerate(category_loader):  # Iterate through category batches
        # cards_collate_fn returns None when every row of the batch was dropped
        if batch is None:
            continue

        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        # Forward pass through the encoder and the category head
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, task="category")

        # Compute loss
        loss = category_criterion(outputs, labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        if step % 10 == 0:
            print(f"Epoch {epoch+1}/{EPOCHS}, Task category classification, Step {step}, Loss: {loss.item():.4f}")

# Save Final Model
os.makedirs("fine_tuned_final", exist_ok=True)
model.encoder.save_pretrained("fine_tuned_final")

print("Fine-tuning for classification tasks complete. Final model saved.")
