In [1]:
import math

import torch
import torch.nn.functional as F
from pytorch_lightning import LightningModule
from torchvision import models
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import torch
from tqdm import tqdm

from sklearn.manifold import TSNE


from pathlib import Path
import os

In [2]:
RANDOM_STATE = 23115
np.random.seed(RANDOM_STATE)

In [3]:
TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

CLS_IDX = TOKENIZER.convert_tokens_to_ids("[CLS]")
PAD_IDX = TOKENIZER.convert_tokens_to_ids("[PAD]")
SEP_IDX = TOKENIZER.convert_tokens_to_ids("[SEP]")


vocab_size = TOKENIZER.vocab_size

def tokenize(text: str):
    raw_tokens = TOKENIZER.encode(text)
    return raw_tokens


def pad_list(
    list_integers, context_size: int = 90, pad_val: int = PAD_IDX, mode="right"
):
    list_integers = list_integers[:context_size]

    if len(list_integers) < context_size:
        if mode == "left":
            list_integers = [pad_val] * (
                context_size - len(list_integers)
            ) + list_integers
        else:
            list_integers = list_integers + [pad_val] * (
                context_size - len(list_integers)
            )

    return list_integers

In [4]:
vocab_size

30522

In [5]:
class PositionalEncoding(torch.nn.Module):

    def __init__(self,
            d_model: int,
            dropout: float = 0.1,
            max_len: int = 5000,
        ):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(1, max_len, d_model)
        pe[0:, :, 0::2] = torch.sin(position * div_term)
        pe[0:, :, 1::2] = torch.cos(position * div_term)
        # позиционное кодирование
        self.register_buffer("pe", pe)

        self.d_model = d_model

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """

        x = x + self.pe[:, : x.size(1)] / math.sqrt(self.d_model)

        return self.dropout(x)

In [6]:
class Cola(LightningModule):

    def __init__(
        self,
        lr=0.001,
        use_pretrained=False,
        dropout=0.2,
        d_model=128,
        n_vocab=30_522,
        smoothing=0.1,
    ):
        super().__init__()
        self.dropout = dropout

        self.lr = lr
        self.d_model = d_model
        self.n_vocab = n_vocab
        self.smoothing = smoothing

        # Text embeddings and encoder
        self.item_embeddings = torch.nn.Embedding(self.n_vocab, self.d_model)
        self.pos_encoder = PositionalEncoding(
            d_model=self.d_model, dropout=self.dropout
        )
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=self.d_model, nhead=4, dropout=self.dropout, batch_first=True
        )
        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=4)

        # Output layer to project to vocabulary size
        self.output_layer = torch.nn.Linear(self.d_model, self.n_vocab)

        self.save_hyperparameters()


    def encode_text(self, x):
        x = self.item_embeddings(x)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = self.output_layer(x)  # Add projection to vocab size

        return x  # Return full sequence output for language modeling

    def forward(self, x):
        x = self.item_embeddings(x)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = self.output_layer(x)  # Project to vocab size
        return x

# Dataset

In [7]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, pad_list):
        self.texts = texts
        self.tokenizer = tokenizer
        self.pad_list = pad_list

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        text_t = self.tokenizer(text)
        text_t = torch.tensor(self.pad_list(text_t), dtype=torch.long)
        return text_t

In [8]:
BASE_PATH = Path("./data/")

In [9]:
# Load data
df = pd.read_csv(
    BASE_PATH / "articles.csv",
    nrows=None,
    dtype={
        "article_id": str,
    },
)

df["text"] = df.apply(
    lambda x: " ".join(
        [
            str(x["prod_name"]),
            str(x["product_type_name"]),
            str(x["product_group_name"]),
            str(x["graphical_appearance_name"]),
            str(x["colour_group_name"]),
            str(x["perceived_colour_value_name"]),
            str(x["index_name"]),
            str(x["section_name"]),
            str(x["detail_desc"]),
        ]
    ),
    axis=1,
)

# Train

In [10]:
# Split data into train and validation sets
train_texts, val_texts = train_test_split(
    df["text"].values, test_size=0.2, random_state=42
)

# Create datasets
train_dataset = TextDataset(train_texts, tokenize, pad_list)
val_dataset = TextDataset(val_texts, tokenize, pad_list)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Initialize model
model = Cola(lr=1e-4, n_vocab=vocab_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=model.lr)
criterion = torch.nn.CrossEntropyLoss()

In [11]:
model.load_state_dict(state_dict=torch.load("model_train_0_370_val_0_27.pth", map_location=device))

  model.load_state_dict(state_dict=torch.load("model_train_0_370_val_0_27.pth", map_location=device))


FileNotFoundError: [Errno 2] No such file or directory: 'model_train_0_370_val_0_27.pth'

In [12]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for texts in tqdm(train_loader):
            texts = texts.to(device)
            optimizer.zero_grad()
            outputs = model(texts)

            targets = texts[:, 1:].contiguous().view(-1)
            outputs = outputs[:, :-1].contiguous().view(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * texts.size(0)

        train_loss = train_loss / len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for texts in val_loader:
                texts = texts.to(device)
                outputs = model(texts)
                # In your training loop
                targets = texts[:, 1:].contiguous().view(-1)
                outputs = outputs[:, :-1, :].contiguous().view(-1, outputs.size(-1))
                loss = criterion(outputs, targets)
                val_loss += loss.item() * texts.size(0)

        val_loss = val_loss / len(val_loader.dataset)

        print(
            f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}"
        )

In [19]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5)

100%|██████████| 1320/1320 [02:49<00:00,  7.79it/s]


Epoch 1/5, Train Loss: 0.2809, Val Loss: 0.2021


100%|██████████| 1320/1320 [02:49<00:00,  7.78it/s]


Epoch 2/5, Train Loss: 0.2780, Val Loss: 0.1998


100%|██████████| 1320/1320 [02:49<00:00,  7.78it/s]


Epoch 3/5, Train Loss: 0.2755, Val Loss: 0.1986


100%|██████████| 1320/1320 [02:49<00:00,  7.79it/s]


Epoch 4/5, Train Loss: 0.2733, Val Loss: 0.1949


100%|██████████| 1320/1320 [02:49<00:00,  7.79it/s]


Epoch 5/5, Train Loss: 0.2705, Val Loss: 0.1933


In [20]:
try:
    torch.save(model.state_dict(), "model_0_2705_0_1933.pth")
except Exception as e:
    print(e)

# Embeddings results

In [12]:
MODEL_PATH = Path("model_0_2705_0_1933.pth")

In [13]:
from sklearn.manifold import TSNE

In [14]:
df = pd.read_csv(
    BASE_PATH / "articles.csv",
    nrows=None,
    dtype={
        "article_id": str,
    },
)

df["text"] = df.apply(
    lambda x: " ".join(
        [
            str(x["prod_name"]),
            str(x["product_type_name"]),
            str(x["product_group_name"]),
            str(x["graphical_appearance_name"]),
            str(x["colour_group_name"]),
            str(x["perceived_colour_value_name"]),
            str(x["index_name"]),
            str(x["section_name"]),
            str(x["detail_desc"])
        ]
    ),
    axis=1,
)

df = df.sample(n=100, random_state=RANDOM_STATE)

In [15]:
model = Cola(lr=1e-4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.load_state_dict(state_dict=torch.load("model_0_2705_0_1933.pth", map_location=device))

model.to(device)

model.eval()

  model.load_state_dict(state_dict=torch.load("model_0_2705_0_1933.pth", map_location=device))


Cola(
  (item_embeddings): Embedding(30522, 128)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (output_layer): Linear(in_features=128, out_features=30522, bias=True)
)

In [16]:
text_embeddings = []

for text in tqdm(
     df.text.values,
    total=len(df),
):


    text_t = tokenize(text)
    text_t = torch.tensor(pad_list(text_t), dtype=torch.long, device=device).unsqueeze(0)

    with torch.no_grad():
        text_embed = model.encode_text(text_t)

        text_embed = text_embed.squeeze().cpu().tolist()

    text_embeddings.append(text_embed)

text_embeddings = np.array(text_embeddings)

100%|██████████| 100/100 [00:05<00:00, 18.12it/s]


In [17]:
len(text_embeddings), len(text_embeddings[2]), len(text_embeddings[0][0])

(100, 90, 30522)

In [18]:
# Flatten embeddings: (100, 90, 30522) -> (100, 90*30522)
flattened_embeddings = text_embeddings.reshape(len(text_embeddings), -1)

# Apply TSNE
tsne = TSNE(
    n_components=3,
    init="random",
    random_state=0,
    learning_rate="auto",
    n_iter=300,
)
Y = tsne.fit_transform(flattened_embeddings)

# Create visualization
fig = px.scatter_3d(
    x=Y[:, 0],
    y=Y[:, 1],
    z=Y[:, 2],
    color=df.index_name,
    hover_data=[df.prod_name, df.product_type_name],
    title="Text Embeddings Visualization (t-SNE)",
    labels={"color": "Index Category"},
    opacity=0.7,
)

# Improve layout
fig.update_layout(
    scene=dict(xaxis_title="t-SNE 1", yaxis_title="t-SNE 2", zaxis_title="t-SNE 3"),
    width=1000,
    height=800,
)

fig.show()

