In [1]:
import math

import torch
import torch.nn.functional as F
from pytorch_lightning import LightningModule
from torchvision import models
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import torch
from tqdm import tqdm

from sklearn.manifold import TSNE


from pathlib import Path
import os

import sys
sys.path.append("../")
from recsys import config

[32m2024-12-18 01:51:01.679[0m | [1mINFO    [0m | [36mrecsys.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/artem216/transformer-recsys[0m


In [2]:
RANDOM_STATE = 23115
np.random.seed(RANDOM_STATE)

In [3]:
TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

CLS_IDX = TOKENIZER.convert_tokens_to_ids("[CLS]")
PAD_IDX = TOKENIZER.convert_tokens_to_ids("[PAD]")
SEP_IDX = TOKENIZER.convert_tokens_to_ids("[SEP]")


vocab_size = TOKENIZER.vocab_size

def tokenize(text: str):
    raw_tokens = TOKENIZER.encode(text)
    return raw_tokens


def pad_list(
    list_integers, context_size: int = 90, pad_val: int = PAD_IDX, mode="right"
):
    list_integers = list_integers[:context_size]

    if len(list_integers) < context_size:
        if mode == "left":
            list_integers = [pad_val] * (
                context_size - len(list_integers)
            ) + list_integers
        else:
            list_integers = list_integers + [pad_val] * (
                context_size - len(list_integers)
            )

    return list_integers

In [4]:
vocab_size

30522

In [5]:
class PositionalEncoding(torch.nn.Module):

    def __init__(self,
            d_model: int,
            dropout: float = 0.1,
            max_len: int = 5000,
        ):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(1, max_len, d_model)
        pe[0:, :, 0::2] = torch.sin(position * div_term)
        pe[0:, :, 1::2] = torch.cos(position * div_term)
        # позиционное кодирование
        self.register_buffer("pe", pe)

        self.d_model = d_model

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """

        x = x + self.pe[:, : x.size(1)] / math.sqrt(self.d_model)

        return self.dropout(x)

In [6]:
class Cola(LightningModule):

    def __init__(
        self,
        lr=0.001,
        use_pretrained=False,
        dropout=0.2,
        d_model=128,
        n_vocab=30_522,
        smoothing=0.1,
    ):
        super().__init__()
        self.dropout = dropout

        self.lr = lr
        self.d_model = d_model
        self.n_vocab = n_vocab
        self.smoothing = smoothing

        # Text embeddings and encoder
        self.item_embeddings = torch.nn.Embedding(self.n_vocab, self.d_model)
        self.pos_encoder = PositionalEncoding(
            d_model=self.d_model, dropout=self.dropout
        )
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=self.d_model, nhead=4, dropout=self.dropout, batch_first=True
        )
        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=4)

        # Output layer to project to vocabulary size
        self.output_layer = torch.nn.Linear(self.d_model, self.n_vocab)

        self.save_hyperparameters()


    def encode_text(self, x):
        x = self.item_embeddings(x)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = self.output_layer(x)  # Add projection to vocab size

        return x  # Return full sequence output for language modeling

    def forward(self, x):
        x = self.item_embeddings(x)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = self.output_layer(x)  # Project to vocab size
        return x

# Dataset

In [7]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, pad_list):
        self.texts = texts
        self.tokenizer = tokenizer
        self.pad_list = pad_list

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        text_t = self.tokenizer(text)
        text_t = torch.tensor(self.pad_list(text_t), dtype=torch.long)
        return text_t

In [9]:
# Load data
df = pd.read_csv(
    config.PROCESSED_DATA_DIR / "articles.csv",
    nrows=None,
    dtype={
        "article_id": str,
    },
)

df["text"] = df.apply(
    lambda x: " ".join(
        [
            str(x["prod_name"]),
            str(x["product_type_name"]),
            str(x["product_group_name"]),
            str(x["graphical_appearance_name"]),
            str(x["colour_group_name"]),
            str(x["perceived_colour_value_name"]),
            str(x["index_name"]),
            str(x["section_name"]),
            str(x["detail_desc"]),
        ]
    ),
    axis=1,
)

In [11]:
df["text"]

0      Strap top Vest top Garment Upper body Solid Bl...
1      Strap top Vest top Garment Upper body Solid Wh...
2      20 den 1p Stockings Underwear Tights Socks & T...
3      Shape Up 30 den 1p Tights Leggings/Tights Garm...
4      Support 40 den 1p Tights Underwear Tights Sock...
                             ...                        
995    Glamping Shorts Garment Lower body Solid Black...
996    Eleonor button dress Dress Garment Full body S...
997    Agneta jumpsuit Trousers Garment Lower body St...
998    CS Paula dress Dress Garment Full body Other s...
999    STRONG HW seamless tights Leggings/Tights Garm...
Name: text, Length: 1000, dtype: object

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding='max_length')
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    sentence_embeddings = outputs[0]
    sentence_embeddings = sentence_embeddings.mean(dim=1)
    sentence_embeddings = sentence_embeddings.cpu().numpy()
    return sentence_embeddings[0]

In [15]:
tqdm.pandas()
df['Embed_comb_text'] = df['text'].progress_map(lambda x: get_embeddings(x))

100%|██████████| 1000/1000 [00:11<00:00, 88.30it/s]


In [19]:
df.to_csv(config.PROCESSED_DATA_DIR / "articles_embedings.csv")

In [None]:
from MulticoreTSNE import MulticoreTSNE

tsne = MulticoreTSNE(n_jobs=4) 
embedding_tsne = tsne.fit_transform(np.stack(df['Embed_comb_text'].values))

# Train

In [14]:
# Split data into train and validation sets
train_texts, val_texts = train_test_split(
    df["text"].values, test_size=0.2, random_state=42
)

# Create datasets
train_dataset = TextDataset(train_texts, tokenize, pad_list)
val_dataset = TextDataset(val_texts, tokenize, pad_list)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# Initialize model
model = Cola(lr=1e-3, n_vocab=vocab_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=model.lr)
criterion = torch.nn.CrossEntropyLoss()

In [10]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10) -> tuple[float, float]:
    train_loss: float = 0
    val_loss: float = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for texts in train_loader:
            texts = texts.to(device)
            optimizer.zero_grad()
            outputs = model(texts)

            targets = texts[:, 1:].contiguous().view(-1)
            outputs = outputs[:, :-1].contiguous().view(-1, outputs.size(-1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * texts.size(0)

        train_loss = train_loss / len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for texts in val_loader:
                texts = texts.to(device)
                outputs = model(texts)
                # In your training loop
                targets = texts[:, 1:].contiguous().view(-1)
                outputs = outputs[:, :-1, :].contiguous().view(-1, outputs.size(-1))
                loss = criterion(outputs, targets)
                val_loss += loss.item() * texts.size(0)

        val_loss = val_loss / len(val_loader.dataset)

        print(
            f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}",
            end="\r"
        )

    return (train_loss, val_loss)

In [15]:
# Train the model
train_loss, val_loss = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100)

Epoch 100/100, Train Loss: 0.2713, Val Loss: 0.5675

In [16]:
model_name = f"model_{str(train_loss).replace('.', '_')[:5]}_{str(val_loss).replace('.', '_')[:5]}.pth"

In [17]:
try:
    torch.save(model.state_dict(), config.MODELS_DIR / model_name)
except Exception as e:
    print(e)

# Embeddings results

In [19]:
model_name = f"model_0_271_0_567.pth"

In [20]:
MODEL_PATH = config.MODELS_DIR / Path(model_name)

In [21]:
from sklearn.manifold import TSNE

In [22]:
df = pd.read_csv(
    config.PROCESSED_DATA_DIR / "articles.csv",
    nrows=None,
    dtype={
        "article_id": str,
    },
)

df["text"] = df.apply(
    lambda x: " ".join(
        [
            str(x["prod_name"]),
            str(x["product_type_name"]),
            str(x["product_group_name"]),
            str(x["graphical_appearance_name"]),
            str(x["colour_group_name"]),
            str(x["perceived_colour_value_name"]),
            str(x["index_name"]),
            str(x["section_name"]),
            str(x["detail_desc"])
        ]
    ),
    axis=1,
)

In [23]:
model = Cola(lr=1e-3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.load_state_dict(state_dict=torch.load(MODEL_PATH, map_location=device))
model.to(device)
model.eval()

  model.load_state_dict(state_dict=torch.load(MODEL_PATH, map_location=device))


Cola(
  (item_embeddings): Embedding(30522, 128)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (output_layer): Linear(in_features=128, out_features=30522, bias=True)
)

In [None]:
text_embeddings = []

for text in tqdm(df.text.values, total=len(df)):
    text_t = tokenize(text)
    text_t = torch.tensor(pad_list(text_t), dtype=torch.long, device=device).unsqueeze(0)

    with torch.no_grad():
        text_embed = model.encode_text(text_t)
        text_embed = text_embed.squeeze().cpu().tolist()

    text_embeddings.append(text_embed)

text_embeddings = np.array(text_embeddings)

 24%|██▍       | 242/1000 [00:30<02:59,  4.23it/s]

In [None]:
len(text_embeddings), len(text_embeddings[2]), len(text_embeddings[0][0])

(100, 90, 30522)

In [None]:
# Flatten embeddings: (100, 90, 30522) -> (100, 90*30522)
flattened_embeddings = text_embeddings.reshape(len(text_embeddings), -1)

# Apply TSNE
tsne = TSNE(
    n_components=3,
    init="random",
    random_state=0,
    learning_rate="auto",
    # n_iter=300,
)
Y = tsne.fit_transform(flattened_embeddings)

# Create visualization
fig = px.scatter_3d(
    x=Y[:, 0],
    y=Y[:, 1],
    z=Y[:, 2],
    color=df.index_name,
    hover_data=[df.prod_name, df.product_type_name],
    title="Text Embeddings Visualization (t-SNE)",
    labels={"color": "Index Category"},
    opacity=0.7,
)

# Improve layout
fig.update_layout(
    scene=dict(xaxis_title="t-SNE 1", yaxis_title="t-SNE 2", zaxis_title="t-SNE 3"),
    width=1000,
    height=800,
)

fig.show()