In [16]:
import pandas as pd
import sys
import numpy as np
sys.path.append('../')
from recsys import config

In [17]:
np.random.seed(42)

In [12]:
# Загрузка данных
transactions = pd.read_csv(config.RAW_DATA_DIR / 'transactions.csv')

# Преобразование столбца t_dat в формат временной метки
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Сортировка по значению t_dat
transactions = transactions.sort_values(by='t_dat')

# Группировка по customer_id и создание списка покупок
grouped_transactions = transactions.groupby('customer_id')['article_id'].apply(list).reset_index()

# Переименование столбца
grouped_transactions.columns = ['customer_id', 'articles']

# Сохранение отфильтрованного датасета
grouped_transactions.to_csv('grouped_transactions.csv', index=False)

In [14]:
# Сортировка по customer_id и создание списка покупок
grouped_transactions = grouped_transactions.groupby('customer_id')['articles'].apply(list).reset_index()

# Переименование столбца
grouped_transactions.columns = ['customer_id', 'articles']

# Длина последовательности, минимальное количество истории и размер шага окна
sequence_length = 4
min_history = 1
step_size = 2

# Функция для создания последовательностей с использованием скользящего окна
def create_sequences(values, window_size, step_size, min_history):
    sequences = []
    start_index = 0
    while len(values[start_index:]) > min_history:
        seq = values[start_index : start_index + window_size]
        sequences.append(seq)
        start_index += step_size
    return sequences

# Применение функции для создания последовательностей
grouped_transactions['articles'] = grouped_transactions['articles'].apply(
    lambda ids: create_sequences(ids[0], sequence_length, step_size, min_history)
)

# Разделение под-последовательностей
grouped_transactions_transformed = grouped_transactions.explode('articles', ignore_index=True)

# Переименование столбца
grouped_transactions_transformed.rename(
    columns={'articles': 'sequence_articles'},
    inplace=True,
)

# Сохранение преобразованных данных
grouped_transactions_transformed.to_csv(config.PROCESSED_DATA_DIR / "grouped_transactions_transformed.csv", index=False)

In [19]:
# Случайный выбор для разделения данных
random_selection = np.random.rand(len(grouped_transactions_transformed.index)) <= 0.85

# Разделение данных на тренировочные
df_train_data = grouped_transactions_transformed[random_selection]
train_data_raw = df_train_data[["customer_id", "sequence_articles"]].values

# Разделение данных на тестовые
df_test_data = grouped_transactions_transformed[~random_selection]
test_data_raw = df_test_data[["customer_id", "sequence_articles"]].values

# Сохранение тренировочных и тестовых данных
np.save(config.PROCESSED_DATA_DIR / "train_data_raw.npy", train_data_raw)
np.save(config.PROCESSED_DATA_DIR / "test_data_raw.npy", test_data_raw)

In [44]:
import torch
import torch.nn as nn
import math
import time
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [21]:
df = pd.read_csv(config.PROCESSED_DATA_DIR / "grouped_transactions_transformed.csv")

# Преобразование данных в нужный формат
data = df[['customer_id', 'sequence_articles']].values.tolist()

In [50]:
# Создание словарей для преобразования идентификаторов в индексы
article_vocab = set()
customer_vocab = set()
for customer_id, articles in data:
    customer_vocab.add(customer_id)
    if isinstance(articles, str):
        articles = eval(articles)
    if isinstance(articles, list):
        for article_list in articles:
            if isinstance(article_list, list):
                for article in article_list:
                    article_vocab.add(article)
            else:
                article_vocab.add(article_list)

article_vocab_stoi = {article: idx for idx, article in enumerate(article_vocab, start=1)}
article_vocab_stoi['<unk>'] = 0
customer_vocab_stoi = {customer: idx for idx, customer in enumerate(customer_vocab, start=1)}

# Pytorch Dataset для взаимодействий пользователей
class TransactionSeqDataset(Dataset):
    def __init__(self, data, article_vocab_stoi, customer_vocab_stoi):
        self.data = data
        self.article_vocab_stoi = article_vocab_stoi
        self.customer_vocab_stoi = customer_vocab_stoi

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        customer, article_sequence = self.data[idx]
        if isinstance(article_sequence, str):
            article_sequence = eval(article_sequence)
        article_data = [self.article_vocab_stoi.get(item, 0) for sublist in article_sequence for item in sublist]
        customer_data = self.customer_vocab_stoi[customer]
        return torch.tensor(article_data), torch.tensor(customer_data)

# Функция для объединения батчей и добавления паддинга
def collate_batch(batch):
    article_list = [item[0] for item in batch]
    customer_list = [item[1] for item in batch]
    return pad_sequence(article_list, padding_value=article_vocab_stoi['<unk>'], batch_first=True), torch.stack(customer_list)

In [51]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model  # Добавляем атрибут d_model

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:, :x.size(1)] / math.sqrt(self.d_model)
        return self.dropout(x)


In [36]:
class Cola(nn.Module):
    def __init__(self, lr=0.001, use_pretrained=False, dropout=0.2, d_model=128, n_vocab=30522, smoothing=0.1):
        super().__init__()
        self.dropout = dropout
        self.lr = lr
        self.d_model = d_model
        self.n_vocab = n_vocab
        self.smoothing = smoothing

        self.item_embeddings = nn.Embedding(self.n_vocab, self.d_model)
        self.pos_encoder = PositionalEncoding(d_model=self.d_model, dropout=self.dropout)
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model, nhead=4, dropout=self.dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.output_layer = nn.Linear(self.d_model, self.n_vocab)

    def encode_text(self, x):
        x = self.item_embeddings(x)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = self.output_layer(x)
        return x

    def forward(self, x):
        x = self.item_embeddings(x)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = self.output_layer(x)
        return x

In [41]:
class TransformerModel(nn.Module):
    def __init__(self,  cola: Cola, ntoken: int, nuser: int, d_model: int, nhead: int, d_hid: int, nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.movie_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(2 * d_model, ntoken)
        self.init_weights()
        self.cola = cola

    def init_weights(self) -> None:
        initrange = 0.1
        self.movie_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: torch.Tensor, user: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        movie_embed = self.movie_embedding(src) * math.sqrt(self.d_model)
        user_embed = self.user_embedding(user) * math.sqrt(self.d_model)
        movie_embed = self.pos_encoder(movie_embed)
        output = self.transformer_encoder(movie_embed, src_mask)
        user_embed = user_embed.expand(-1, output.size(1), -1)
        output = torch.cat((output, user_embed), dim=-1)
        output = self.linear(output)
        return output

In [42]:
cola = Cola(lr=1e-4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cola.load_state_dict(state_dict=torch.load("../models/model_0_2705_0_1933.pth", map_location=device))

cola.to(device)

cola.eval()

  cola.load_state_dict(state_dict=torch.load("../models/model_0_2705_0_1933.pth", map_location=device))


Cola(
  (item_embeddings): Embedding(30522, 128)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (output_layer): Linear(in_features=128, out_features=30522, bias=True)
)

In [45]:
# Пример использования
BATCH_SIZE = 256
ntoken = 10000  # Примерное количество уникальных фильмов
nuser = 1000    # Примерное количество уникальных пользователей
d_model = 128
nhead = 4
d_hid = 512
nlayers = 4
dropout = 0.2

model = TransformerModel(cola, ntoken, nuser, d_model, nhead, d_hid, nlayers, dropout)

# Пример данных
src = torch.randint(0, ntoken, (BATCH_SIZE, 10))  # Пример последовательностей фильмов
user = torch.randint(0, nuser, (BATCH_SIZE, 1))   # Пример пользователей

output = model(src, user)
print(output.shape)  # Ожидаемый вывод: (BATCH_SIZE, 10, ntoken)

torch.Size([256, 10, 10000])




In [81]:
# Создание словарей для преобразования идентификаторов в индексы
article_vocab = set()
customer_vocab = set()
for customer_id, articles in data:
    customer_vocab.add(customer_id)
    if isinstance(articles, list):
        for article in articles:
            article_vocab.add(article)

article_vocab_stoi = {article: idx for idx, article in enumerate(article_vocab, start=1)}
article_vocab_stoi['<unk>'] = 0
customer_vocab_stoi = {customer: idx for idx, customer in enumerate(customer_vocab, start=1)}

# Pytorch Dataset для взаимодействий пользователей
class TransactionSeqDataset(Dataset):
    def __init__(self, data, article_vocab_stoi, customer_vocab_stoi):
        self.data = data
        self.article_vocab_stoi = article_vocab_stoi
        self.customer_vocab_stoi = customer_vocab_stoi

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        customer, article_sequence = self.data[idx]
        article_data = [self.article_vocab_stoi.get(item, 0) for item in article_sequence]
        customer_data = self.customer_vocab_stoi[customer]
        return torch.tensor(article_data), torch.tensor(customer_data)

# Функция для объединения батчей и добавления паддинга
def collate_batch(batch):
    article_list = [item[0] for item in batch]
    customer_list = [item[1] for item in batch]
    return pad_sequence(article_list, padding_value=article_vocab_stoi['<unk>'], batch_first=True), torch.stack(customer_list)


In [82]:
BATCH_SIZE = 256

# Создание экземпляров Dataset для каждого набора данных
train_dataset = TransactionSeqDataset(train_data_raw, article_vocab_stoi, customer_vocab_stoi)
val_dataset = TransactionSeqDataset(test_data_raw, article_vocab_stoi, customer_vocab_stoi)

# Создание DataLoader
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# Определение функции потерь и оптимизатора
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [85]:
def train(model: nn.Module, train_iter, epoch) -> None:
    # Переключение в режим обучения
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    for i, [movie_data, user_data] in enumerate(train_iter):
        # Загрузка данных
        movie_data, user_data = movie_data.to(device), user_data.to(device)
        user_data = user_data.reshape(-1, 1)

        # Разделение последовательности фильмов на входы и цели
        inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
        targets_flat = targets.reshape(-1)

        # Предсказание фильмов
        output = model(inputs, user_data)
        output_flat = output.reshape(-1, ntoken)
        
        # Обратное распространение ошибки
        loss = criterion(output_flat, targets_flat)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        total_loss += loss.item()
        # Результаты
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, val_iter) -> float:
    # Переключение в режим оценки
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for movie_data, user_data in val_iter:
            movie_data, user_data = movie_data.to(device), user_data.to(device)
            user_data = user_data.reshape(-1, 1)
            inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
            targets_flat = targets.reshape(-1)
            output = model(inputs, user_data)
            output_flat = output.reshape(-1, ntoken)
            loss = criterion(output_flat, targets_flat)
            total_loss += loss.item()
    return total_loss / len(val_iter)

In [86]:
EPOCHS = 10

for epoch in range(1, EPOCHS + 1):
    train(model, train_iter, epoch)
    val_loss = evaluate(model, val_iter)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | valid loss {val_loss:5.2f} | valid ppl {math.exp(val_loss):8.2f}')
    print('-' * 89)
    scheduler.step()

TypeError: unhashable type: 'list'