In [2]:
import config
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [3]:
train = pd.read_parquet(os.path.join(config.data_processed_dir , 'train.parquet'))
valid = pd.read_parquet(os.path.join(config.data_processed_dir , 'valid.parquet'))

In [7]:
# select only the columns we need
train = train[['customer_id', 'article_id']]
valid = valid[['customer_id', 'article_id']]
# concat train and valid
train = pd.concat([train, valid], axis=0)

In [8]:
# groupby customer_id and aggregate article_id into a list, then split the lists into groups of 2 consecutive articles
train = train.groupby('customer_id')['article_id'].agg(list).apply(lambda x: [x[i:i+2] for i in range(len(x)-1)]).explode().reset_index()
train = train[train['article_id'].notna()]
# explode article_id into 2 columns article_1 and article_2
train = train.join(pd.DataFrame(train.pop('article_id').tolist(), columns=['article_1', 'article_2']))
# drop na values again
train = train[train['article_1'].notna()]
train = train[train['article_2'].notna()]
# drop rows where article_1 and article_2 are the same
train = train[train['article_1'] != train['article_2']]
train['label'] = 1

In [15]:
train

Unnamed: 0,customer_id,article_1,article_2,label
2,1,599580055.0,811835004.0,1
6,1,811835004.0,723529001.0,1
7,1,723529001.0,559630026.0,1
8,1,559630026.0,599580083.0,1
9,1,599580083.0,811927004.0,1
...,...,...,...,...
11253154,1368904,884081001.0,794819001.0,1
11253155,1368904,794819001.0,762846027.0,1
11253156,1368904,866755002.0,840360003.0,1
11253157,1368904,840360003.0,866755002.0,1


In [16]:
# now we need to create negative samples, for each article_1 we need to randomly select N article_2 that is not the same as article_1
# N is the number of negative samples we want
N = 300

# create a copy of train
train_negative = train.copy()

# create a list of all article_id
article_ids = train['article_1'].unique()

# for each article_id, randomly select N article_id that is not the same as article_id
negative_samples = []
for article_id in article_ids:
    do_not_select = train[train['article_1'] == article_id]['article_2'].unique()
    # randomly select N article_id that is not in do_not_select
    negative_samples.extend(np.random.choice(np.setdiff1d(article_ids, do_not_select), N, replace=False))
# create a dataframe from the negative samples
train_negative = pd.DataFrame(negative_samples, columns=['article_2'])
train_negative['article_1'] = article_ids.repeat(N)
train_negative['label'] = 0

train_negative


Unnamed: 0,article_2,article_1,label
0,869005001.0,599580055.0,0
1,866111001.0,599580055.0,0
2,856667005.0,599580055.0,0
3,739461002.0,599580055.0,0
4,679011009.0,599580055.0,0
...,...,...,...
14996995,662369058.0,795358001.0,0
14996996,841808001.0,795358001.0,0
14996997,687635018.0,795358001.0,0
14996998,805000002.0,795358001.0,0


In [20]:
# merge train and train_negative    
train = pd.concat([train, train_negative], ignore_index=True)
train.drop_duplicates(inplace=True)

In [24]:
# split 30% of train as valid
valid = train.sample(frac=0.3, random_state=42)
train = train.drop(valid.index)

In [25]:
train.to_parquet(os.path.join(config.data_processed_dir , 'train_pairs.parquet'), index=False)
valid.to_parquet(os.path.join(config.data_processed_dir , 'valid_pairs.parquet'), index=False) 

In [3]:
train = pd.read_parquet(os.path.join(config.data_processed_dir , 'train_pairs.parquet'))
valid = pd.read_parquet(os.path.join(config.data_processed_dir , 'valid_pairs.parquet'))

In [4]:
articles = pd.read_parquet(os.path.join(config.data_raw_dir , 'articles.parquet'))
articles = articles[['article_id', 'prod_name', 'detail_desc']]

# join prod_name and prod_desc
articles['text'] = articles['prod_name'] + ' ' + articles['detail_desc']
articles['text'] = articles['text'].str.lower()
articles = articles[['article_id', 'text']]

In [5]:
def merge_articles(df, articles, article_column, text_column):
    df = df.merge(articles, left_on=article_column, right_on='article_id', how='left')
    df = df.rename(columns={'text': text_column})
    return df.drop(columns=['article_id'])

# Use the helper function to merge train and valid with article_1 and article_2
train = merge_articles(train, articles, 'article_1', 'text_1')
train = merge_articles(train, articles, 'article_2', 'text_2')

valid = merge_articles(valid, articles, 'article_1', 'text_1')
valid = merge_articles(valid, articles, 'article_2', 'text_2')


In [6]:
train.drop(columns=['customer_id'], inplace=True)
valid.drop(columns=['customer_id'], inplace=True)
train.dropna(inplace=True)
valid.dropna(inplace=True)

In [7]:
print(valid.label.mean())
print(train.label.mean())


0.3590698311474374
0.3590972014059759


In [8]:
df = pd.concat([train, valid], ignore_index=True)
item_ids = set(df['article_1'].unique()).union(set(df['article_2'].unique()))
# Create a dictionary that maps each item ID to a unique index
vocab = {item_id: i for i, item_id in enumerate(item_ids)}
num_items = len(set(df['article_1'].unique()).union(set(df['article_2'].unique())))
del item_ids, df

In [9]:
from transformers import BertTokenizer



  from .autonotebook import tqdm as notebook_tqdm


In [54]:
import torch
from torch import nn, optim


class PairDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        # Create a dictionary that maps each item ID to a unique index
        self.vocab = vocab
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', padding=True)
        self.max_seq_length = 8
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # get article_id1 and article_id2 and labels
        article_id1 = row['article_1']
        article_id2 = row['article_2']
        label = row['label']
        # convert to torch tensors
        article_id1 = torch.tensor(self.vocab[article_id1], dtype=torch.long)
        article_id2 = torch.tensor(self.vocab[article_id2], dtype=torch.long)
        label = torch.tensor(label, dtype=torch.float)

        text_1 = self.tokenizer(
            row['text_1'], 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_seq_length, 
            return_tensors="pt"
        )
        
        text_2 = self.tokenizer(
            row['text_2'], 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_seq_length, 
            return_tensors="pt"
        )
        # text_1 = torch.tensor(text_1['input_ids'][0], dtype=torch.long)
        # text_2 = torch.tensor(text_2['input_ids'][0], dtype=torch.long)
        text_1 = text_1['input_ids'][0]
        text_2 = text_2['input_ids'][0]

        return article_id1, article_id2, text_1, text_2, label


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
dataset = PairDataset(train, vocab)
data_loader = DataLoader(dataset, batch_size=4000, shuffle=True)

valid_dataset = PairDataset(valid, vocab)
valid_data_loader = DataLoader(valid_dataset, batch_size=4000, shuffle=False)


class Item2Vec(nn.Module):
    def __init__(self, num_items, embedding_dim, input_tokens=30522):
        super().__init__()
        # embedding layer
        self.embeddings = nn.Embedding(num_items, embedding_dim)
        
        self.text_embeddings = nn.Embedding(input_tokens, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, 64, num_layers=1, batch_first=True)

        # dense layer
        self.linear = nn.Linear(embedding_dim + 64, 512)
        # activation function
        self.act = nn.ReLU()
        # dropout
        self.dropout = nn.Dropout(0.2)
        # output layer
        self.output = nn.Linear(512, embedding_dim)
        # output activation
        self.output_act = nn.Sigmoid()

    def forward(self, item1, text):
        embed = self.embeddings(item1)
        # LSTM on text
        text = self.text_embeddings(text)

        lstm_out, _ = self.lstm(text)

        # Take the last hidden state
        text = lstm_out[:, -1, :]

        # Concatenate item embeddings and text embeddings
        combined = torch.cat((embed, text), dim=1)

        embed1 = self.dropout(combined)
        # pass through dense layer
        dense1 = self.linear(embed1)
        # pass through activation function
        act1 = self.act(dense1)
        # pass through dropout
        # pass through output layer
        output = self.output(act1)
        # pass through output activation
        # output = self.output_act(output)

        return output


def loss_function(output, target):
    return nn.BCEWithLogitsLoss()(output, target)


def train_model(model, data_loader, optimizer, num_epochs):
    print(device)
    model = model.to(device)

    for epoch in range(num_epochs):
        # switch model to training mode
        model.train()
        with tqdm(total=len(valid_data_loader)) as progress_bar:
            for i, (item1, item2, text1, text2, target) in enumerate(valid_data_loader):
                optimizer.zero_grad()
                
                output1 = model(item1.to(device), text1.to(device)) #torch.Size([4000, 128])
                output2 = model(item2.to(device), text2.to(device)) #torch.Size([4000, 128])
                
                print(output1.shape) #torch.Size([4000, 128])
                print(output2.shape) #torch.Size([4000, 128])
                dot_product = torch.sum(output1 * output2, dim=1)

                # Apply sigmoid to convert the dot product to a similarity score
                similarity_score = torch.sigmoid(dot_product)
                # do the dot product between output1 and output2 to get the similarity score between the two items

                
                loss = loss_function(similarity_score, target.float().to(device))

                loss.backward()
                optimizer.step()
                
                progress_bar.set_postfix(loss=loss.item())
                progress_bar.update(1)
                break
            break
                

        # compute total loss and accuracy
        total_loss = 0
        total_accuracy = 0

        # switch model to evaluation mode
        model.eval()

        with torch.no_grad():
            with tqdm(total=len(valid_data_loader)) as progress_bar:
                
                for i, (item1, item2, text1, text2, target) in enumerate(valid_data_loader): 
                    
                    output1 = model(item1.to(device), text1.to(device))
                    output2 = model(item2.to(device), text2.to(device))


                    output1 = output1.view(output1.shape[0], 1, output1.shape[1])  # shape: (batch_size, 1, embedding_dim)
                    output2 = output2.view(output2.shape[0], output2.shape[1], 1)  # shape: (batch_size, embedding_dim, 1)
                    # Compute the dot product
                    output = torch.bmm(output1, output2)  # shape: (batch_size, 1, 1)

                    # Reshape the output to be two-dimensional
                    output = output.view(output.shape[0], -1)  # shape: (batch_size, 1)
                    target = target.view(target.shape[0], -1)  # shape: (batch_size, 1)

                    loss = loss_function(output, target.float().to(device))

                    # compute accuracy
                    output = output.detach().cpu().numpy()
                    target = target.detach().cpu().numpy()
                    accuracy = ((output > 0) == target).mean()
                    total_loss += loss.item()
                    total_accuracy += accuracy
                    progress_bar.update(1)

        print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch + 1, total_loss / len(valid_data_loader), total_accuracy / len(valid_data_loader)))


model = Item2Vec(num_items=num_items, embedding_dim=128, input_tokens=len(dataset.tokenizer))
train_model(model, data_loader, optimizer=torch.optim.Adam(model.parameters(), lr=0.001), num_epochs=10)

Using device: cuda
cuda


  0%|          | 0/1749 [00:02<?, ?it/s]

torch.Size([4000, 128])
torch.Size([4000, 128])





ValueError: Target size (torch.Size([4000])) must be the same as input size (torch.Size([4000, 1, 1]))

In [13]:
len(dataset.tokenizer)

30522