<a href="https://colab.research.google.com/github/krooner/til/blob/main/replicate_LMRec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; a later cell copies kaggle.json from /gdrive/MyDrive.
drive.mount('/gdrive')

In [None]:
# !pip list | grep kaggle
# !pip list | grep transformers
!pip install datasets -q

## Data Preparation
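Download the `carrie1/ecommerce-data` dataset from Kaggle, using the `kaggle.json` API token stored on Google Drive.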

In [None]:
!mkdir /root/.config/kaggle
!cp '/gdrive/MyDrive/datastore/kaggle.json' /root/.config/kaggle/kaggle.json

In [None]:
!kaggle datasets download carrie1/ecommerce-data && unzip ecommerce-data.zip

In [None]:
import pandas as pd

In [None]:
# Read the extracted transactions file, decoding it as Latin-1, and preview it.
dataframe = pd.read_csv("/content/data.csv", encoding='latin-1')
display(dataframe.shape, dataframe.head())

In [None]:
# Keep only rows that carry a customer id, then store the ids as integers.
has_customer = dataframe.CustomerID.notna()
dataframe = dataframe.loc[has_customer].astype({'CustomerID': int})

display(dataframe.shape, dataframe.head())

In [None]:
# dataframe.CustomerID.unique().shape # 4372
# Order purchases chronologically BEFORE grouping. InvoiceDate comes out of
# read_csv as text, so it is parsed to timestamps first (string order is not
# time order), and the groupby runs on the *sorted* frame so that each
# customer's descriptions are joined in purchase order. The stable sort keeps
# the file order for rows sharing a timestamp.
purchases_sorted = (
    dataframe
    .assign(InvoiceDate=pd.to_datetime(dataframe.InvoiceDate))
    .sort_values('InvoiceDate', kind='stable')
)

# One row per customer: "->"-joined item descriptions plus the number of rows.
user_df = (
    purchases_sorted
    .groupby('CustomerID')
    .agg({'Description': lambda x: "->".join(x), 'Quantity': len})
    .reset_index()
)

# Customers with a single purchase cannot be split into (history, target).
# The last item becomes the target (`user_item`); everything before it is the
# history (`user_sequence`). `.assign` builds a new frame instead of writing
# into a `.loc` slice.
user_df = user_df.loc[user_df.Quantity > 1].assign(
    user_sequence=lambda df: df.Description.str.rsplit("->", n=1).str[0],
    user_item=lambda df: df.Description.str.rsplit("->", n=1).str[-1],
)
user_df.head()
# user_df.CustomerID.unique().shape # 4293

In [None]:
from datasets import Dataset

# `user_df` was row-filtered, so its index is no longer a plain range; without
# preserve_index=False that index would be stored as an extra
# `__index_level_0__` column in the dataset.
dataset = Dataset.from_pandas(user_df, preserve_index=False)

In [None]:
# from transformers import AutoTokenizer
from transformers import GPT2TokenizerFast
# Hugging Face checkpoint id; also used to load the language model further down.
model_tag = 'skt/kogpt2-base-v2'

# tokenizer = AutoTokenizer.from_pretrained(model_tag)
# Load the checkpoint's vocabulary with the fast GPT-2 tokenizer class.
tokenizer = GPT2TokenizerFast.from_pretrained(model_tag)

In [None]:
# Probe how the tokenizer handles the literal '<s></s>' string: ids, tokens, round trip.
encoded = tokenizer.encode('<s></s>')
decoded = tokenizer.decode(encoded)
print(encoded, tokenizer.convert_ids_to_tokens(encoded), decoded, sep="\n")

In [None]:
# Same probe for the '<|endoftext|>' marker: ids, tokens, round trip.
encoded = tokenizer.encode('<|endoftext|><|endoftext|>')
decoded = tokenizer.decode(encoded)
print(encoded, tokenizer.convert_ids_to_tokens(encoded), decoded, sep="\n")

In [None]:
from tokenizers.processors import TemplateProcessing

# Register the special tokens on the tokenizer. The ids are hard-coded here;
# the `convert_ids_to_tokens(range(10))` cell further down lists the first
# vocabulary entries so they can be checked.
tokenizer.bos_token = '<s>'
tokenizer.bos_token_id = 0

tokenizer.eos_token = '</s>'
tokenizer.eos_token_id = 1

tokenizer.pad_token = '<pad>'
tokenizer.pad_token_id = 3

# Replace the backend tokenizer's post-processor so that every single-sequence
# encoding is wrapped as "<s> ... </s>".
# NOTE(review): `_tokenizer` is a private attribute of the fast tokenizer.
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single="<s> $0 </s>",
    special_tokens=[("<s>", tokenizer.bos_token_id), ("</s>", tokenizer.eos_token_id)]
)

In [None]:
def tokenize_data(inputs):
  """Tokenize the `Description` field of a batch for `Dataset.map(batched=True)`.

  Sequences are truncated to at most 32 tokens and padded to the longest
  sequence of the batch. Uses the notebook-level `tokenizer`.

  NOTE(review): `Description` holds the full purchase string, including the
  last item that is also split out as `user_item` — confirm that tokenizing
  it (rather than `user_sequence`) is intended.
  """
  return tokenizer(inputs['Description'], padding=True, truncation=True, max_length=32)

# Tokenize the whole dataset; batched=True hands `tokenize_data` lists of rows at a time.
dataset_mapped = dataset.map(tokenize_data, batched=True)

In [None]:
# Show the dataset summary after tokenization.
dataset_mapped

In [None]:
# List the tokens behind vocabulary ids 0-9 (the special-token ids set above fall in this range).
tokenizer.convert_ids_to_tokens(list(range(10)))

In [None]:
# sample_input_ids = dataset_mapped[0]['input_ids']
# Tokenize two short texts as a padded batch and locate the EOS token in each row.
sample_texts = ["Hello World", "We are the world"]
tokenize_text = tokenizer(sample_texts, padding=True, truncation=True, max_length=16, return_tensors='pt')

eos_mask = tokenize_text['input_ids'] == tokenizer.eos_token_id
# nonzero() yields (row, column) pairs; the last column is the token position.
eos_token_pos = eos_mask.nonzero()
display(tokenize_text['input_ids'], eos_mask, eos_token_pos[:, -1])

In [None]:
# Confirm the BOS/EOS tokens and ids currently registered on the tokenizer.
display(
    tokenizer.bos_token, tokenizer.bos_token_id,
    tokenizer.eos_token, tokenizer.eos_token_id,
)

## Modeling

In [None]:
import transformers
from transformers import AutoModelForCausalLM
# from transformers import AutoTokenizer

# Load the plain pretrained causal LM and print its module tree for inspection.
# NOTE(review): the name `model` is rebound to an `LMRecModel` instance in a later cell.
model = AutoModelForCausalLM.from_pretrained(model_tag)
# tokenizer = AutoTokenizer.from_pretrained(model_tag)

display(model)
# display(tokenizer)

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np

from tokenizers.processors import TemplateProcessing

class LMRecModel(nn.Module):
  """Recommender that scores user-history text against item text with one causal LM.

  Both texts are encoded by the same pretrained language model. The final-layer
  hidden state at a sequence's EOS token is taken as its embedding and passed
  through a side-specific linear layer (`user_linear` / `item_linear`).
  Training adds the LM's next-token loss on the user text to a weighted binary
  recommendation loss on the user/item dot product.

  Args:
    model_name: Hugging Face checkpoint used for the LM and its tokenizer.
    coef: weight of the recommendation loss relative to the LM loss.
  """

  def __init__(self, model_name="skt/kogpt2-base-v2", coef=1e-3):
    super().__init__()

    self.language_model = AutoModelForCausalLM.from_pretrained(model_name, output_hidden_states=True)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.n_dim = self.language_model.config.n_embd

    self.coef = coef

    self.user_linear = nn.Linear(self.n_dim, self.n_dim)
    self.item_linear = nn.Linear(self.n_dim, self.n_dim)
    self.sigmoid = nn.Sigmoid()

    # BCEWithLogitsLoss applies the sigmoid itself, so it must be fed raw scores.
    self.loss_fct = nn.BCEWithLogitsLoss()

    # Special tokens (ids are hard-coded for the default checkpoint's vocabulary).
    self.tokenizer.bos_token = '<s>'
    self.tokenizer.bos_token_id = 0

    self.tokenizer.eos_token = '</s>'
    self.tokenizer.eos_token_id = 1

    self.tokenizer.pad_token = '<pad>'
    self.tokenizer.pad_token_id = 3

    # Wrap every single-sequence encoding as "<s> ... </s>" so an EOS position exists to pool at.
    self.tokenizer._tokenizer.post_processor = TemplateProcessing(
        single="<s> $0 </s>",
        special_tokens=[("<s>", self.tokenizer.bos_token_id), ("</s>", self.tokenizer.eos_token_id)]
    )

  def _last_eos_index(self, input_ids):
    """Return, for each row of `input_ids`, the position of its last EOS token.

    Raises:
      ValueError: if any row contains no EOS token.
    """
    is_eos = input_ids == self.tokenizer.eos_token_id
    if not bool(is_eos.any(dim=1).all()):
      raise ValueError("every sequence must contain the EOS token to be pooled")
    positions = torch.arange(input_ids.size(1), device=input_ids.device)
    # Non-EOS positions become -1, so the row-wise max is the last EOS position.
    return torch.where(is_eos, positions, torch.full_like(positions, -1)).max(dim=1).values

  def _pool_at_eos(self, hidden_states, input_ids):
    """Select the hidden state at each row's EOS position: (batch, seq, dim) -> (batch, dim)."""
    rows = torch.arange(input_ids.size(0), device=input_ids.device)
    return hidden_states[rows, self._last_eos_index(input_ids)]

  def _encode_frozen(self, input_ids, attention_mask):
    """Run the LM in eval mode without gradients and return the EOS-pooled embedding.

    The LM's previous train/eval mode is restored afterwards, and the result
    stays on the LM's device so it can be fed to the linear layers directly.
    """
    was_training = self.language_model.training
    self.language_model.eval()
    try:
      with torch.no_grad():
        outputs = self.language_model(input_ids=input_ids, attention_mask=attention_mask)
    finally:
      self.language_model.train(was_training)
    return self._pool_at_eos(outputs.hidden_states[-1], input_ids)

  def forward(self, user_input_ids, user_attention_mask, item_input_ids, item_attention_mask, labels=None):
    """Compute the training loss for a batch of (user text, item text, label) triples.

    Args:
      user_input_ids, user_attention_mask: tokenized user histories.
      item_input_ids, item_attention_mask: tokenized candidate items.
      labels: one 0/1 target per pair; required.

    Returns:
      Scalar tensor: causal-LM loss on the user text + `coef` * recommendation loss.

    Raises:
      ValueError: if `labels` is None, or a sequence has no EOS token.
    """
    if labels is None:
      raise ValueError("labels are required to compute the recommendation loss")

    # Padding positions are excluded from the LM loss (-100 is the ignored label).
    lm_labels = user_input_ids.clone()
    if user_attention_mask is not None:
      lm_labels[user_attention_mask == 0] = -100

    user_outputs = self.language_model(
        input_ids=user_input_ids,
        attention_mask=user_attention_mask,
        labels=lm_labels
    )
    causal_lm_loss = user_outputs.loss
    user_embedding = self._pool_at_eos(user_outputs.hidden_states[-1], user_input_ids)

    # No gradient flows into the LM through the item side.
    with torch.no_grad():
      item_outputs = self.language_model(
          input_ids=item_input_ids,
          attention_mask=item_attention_mask,
      )
    item_embedding = self._pool_at_eos(item_outputs.hidden_states[-1], item_input_ids)

    user_hidden = self.user_linear(user_embedding)
    item_hidden = self.item_linear(item_embedding)

    # Raw dot-product scores; the loss applies the sigmoid (no second sigmoid here).
    user_item_logits = (user_hidden * item_hidden).sum(dim=1)
    rec_loss = self.loss_fct(user_item_logits, labels.to(user_item_logits))

    total_loss = causal_lm_loss + self.coef * rec_loss

    return total_loss

  def encode_user(self, user_input_ids, user_attention_mask):
    """Embed user texts: frozen LM encoding pooled at EOS, then `user_linear`. Returns (batch, n_dim)."""
    user_embedding = self._encode_frozen(user_input_ids, user_attention_mask)
    return self.user_linear(user_embedding)

  def encode_item(self, item_input_ids, item_attention_mask):
    """Embed item texts: frozen LM encoding pooled at EOS, then `item_linear`. Returns (batch, n_dim)."""
    item_embedding = self._encode_frozen(item_input_ids, item_attention_mask)
    return self.item_linear(item_embedding)

  def get_user_item_probability(self, user_hidden, item_hidden):
    """Return sigmoid(user_hidden @ item_hidden.T).

    For `user_hidden` of shape (U, D) and `item_hidden` of shape (I, D) the
    result has shape (U, I): one probability per user/item pair.
    """
    return self.sigmoid(torch.matmul(user_hidden, item_hidden.t()))

In [None]:
# Build the recommender with its default checkpoint; this rebinds the notebook-level `model`.
model = LMRecModel()

In [None]:
# Two toy (user text, item text) pairs for a smoke test of the model.
user_seq = ["안녕하세요. 고객센터입니다.", "Hello World"]
item_seq = ["세탁기 소음 문제 해결 방법 안내", "How are you"]

user_inputs = tokenizer(user_seq, padding=True, truncation=True, max_length=32, return_tensors='pt')
item_inputs = tokenizer(item_seq, padding=True, truncation=True, max_length=32, return_tensors='pt')

# The original bare name `displays` raised NameError; show both encodings instead.
display(user_inputs, item_inputs)

lm_labels = user_inputs["input_ids"].clone()
lm_labels

In [None]:
# Overwrite padding positions with -100 in place, then show the result.
lm_labels.masked_fill_(lm_labels == tokenizer.pad_token_id, -100)
display(lm_labels)

# One binary recommendation target per (user, item) pair.
labels = torch.tensor([1, 0])
display(labels)

In [None]:
# Run one forward pass; `forward` returns a single loss tensor
# (causal-LM loss + coef * recommendation loss).
outputs = model(
    user_input_ids=user_inputs['input_ids'],
    user_attention_mask=user_inputs['attention_mask'],
    item_input_ids=item_inputs['input_ids'],
    item_attention_mask=item_inputs['attention_mask'],
    labels=labels,
  )
outputs

In [None]:
# Encode the sample user and item texts with the model's inference helpers.
user_embedding = model.encode_user(
    user_input_ids=user_inputs['input_ids'],
    user_attention_mask=user_inputs['attention_mask'],
  )

item_embedding = model.encode_item(
    item_input_ids=item_inputs['input_ids'],
    item_attention_mask=item_inputs['attention_mask'],
  )
# Compare the shapes of the two embedding batches.
user_embedding.shape, item_embedding.shape

In [None]:
# Pairwise scores sigmoid(user · item): one row per user, one column per item.
user_item_prob = model.get_user_item_probability(user_embedding, item_embedding)
user_item_prob