In [None]:
# Install Hugging Face transformers into this runtime.
# NOTE(review): no version is pinned, so the installed release may differ between runs.
!pip install transformers

In [None]:
# Print the GPU status of this runtime (the training arguments below set fp16=True).
!nvidia-smi

## Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers.trainer_callback import ProgressCallback

import torch
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import datetime

from google.colab import drive
import os
import sys
from pathlib import Path
import requests
from zipfile import ZipFile

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): none of the visible cells reads from or writes to this path - confirm the mount is still needed.
drive.mount('/content/drive')

## Download dataset

In [None]:
class EnglishDatasetLoader:
    """Downloads and loads the English WDC product-matching datasets.

    Training sets are fetched once as a zip archive and cached under the local
    ``trainsets/`` folder; the gold-standard test set is read straight from the
    remote URL on every call.
    """
    MAIN_DIR_PATH = 'http://data.dws.informatik.uni-mannheim.de/largescaleproductcorpus/data/v2'
    # Seconds to wait on the server (connect / between reads) before giving up.
    REQUEST_TIMEOUT = 60

    @staticmethod
    def load_train(type: str, size: str) -> pd.DataFrame:
        """Loads the training dataset from the WDC website (cached locally).

        Args:
            type (str): dataset type: computers, cameras, watches, shoes, all
            size (str): dataset size: small, medium, large, xlarge
        Returns:
            pd.DataFrame: training dataset; the previous index is kept as an
                ``index`` column because of ``reset_index()``.
        Raises:
            requests.HTTPError: if the archive cannot be downloaded.
        """
        p = Path(os.path.join('trainsets', f'{type}_train'))
        p.mkdir(parents=True, exist_ok=True)
        dataset_path = p / f'{type}_train_{size}.json.gz'
        if not dataset_path.exists():
            zip_path = f'{p}.zip'
            url = f'{EnglishDatasetLoader.MAIN_DIR_PATH}/trainsets/{type}_train.zip'
            r = requests.get(url, allow_redirects=True,
                             timeout=EnglishDatasetLoader.REQUEST_TIMEOUT)
            # Fail here on 4xx/5xx instead of writing an error page to disk and
            # crashing later with an unrelated BadZipFile.
            r.raise_for_status()
            with open(zip_path, 'wb') as zip_file:
                zip_file.write(r.content)
            with ZipFile(zip_path, 'r') as archive:
                archive.extractall(path=p)

        df = pd.read_json(dataset_path, compression='gzip', lines=True)
        return df.reset_index()

    @staticmethod
    def load_test(type: str) -> pd.DataFrame:
        """Loads the test (gold standard) dataset from the remote repository.

        Args:
            type (str): dataset type: computers, cameras, watches, shoes, all
        Returns:
            pd.DataFrame: test dataset; the previous index is kept as an
                ``index`` column because of ``reset_index()``.
        """
        path = f'{EnglishDatasetLoader.MAIN_DIR_PATH}/goldstandards/{type}_gs.json.gz'
        df = pd.read_json(path, compression='gzip', lines=True)
        return df.reset_index()


class FeatureBuilder:
    """Builds model input texts and labels from a pair-wise offer DataFrame.

    For every configured column ``c`` the frame is expected to contain the
    columns ``c_left`` and ``c_right``; labels are read from ``label``.
    """

    def __init__(self, columns):
        # Attribute names (without the _left / _right suffix) to concatenate.
        self.columns = columns

    @staticmethod
    def _text(dataset, column):
        """Return ``dataset[column]`` as strings, with missing values as ''."""
        # Without this, a single NaN/None turns the whole concatenated row into
        # NaN and the tokenizer fails on a non-string input.
        return dataset[column].fillna('').astype(str)

    def get_X(self, dataset):
        """Concatenate the left and right attributes of every row into one text.

        The layout is
        ``[CLS] left_1 [SEP] ... left_n [SEP] right_1 [SEP] ... right_n [SEP]``.

        Args:
            dataset: DataFrame with ``<column>_left`` / ``<column>_right`` columns.
        Returns:
            list[str]: one text per row.
        """
        X = '[CLS] ' + self._text(dataset, f'{self.columns[0]}_left')
        for column in self.columns[1:]:
            X = X + ' [SEP] ' + self._text(dataset, f'{column}_left')
        for column in self.columns:
            X = X + ' [SEP] ' + self._text(dataset, f'{column}_right')
        # The original statement `X + ' [SEP]'` discarded its result, so the
        # closing separator was never appended.
        X = X + ' [SEP]'
        return X.to_list()

    def get_y(self, dataset):
        """Return the ``label`` column as a plain list."""
        return dataset['label'].to_list()


class TorchPreprocessedDataset(torch.utils.data.Dataset):
    """Map-style dataset that converts every example to tensors up front.

    Args:
        encodings: mapping from feature name (e.g. ``input_ids``) to a sequence
            indexable by example position (lists or tensors).
        labels: sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # All items are materialised once so __getitem__ is a plain lookup.
        self.items = self.preprocessItems(encodings, labels)

    def __getitem__(self, idx):
        return self.items[idx]

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(tensor) works but emits a copy-construct warning, which
        # happens when the encodings were produced with return_tensors='pt'.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def preprocessItems(self, encodings, labels):
        """Build one ``{feature: tensor, ..., 'labels': tensor}`` dict per example.

        Uses the ``encodings`` and ``labels`` passed in (the original read
        ``self.encodings`` / ``self.labels`` and silently ignored its arguments).

        Returns:
            list[dict]: items in example order, ``len(labels)`` entries.
        """
        items = []
        for idx in range(len(labels)):
            item = {key: self._to_tensor(val[idx]) for key, val in encodings.items()}
            item['labels'] = self._to_tensor(labels[idx])
            items.append(item)
        return items


## Model definition, Example of dataset 

In [None]:
# Experiment configuration shared by the cells below.
model_name = 'bert-base-multilingual-uncased'  # checkpoint name passed to the tokenizer and model loaders
dataset_type = 'cameras'  # product category; the loader docstring lists computers, cameras, watches, shoes, all
dataset_size = 'small'  # training-set size; the loader docstring lists small, medium, large, xlarge

In [None]:
# Load the training pairs (downloaded on first use, afterwards read from the local trainsets/ folder).
train_df = EnglishDatasetLoader.load_train(dataset_type, dataset_size)

In [None]:
# Fast tokenizer for the chosen checkpoint. The [SEP] / [CLS] strings are set explicitly,
# presumably because FeatureBuilder writes these literal markers into the input text.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, sep_token = '[SEP]', cls_token = '[CLS]')
# Build inputs from the `title` attribute only (title_left / title_right columns).
title_fb = FeatureBuilder(['title'])

In [None]:
# Build the train / validation datasets from the title pairs.
# The frame is loaded inside this cell so the cell can be re-run on its own
# after the `del` at the bottom has removed `train_df`.
train_df = EnglishDatasetLoader.load_train(dataset_type, dataset_size)
X_train = title_fb.get_X(train_df)
y_train = title_fb.get_y(train_df)

# Hold out 20% for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Both splits are tokenized the same way (plain Python lists, no return_tensors);
# TorchPreprocessedDataset converts each example to tensors itself.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)

train_dataset = TorchPreprocessedDataset(train_encodings, y_train)
val_dataset = TorchPreprocessedDataset(val_encodings, y_val)
# Free the intermediates; only the two datasets are needed for training.
del train_df, X_train, X_val, y_train, y_val

In [None]:
# Build the test dataset from the gold standard (read from the remote URL).
test_df = EnglishDatasetLoader.load_test(dataset_type)
X_test = title_fb.get_X(test_df)
y_test = title_fb.get_y(test_df)
test_encodings = tokenizer(X_test, truncation=True, padding=True)
test_dataset = TorchPreprocessedDataset(test_encodings, y_test)
# NOTE(review): test_dataset is not used by any visible cell, and y_test is kept while X_test is deleted - confirm intent.
del test_df, X_test

In [None]:
def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).
    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    # Result tuple is (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics
# One log directory per run, named after the current timestamp.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# NOTE(review): recent transformers releases rename `evaluation_strategy` to `eval_strategy`;
# confirm the argument name against the installed version (the install cell does not pin one).
training_args = TrainingArguments(
    output_dir='./results',          # directory handed to the Trainer for its outputs
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir=logdir,               # directory for storing logs
    logging_steps=10,
    disable_tqdm=False,
    fp16=True,
    evaluation_strategy='epoch',
    save_strategy='no',
)
# Sequence classifier with two output labels, loaded from the configured checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                                  num_labels=2,
                                                                  output_attentions=False,
                                                                  output_hidden_states=False)

# Train on train_dataset, evaluate on val_dataset with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


## Train Model

In [None]:
# Fine-tune the classifier with the arguments configured above (evaluation_strategy='epoch').
trainer.train()

## Get Embeddings of each offer from the BERT

In [None]:
# get embeddings
import torch as th
def getPooledOutputs(model, encoded_dataset, batch_size = 32):
  """Return one embedding per example: the last-layer hidden state of the first token.

  Note that this is the raw hidden state at sequence position 0 (the [CLS]
  position), not the output of a pooler head.

  Args:
    model: model whose forward accepts ``input_ids``, ``attention_mask`` and
      ``output_hidden_states`` and returns a mapping with ``'hidden_states'``.
    encoded_dataset: mapping with ``'input_ids'`` and ``'attention_mask'``,
      each a sequence of equally long rows (lists or tensors).
    batch_size: number of examples per forward pass; must be positive.
  Returns:
    Tensor of shape (number of examples, hidden size) on the model's device.
  Raises:
    ValueError: if ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  model.eval()
  # Run wherever the model lives instead of assuming a CUDA device.
  device = next(model.parameters()).device

  all_input_ids = encoded_dataset['input_ids']
  all_attention_mask = encoded_dataset['attention_mask']
  n_items = len(all_input_ids)
  # Ceiling division: the original `n // batch_size + 1` ran an extra, empty
  # batch whenever n was an exact multiple of batch_size.
  n_batches = (n_items + batch_size - 1) // batch_size
  print("total number of iters ", n_batches)

  chunks = []
  with torch.no_grad():
    for i, start in enumerate(range(0, n_items, batch_size)):
      print(i)
      stop = start + batch_size  # slicing clamps at the end of the data
      input_ids = torch.as_tensor(all_input_ids[start:stop], dtype=torch.long, device=device)
      attention_mask = torch.as_tensor(all_attention_mask[start:stop], dtype=torch.long, device=device)
      outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
      # Last layer, first token of every sequence in the batch.
      chunks.append(outputs['hidden_states'][-1][:, 0])

  if not chunks:
    # Empty dataset: keep the (0, hidden) shape callers would expect.
    return torch.empty((0, model.config.hidden_size), device=device)
  # Single concatenation instead of growing the result tensor every iteration.
  return torch.cat(chunks, 0)

In [None]:
# Rebuild the training texts (an earlier cell deleted them) so that single samples can be inspected below.
train_df = EnglishDatasetLoader.load_train(dataset_type, dataset_size)
X_train = title_fb.get_X(train_df)
y_train = title_fb.get_y(train_df)

# Same test_size / random_state as the split used for training, so the same split is reproduced.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Only X_train is kept.
del X_val, y_train, y_val, train_df

In [None]:
def get_embedding_one_input(model,tokenizer, sentence):
  """Split the last-layer token embeddings of one offer pair into left and right parts.

  ``sentence`` is expected to contain a literal ``[SEP]`` between the two
  offers. The split is made at the first separator *token* in the encoded
  sequence; the final position (the separator the tokenizer appends itself)
  is never used as the split point. Locating the separator by token id keeps
  the split correct when the tokenizer breaks a whitespace-delimited word
  into several words (e.g. around punctuation), which a whitespace word count
  cannot account for. [CLS] / [SEP] tokens are excluded from both outputs.

  Args:
    model: model whose forward accepts ``input_ids`` and
      ``output_hidden_states`` and returns an object with ``hidden_states``.
    tokenizer: callable returning an encoding with ``input_ids``; must expose
      ``sep_token_id`` and ``cls_token_id``.
    sentence: single input text.
  Returns:
    (embedding1, embedding2): tensors of shape (number of tokens, hidden size)
    for the tokens before and after the separator, on the model's device.
  Raises:
    IndexError: if no separator token is found inside the sequence.
  """
  model.eval()
  # Run wherever the model lives instead of assuming a CUDA device.
  device = next(model.parameters()).device
  tok = tokenizer(sentence, return_tensors='pt')
  input_ids = tok.input_ids.to(device)

  with torch.no_grad():
    outputs = model(input_ids=input_ids, output_hidden_states=True)

  # Last layer of the single sequence in the batch: (sequence length, hidden size).
  emb = outputs.hidden_states[-1][0]

  token_ids = input_ids[0].tolist()
  sep_id = tokenizer.sep_token_id
  special_ids = {sep_id, tokenizer.cls_token_id}

  # Position 0 is the [CLS] added by the tokenizer, the last position the [SEP]
  # it appends; neither can be the boundary between the two offers.
  first, last = 1, len(token_ids) - 1
  sep_pos = next((i for i in range(first, last) if token_ids[i] == sep_id), None)
  if sep_pos is None:
    raise IndexError("no [SEP] token found between the two offers")

  left = [i for i in range(first, sep_pos) if token_ids[i] not in special_ids]
  right = [i for i in range(sep_pos + 1, last) if token_ids[i] not in special_ids]

  def _rows(positions):
    # index_select also yields a well-formed (0, hidden) tensor for an empty side.
    index = torch.tensor(positions, dtype=torch.long, device=emb.device)
    return emb.index_select(0, index)

  return _rows(left), _rows(right)

## Calculate cosine similarity between offer embeddings

In [None]:
def calculate_emb_cosine_metric(e1, e2):
  """Cosine similarity between the mean vectors of two embedding tensors.

  Each input is averaged over its first dimension and laid out as a single
  row; the two rows are then compared.

  Returns:
    Tensor with one element holding the cosine similarity.
  """
  left_row, right_row = (t.mean(dim=0).reshape(1, -1) for t in (e1, e2))
  return torch.cosine_similarity(left_row, right_row, dim=1)

## Example for an input sample

In [None]:
# Pick one training text and split its token embeddings into the two offers.
# NOTE(review): index 1000 assumes the training split holds more than 1000 texts.
sentence = X_train[1000]
e1, e2 = get_embedding_one_input(model,tokenizer, sentence)

In [None]:
# Cosine similarity between the mean embedding of each offer.
calculate_emb_cosine_metric(e1, e2)

In [None]:
# Display the first tensor returned by get_embedding_one_input (full tensor is printed).
e1

In [None]:
# Display the second tensor returned by get_embedding_one_input (full tensor is printed).
e2