## Install Required Libraries

In [1]:
!pip install transformers
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mLooking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

## import libraries

In [2]:
import pandas as pd 
import scipy as sp
from torch import nn
from transformers import BertModel
import torch
import numpy as np
from transformers import BertTokenizer
from torchmetrics import AUROC

In [3]:
from bert_user_utlis import *

## Load Dataset

In [4]:
input_path = '../data/'
df = pd.read_csv(input_path+'clean_df.csv')
df.fillna(' ', inplace=True)
# df.head(1)

In [5]:
# for each user, keep the most recent 50 records
sort_df = df.sort_values(['event_time'], ascending=True).groupby('user_no')
df = sort_df.head(50).reset_index()
df.shape

(16181, 18)

## Define the input info

In [6]:
df['input_info'] = df['clean_title'] + ' ' + df['clean_abstract']
df['input_info'].head(2)

0    News highlights  Top global markets news of th...
1    News highlights  Top global markets news of th...
Name: input_info, dtype: object

## Generate pseudo user embeddings

In [7]:
user_number = df['user_no'].nunique()
user_emb_size = 256
emb = RandomEmbedding(user_number,user_emb_size,avg_embedding_norm=1)

In [8]:
user_ids = torch.tensor(list(range(user_number)), dtype=torch.int64)
user_ids

tensor([   0,    1,    2,  ..., 4472, 4473, 4474])

In [9]:
user_embeddings = emb(user_ids)
user_embeddings.shape

torch.Size([4475, 256])

In [10]:
user_embeddings[:10]

tensor([[-0.1422,  0.0060,  0.0554,  ..., -0.0351, -0.1055,  0.0338],
        [ 0.0338,  0.1422,  0.0060,  ..., -0.0810,  0.0351,  0.1055],
        [ 0.1055, -0.0338,  0.1422,  ...,  0.0515,  0.0810, -0.0351],
        ...,
        [-0.0971,  0.0272, -0.0515,  ...,  0.0137, -0.0733, -0.0968],
        [-0.0968,  0.0971,  0.0272,  ...,  0.0286, -0.0137,  0.0733],
        [ 0.0733,  0.0968,  0.0971,  ...,  0.0506, -0.0286,  0.0137]])

In [11]:
# split dataset as training, validation and testing dataset
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),[int(.8*len(df)), int(.9*len(df))])
print(len(df_train),len(df_val), len(df_test))

12944 1618 1619


In [12]:
# find the average text length
length = int(df['input_info'].apply(len).mean())
print('length', length)

length 219


## Import tokenizer and BERT 

In [13]:
# initialize tokenizer and bert model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case = True)
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
model = BertClassifier(bert_model, user_emb_size=user_emb_size, bert_freeze=False)

In [15]:
auroc = AUROC(num_classes=2)



In [16]:
# construct pytorch Dataset
train_dataset, val_dataset, test_dataset = Dataset_with_user(df_train, length, tokenizer), Dataset_with_user(df_val, length, tokenizer), Dataset_with_user(df_test, length, tokenizer)

In [17]:
# hyperparameters for training 
EPOCHS = 10
LR = 1e-5
BATCH_SIZE = 16

In [18]:
# train the text-classifier model
train_acc, train_auc, val_acc, val_auc = train_model(model, train_dataset, val_dataset, auroc, user_embeddings, LR, EPOCHS, BATCH_SIZE)

len(train_data_loader) 809
device ~~~~~~~  cuda


100%|██████████| 809/809 [00:55<00:00, 14.53it/s]


Epochs: 1 | Train Loss:  0.041                 | Train Accuracy:  0.692                 | Train AUC:  0.501                | Val Loss:  0.040                 | Val Accuracy:  0.689                | Val AUC:  0.498


100%|██████████| 809/809 [00:56<00:00, 14.39it/s]


Epochs: 2 | Train Loss:  0.039                 | Train Accuracy:  0.701                 | Train AUC:  0.505                | Val Loss:  0.040                 | Val Accuracy:  0.689                | Val AUC:  0.501


In [19]:
# evaluate the text-classifier model
auroc = AUROC(num_classes=2)

evaluate_model(model, test_dataset, auroc, user_embeddings, BATCH_SIZE)

device ~~~~~~~  cuda
Test Accuracy:  0.711             | Test AUC:  0.495
