## Install Required Libraries

In [None]:
!pip install transformers
!pip install torchmetrics

## Import Libraries

In [None]:
import pandas as pd 
import scipy as sp
from torch import nn
import torch
import numpy as np
from transformers import BertTokenizer
from torchmetrics import AUROC

In [None]:
from models import TextClassifierModel_User, Dataset_with_user, train_model_user, evaluate_model_user
from data_utils import *

## Load Dataset

In [None]:
# Directory holding the raw export, and the name of the file to load from it.
input_path, data_name = '../data/', 'task_output_5313.txt'

# clean_data comes from data_utils; its result is the working DataFrame for
# every later cell.
df = clean_data(input_path, data_name)

In [None]:
# For each user, keep the 50 most recent records.
# Rows are ordered oldest -> newest by event_time, so the most recent records
# are the *last* 50 rows of each user's group. (Taking head(50) on this
# ascending order would keep the 50 oldest records instead.)
# Users with fewer than 50 records keep all of them.
# NOTE(review): assumes event_time sorts chronologically (e.g. timestamps or
# ISO-formatted strings) - confirm against clean_data.
sort_df = df.sort_values(['event_time'], ascending=True).groupby('user_no')
# reset_index() renumbers the rows 0..n-1 and keeps the previous index as an
# extra 'index' column.
df = sort_df.tail(50).reset_index()
df.shape

In [None]:
# Optional: uncomment to run the rest of the notebook on only the first 100 rows.
# df = df[:100]

## Create Pseudo user data

In [None]:
# Width of each pseudo user vector; the same value is passed to the model as
# user_emb_size further down.
user_emb_dim = 256

# One embedding slot per id in 0..max(user_no).
# NOTE(review): this presumes user_no is a non-negative integer id - confirm.
user_num = 1 + df['user_no'].max()

user_embeddings = generate_pseudo_user_embeddings(user_num, user_emb_dim)
print('user_embeddings size: ', user_embeddings.shape)

## Define the input info

In [None]:
# Model input text: cleaned title and cleaned abstract joined by one space.
df['input_info'] = df['clean_title'].add(' ').add(df['clean_abstract'])

# Show the first two combined strings as a quick sanity check.
df['input_info'].iloc[:2]

In [None]:
# Split the dataset into training (80%), validation (10%) and test (10%) sets.
# The global NumPy seed is kept for any later code that draws from NumPy's
# global RNG; the shuffle below is made reproducible by random_state=42.
np.random.seed(112)

# Shuffle all rows once, then cut the shuffled frame at the 80% and 90% marks.
# Plain iloc slices are used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes and emits a FutureWarning on
# recent pandas versions. The resulting splits are the same.
shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(df)), int(.9 * len(df))
df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]
print(len(df_train), len(df_val), len(df_test))

In [None]:
# Average length of the combined input text, truncated to an int.
# The length is measured in characters (len of each string), not in tokens.
# NOTE(review): this value is later passed to Dataset_with_user together with
# the tokenizer; if it is used there as a maximum token length, confirm it
# stays within the BERT checkpoint's position limit.
char_counts = df['input_info'].map(len)
length = int(char_counts.mean())
print('length', length)

## Import tokenizer and BERT 

In [None]:
# Initialize the tokenizer (the BERT model itself is not created in this cell).
# 'bert-base-multilingual-cased' is a case-sensitive checkpoint, so the input
# must not be lower-cased: do_lower_case=True would also strip accents and
# produce text that no longer matches the cased vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [None]:
# Wrap each split in a Dataset_with_user (defined in models), passing the
# same length value and tokenizer to all three.
train_dataset = Dataset_with_user(df_train, length, tokenizer)
val_dataset = Dataset_with_user(df_val, length, tokenizer)
test_dataset = Dataset_with_user(df_test, length, tokenizer)

In [None]:
# Build the classifier; the user-embedding size matches the pseudo user
# embeddings generated above.
# NOTE(review): bert_freeze=False presumably leaves the BERT weights
# trainable - confirm in models.py.
model = TextClassifierModel_User(
    user_emb_size=user_emb_dim,
    bert_freeze=False,
)

In [None]:
# AUROC metric handed to the training loop.
# Current torchmetrics releases require the `task` argument; calling
# AUROC(num_classes=2) without it raises a TypeError, and the install cell
# does not pin an older version.
# NOTE(review): "multiclass" with num_classes=2 assumes train_model_user feeds
# per-class scores of shape (N, 2) - confirm in models.py; use task="binary"
# if it passes a single positive-class score per sample.
auroc = AUROC(task="multiclass", num_classes=2)

In [None]:
# Training hyperparameters: number of epochs, learning rate, batch size.
EPOCHS, LR, BATCH_SIZE = 10, 1e-5, 16

In [None]:
# Fit the text-classifier on the training split; the validation split and the
# AUROC metric are handed to the training routine as well.
train_model_user(
    model,
    train_dataset,
    val_dataset,
    auroc,
    user_embeddings,
    LR,
    EPOCHS,
    BATCH_SIZE,
)

In [None]:
# Evaluate the trained text-classifier on the held-out test split.
# A new metric object is created here rather than reusing the one passed to
# training. Current torchmetrics releases require the `task` argument;
# AUROC(num_classes=2) without it raises a TypeError.
# NOTE(review): "multiclass" with num_classes=2 assumes evaluate_model_user
# feeds per-class scores of shape (N, 2) - confirm in models.py.
auroc = AUROC(task="multiclass", num_classes=2)

evaluate_model_user(model, test_dataset, auroc, user_embeddings, BATCH_SIZE)