In [None]:
!pip install transformers
!pip install torchmetrics

In [None]:
import pandas as pd 
import torch
import numpy as np
from transformers import BertTokenizer
from torch import nn
from torchmetrics import AUROC

In [None]:
from data_utils import *
from models import TextClassifierModel_Sequential, Dataset_Sequential, train_model_sequential, evaluate_model_sequential

## Prepare Data

In [None]:
# Directory and file name handed to create_sequential_data; the result is the
# frame `df` that the following cells work on.
input_path, data_name = '../data/', 'task_output_5313.txt'
df = create_sequential_data(input_path, data_name)

In [None]:
# Display the column labels of df (the length computation below reads 'input_1').
df.columns

In [None]:
# Debugging aid: uncomment the next line to run on only the first 100 rows.
# df = df[:100]

In [None]:
# Sequence length handed to the datasets: the mean of len(input_1) over all
# rows, truncated to an integer and capped at MAX_SEQ_LENGTH.
MAX_SEQ_LENGTH = 200  # upper bound on the sequence length
mean_input_length = int(df['input_1'].apply(len).mean())
# Built-in min on two scalars yields a plain Python int rather than a NumPy scalar.
length = min(MAX_SEQ_LENGTH, mean_input_length)
print('length', length)

In [None]:
# Split the dataset into train, validation and test parts (80% / 10% / 10%).
np.random.seed(112)  # seeds NumPy's global RNG; the shuffle below uses its own random_state
EVAL_SIZE = 440  # maximum number of rows kept in each of the val and test sets

df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))

# Positional slicing instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which is deprecated in recent pandas releases.
df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end].iloc[:EVAL_SIZE, :]
df_test = df_shuffled.iloc[val_end:].iloc[:EVAL_SIZE, :]
print(len(df_train),len(df_val), len(df_test))

In [None]:
# The "cased" multilingual checkpoint has a case-sensitive vocabulary, so the
# tokenizer must not lower-case its input.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [None]:
# Wrap each split in a Dataset_Sequential, all built with the same length and tokenizer.
train_dataset, val_dataset, test_dataset = [
    Dataset_Sequential(split_frame, length, tokenizer)
    for split_frame in (df_train, df_val, df_test)
]

## Train and evaluate models

In [None]:
# Settings passed to train_model_sequential for both models below.
EPOCHS, LR = 10, 1e-5

## BERT+CNN Model

In [None]:
# Build the classifier with model_type='cnn' and bert_freeze=False
# (presumably leaves the BERT weights trainable - confirm in models.py).
model_cnn = TextClassifierModel_Sequential(model_type='cnn', bert_freeze=False)

In [None]:
# ROC-AUC metric configured for two classes; handed to train_model_sequential below.
auroc = AUROC(num_classes=2) # roc-auc

In [None]:
# Train the CNN model on train_dataset; val_dataset and the AUROC metric are
# supplied as well (presumably for validation during training - confirm in models.py).
train_model_sequential(model_cnn, train_dataset, val_dataset, auroc, LR, EPOCHS)

In [None]:
# New ROC-AUC metric instance for the test evaluation below (presumably so
# no state accumulated during training carries over - confirm).
auroc = AUROC(num_classes=2) #roc-auc

In [None]:
# Evaluate the CNN model on test_dataset with the AUROC metric.
evaluate_model_sequential(model_cnn, test_dataset, auroc)

## BERT+LSTM Model

In [None]:
# Build the classifier with model_type='lstm' and bert_freeze=False
# (presumably leaves the BERT weights trainable - confirm in models.py).
model_lstm = TextClassifierModel_Sequential(model_type='lstm', bert_freeze=False)

In [None]:
# ROC-AUC metric configured for two classes; handed to train_model_sequential below.
auroc = AUROC(num_classes=2)

In [None]:
# Train the LSTM model on train_dataset; val_dataset and the AUROC metric are
# supplied as well (presumably for validation during training - confirm in models.py).
train_model_sequential(model_lstm, train_dataset, val_dataset, auroc, LR, EPOCHS)

In [None]:
# New ROC-AUC metric instance for the test evaluation below (presumably so
# no state accumulated during training carries over - confirm).
auroc = AUROC(num_classes=2)

In [None]:
# Evaluate the LSTM model on test_dataset with the AUROC metric.
evaluate_model_sequential(model_lstm, test_dataset, auroc)