In [2]:
#import required libraries
#a script from https://github.com/Bjarten/early-stopping-pytorch#:~:text=Early%20stopping%20is%20a%20form,a%20row%20the%20training%20stops.
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
from sklearn.model_selection import train_test_split
from early_stopping_script import EarlyStopping
from torch.utils.data import DataLoader
import itertools

In [3]:
# Load the raw drug-review splits; both files are tab-separated, hence sep='\t'.
train = pd.read_csv('/kaggle/input/drug-reviews/drugLibTrain_raw.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/drug-reviews/drugLibTest_raw.tsv', sep='\t')

In [4]:
# Remove every row that has a missing value in any column.
# NOTE: this mutates both frames in place, so the raw data is gone after this cell.
train.dropna(inplace=True)
test.dropna(inplace=True)

In [5]:
# Join the three free-text review columns into one '.'-separated string per row.
# Only `train` gets this column; `test` is not modified here.
train['combinedReview'] = train['benefitsReview'] + '.' + train['sideEffectsReview'] + '.' + train['commentsReview']

In [6]:
# Restrict the data to the ten conditions with the most reviews
# (value_counts() already sorts from most to least frequent).
top_10_conditions = train['condition'].value_counts().head(10).index
top_10 = train[train['condition'].isin(top_10_conditions)]

In [7]:
# Whitespace-separated word count of each combined review, plotted to inspect
# the length distribution before choosing a token limit.
lengths_combinedReview = top_10['combinedReview'].str.split().str.len()
sns.boxplot(data = lengths_combinedReview.reset_index(drop=True))

In [8]:
# Keep only the label and text columns; .copy() gives an independent frame so the
# column assignments in later cells do not write into a slice of `top_10`.
top_10_shorter = top_10[['condition', 'combinedReview']].copy()
top_10_shorter.head()

In [9]:
def clean_text(x):
    """Strip web links from a piece of text.

    A link is anything that starts with ``http://``, ``https://`` or ``www.``
    (case-insensitive) and runs until the next whitespace character or comma.
    Everything else in the string is left untouched.

    Args:
        x: The text to clean.

    Returns:
        ``x`` with every link replaced by the empty string.
    """
    # The scheme and the "www." prefix are alternatives rather than
    # "optional http:// followed by mandatory www.", so https links and
    # links without "www." are removed in full instead of being skipped
    # or leaving a dangling "https://" behind.
    y = re.sub(r'(?:https?://|www\.)[^\s,]+', '', x, flags=re.IGNORECASE)
    return y

In [10]:
# Strip links from every combined review.
top_10_shorter['combinedReview'] = top_10_shorter['combinedReview'].apply(clean_text)

In [11]:
# Integer class label for each condition: ngroup() assigns every row the number
# of its 'condition' group (0 .. number_of_groups - 1).
top_10_shorter['condition_code'] = top_10_shorter.groupby('condition').ngroup()

In [12]:
# Condition names ordered by their integer code, one entry per condition, so the
# i-th element (by position) is the name for code i.
classes = top_10_shorter[['condition', 'condition_code']].drop_duplicates().sort_values('condition_code')['condition']

In [13]:
# Display the condition names in code order.
classes

In [14]:
# Stratified train/test split: 20% of the rows are held out for testing, with the
# class proportions of 'condition_code' preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(top_10_shorter['combinedReview'], top_10_shorter['condition_code'], 
                                                    test_size=0.2, random_state=42, shuffle=True, stratify=top_10_shorter['condition_code'])
# Stratified train/validation split: 15% of the remaining training rows
# (about 12% of all rows) become the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
                                                    test_size=0.15, random_state=42, shuffle=True, stratify=y_train)

In [15]:
# Report how many reviews ended up in each split.
print(f'No. train observations: {len(X_train)}')
print(f'No. validation observations: {len(X_val)}')
print(f'No. test observations: {len(X_test)}')

In [16]:
# Class distribution of the test labels (check that stratification worked).
y_test.value_counts()

In [17]:
# Class distribution of the training labels, for comparison with the test split.
y_train.value_counts()

In [18]:
class ReviewsDataset(torch.utils.data.Dataset):
    """Map-style dataset backed by a DataFrame of labelled reviews.

    The frame must provide the columns ``condition_code`` and
    ``combinedReview``; rows are addressed by position.
    """

    def __init__(self, dataframe):
        """Store the frame; no copy is made."""
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows (= samples) in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the ``(condition_code, combinedReview)`` pair of row ``index``."""
        record = self.dataframe.iloc[index]
        label, review = record['condition_code'], record['combinedReview']
        return (label, review)

In [19]:
# Put each split's text and label back side by side in one frame, then replace
# the original row labels with a fresh 0..n-1 index so ReviewsDataset can address
# rows by position.
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
val_df = pd.concat([X_val, y_val], axis=1).reset_index(drop=True)

In [20]:
# Wrap each split in a Dataset yielding (condition_code, combinedReview) pairs.
trainset = ReviewsDataset(train_df)
testset = ReviewsDataset(test_df)
valset = ReviewsDataset(val_df)

In [23]:
# Fetch the 'bert-base-uncased' tokenizer through torch.hub from the
# huggingface/pytorch-transformers hub entry; it is used by collate_batch.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')

In [24]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [25]:
def collate_batch(batch):
    """Turn a list of ``(label, review)`` pairs into model-ready tensors.

    Every review is encoded on its own with the notebook-level ``tokenizer``
    and truncated or padded to exactly 250 token ids (special tokens
    included); the per-review tensors are then concatenated along dim 0.

    Returns:
        A tuple ``(labels, input_ids, attention_masks)`` whose tensors have
        already been moved to the notebook-level ``device``.
    """
    labels = []
    encodings = []
    for label, review in batch:
        labels.append(label)
        encodings.append(
            tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length=250,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
        )
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    token_tensor = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_tensor = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return tuple(t.to(device) for t in (label_tensor, token_tensor, attention_tensor))

In [26]:
# Only the training loader is shuffled. Validation and test batches are served in
# a fixed order so that the per-batch loss average used for early stopping and
# the evaluation pass are repeatable from run to run.
trainloader = DataLoader(trainset, batch_size=32, collate_fn=collate_batch, shuffle=True)
valloader = DataLoader(valset, batch_size=25, collate_fn=collate_batch, shuffle=False)
testloader = DataLoader(testset, batch_size=32, collate_fn=collate_batch, shuffle=False)

In [27]:
# Pull one batch to sanity-check what collate_batch produces.
# The built-in next() is used because DataLoader iterators in recent PyTorch
# releases do not provide a .next() method.
iterator = iter(trainloader)
labels, tokens, attention = next(iterator)

In [28]:
# Show the first sample of the batch: its label, token ids and attention mask.
print(labels[0])
print(tokens[0])
print(attention[0])

In [29]:
# Load 'bert-base-uncased' with a sequence-classification head through torch.hub;
# num_labels=10 matches the ten conditions kept in `top_10_conditions`.
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', num_labels=10)

In [30]:
# Move the model to the selected device (the GPU if CUDA is available, else the CPU).
model.to(device)

In [31]:
# Adam over all model parameters with a small fine-tuning learning rate (2e-5);
# eps is the numerical-stability term added to Adam's denominator.
optimizer = torch.optim.Adam(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

In [32]:
# Learning-rate scheduler: in 'min' mode the rate is lowered once the monitored
# value (the validation loss passed to scheduler.step in run_training) has stopped
# decreasing for more than `patience` epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)

In [33]:
def to_device(data, device):
    """Recursively transfer a tensor, or a list/tuple of tensors, to ``device``.

    Lists and tuples are returned as lists with the same nesting; any other
    object is moved with a non-blocking ``.to`` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

In [34]:
class DeviceDataLoader():
    """Thin wrapper that serves the batches of ``dl`` already placed on ``device``."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        """Return how many batches the wrapped loader provides."""
        return len(self.dl)

    def __iter__(self):
        """Iterate over the wrapped loader, moving each batch to the device first."""
        for batch in self.dl:
            yield to_device(batch, self.device)

In [35]:
# Wrap each loader so every batch is moved to `device` as it is yielded.
# NOTE: the loader names are rebound to their wrappers, so re-running this cell
# wraps the wrappers again; re-create the DataLoaders first when re-executing.
# NOTE: collate_batch already returns tensors on `device`, so this is a second move.
trainloader = DeviceDataLoader(trainloader, device)
valloader = DeviceDataLoader(valloader, device)
testloader = DeviceDataLoader(testloader, device)

In [36]:
def run_training(num_epochs):
    """Fine-tune the notebook-level ``model`` with early stopping on validation loss.

    Relies on the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``trainloader`` and ``valloader``.

    Args:
        num_epochs: Maximum number of epochs to run.

    Returns:
        ``(model, train_loss_list, val_loss_list)``. The two lists hold the
        mean per-batch loss of every epoch that was run; ``model`` has had
        the weights stored in ``checkpoint.pt`` loaded back into it
        (presumably the best checkpoint written by ``EarlyStopping`` --
        confirm against the early-stopping script).
    """
    train_loss_list = []
    val_loss_list = []
    early_stopping = EarlyStopping(patience=3, verbose=True)

    for k in range(num_epochs):
        #training mode
        model.train()
        total_training_loss = 0
        num_train_batches = 0
        for label, token, attention in trainloader:
            optimizer.zero_grad()
            output = model(input_ids=token,
                           attention_mask=attention,
                           labels=label)
            loss = output['loss']
            total_training_loss += loss.item()
            loss.backward()
            # Clip BEFORE the optimizer step: clipping after the step would
            # only rescale gradients that have already been applied and are
            # about to be zeroed, i.e. it would have no effect at all.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            num_train_batches += 1
        avg_train_loss = total_training_loss / num_train_batches
        train_loss_list.append(avg_train_loss)

        #eval mode
        model.eval()
        total_validation_loss = 0
        num_val_batches = 0
        with torch.no_grad():
            for label, token, attention in valloader:
                output = model(input_ids=token,
                               attention_mask=attention,
                               labels=label)
                total_validation_loss += output['loss'].item()
                num_val_batches += 1
        avg_val_loss = total_validation_loss / num_val_batches
        val_loss_list.append(avg_val_loss)

        # Report before the early-stopping check so the epoch that triggers
        # the stop is logged as well.
        print(f'epoch: {k+1}, train loss: {avg_train_loss}, val loss: {avg_val_loss}')

        early_stopping(avg_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        scheduler.step(avg_val_loss)

    model.load_state_dict(torch.load('checkpoint.pt'))

    return model, train_loss_list, val_loss_list

In [37]:
# Train for at most 15 epochs; early stopping inside run_training may end the run sooner.
model, train_loss_list, val_loss_list = run_training(15)

In [38]:
# Convert the class names to a plain list so they can be indexed by integer code.
# list(...) is used instead of Series.to_list() so the cell still works when it is
# re-run after `classes` has already become a list.
classes = list(classes)

# Long-format confusion table: one row per (actual, predicted) class pair,
# with the observation count initialised to 0.
cf_matrix = pd.DataFrame(list(itertools.product(classes, classes)), columns=['Actual', 'Predicted'])
cf_matrix['Num_Obs'] = 0

In [39]:
#get confusion matrix
#https://stackoverflow.com/questions/53290306/confusion-matrix-and-test-accuracy-for-pytorch-transfer-learning-tutorial

# Evaluate on the test set: accumulate the accuracy counters and increment the
# matching (Actual, Predicted) row of the long-format confusion table per sample.
model.eval()
softmax = torch.nn.Softmax(dim=1)
num_correct, total = 0, 0

with torch.no_grad():
    for label, token, attention in testloader:
        output = model(input_ids=token,
                       attention_mask=attention,
                       labels=label)
        probability = softmax(output['logits'])
        # The predicted class is the index of the highest probability in each row.
        _, predictions = torch.max(probability, dim=1)
        num_correct += (predictions == label).sum().item()
        total += label.size(0)
        for actual_code, predicted_code in zip(label.tolist(), predictions.tolist()):
            pair_rows = (
                (cf_matrix['Actual'] == classes[actual_code])
                & (cf_matrix['Predicted'] == classes[predicted_code])
            )
            cf_matrix.loc[pair_rows, 'Num_Obs'] += 1

print(f'Accuracy is {(num_correct/total)*100}%')

In [40]:
# Reshape the long table into an Actual x Predicted grid for plotting.
# Keyword arguments are required: DataFrame.pivot rejects positional arguments
# in pandas 2.x.
cf_matrix = cf_matrix.pivot(index="Actual", columns="Predicted", values="Num_Obs")

# fmt='d' prints the integer counts as-is instead of the default short float
# format, which would show larger counts in scientific notation.
sns.heatmap(data=cf_matrix, annot=True, fmt='d')

The model differentiates well between most of the classes. However, it has more difficulty distinguishing anxiety from depression.

In [41]:
#obtain training and validation loss curve
# Build one long-format frame (value, type, epoch_num) so seaborn can draw both
# curves and colour them by `type`; epochs are numbered from 1.
trainloss_df = pd.DataFrame(train_loss_list, columns=['value']).assign(
    type='training_loss',
    epoch_num=range(1, len(train_loss_list) + 1),
)
valloss_df = pd.DataFrame(val_loss_list, columns=['value']).assign(
    type='valid_loss',
    epoch_num=range(1, len(val_loss_list) + 1),
)
combined_df = pd.concat([trainloss_df, valloss_df], ignore_index=True)

In [42]:
# Training vs. validation loss per epoch, one line per loss type.
sns.lineplot(data=combined_df,x='epoch_num', y='value', hue='type')