<a href="https://colab.research.google.com/github/jabanto22/NLP-Project/blob/main/project_classifier_vs_ver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch 
!pip install torchtext 
!pip install torchvision 
!pip install transformers
!pip install tweet-preprocessor

# Libraries
import pandas as pd
import preprocessor as p
import numpy as np
import random
import copy
import os
import json
import torch

# Models
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

# Training
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from sklearn.utils.class_weight import compute_class_weight

# Evaluation
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Global definitions
source_folder = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/project-data/'


def read_data(filename):
    """
    Read twitter datasets.
    """
    data = pd.DataFrame()
    with open(filename, 'r', encoding="utf8") as f:
        for line in f:
            line = json.loads(line)
            tweet_id = line[0]["id_str"]
            tweet = p.clean(line[0]["text"])
            comments = ""
            for row in line:
                # use tweet preprocessor to clean text
                comments += " " + p.clean(row["text"]) + "."
            data = data.append({"id":tweet_id,"text":tweet,"comments":comments}, ignore_index=True)
    f.close()

    return data

    
def read_label(filename):
    """
    Read class labels.
    """
    label = pd.DataFrame()

    with open(filename, 'r', encoding="utf8") as f:
        label = pd.DataFrame.from_dict(json.load(f), orient="index").reset_index()
        label.columns = ["id", "label"]
    f.close()

    return label

    
def merge_data_label(data, label):
    """
    Merge train data with class labels and class label codes for prediction.
    """
    data = pd.merge(data, label, on="id", how="outer")
    data.label = pd.Categorical(data.label)
    class_labels = dict(enumerate(data.label.cat.categories))
    data['label'] = data.label.cat.codes

    # write predicted labels to json file
    with open(source_folder + 'labels.json', 'w') as f:
        json.dump(class_labels, f, separators=(',', ':'))
    f.close()

    return data     


def extract_class_labels():
    # read class labels from json file
    label = pd.DataFrame()
    with open(source_folder + 'labels.json', 'r', encoding="utf8") as f:
        label = json.load(f)
    f.close()
    return label


def save_data_to_csv():
    """
    Read and extract datasets from files.
    """
    # read data (jsonl files)
    train_data = read_data(source_folder + 'train.data.jsonl')
    dev_data = read_data(source_folder + 'dev.data.jsonl')
    test_data = read_data(source_folder + 'test.data.jsonl')
    covid_data = read_data(source_folder + 'covid.data.jsonl')

    # read labels (json files)
    train_label = read_label(source_folder + 'train.label.json')
    dev_label = read_label(source_folder + 'dev.label.json')

    # merge data with class labels
    train_data = merge_data_label(train_data, train_label)
    dev_data = merge_data_label(dev_data, dev_label)

    # write filetered data to csv
    open(source_folder + 'train.csv','w', newline='').write(train_data.to_csv(index=False))
    open(source_folder + 'dev.csv','w', newline='').write(dev_data.to_csv(index=False))
    open(source_folder + 'test.csv','w', newline='').write(test_data.to_csv(index=False))
    open(source_folder + 'covid.csv','w', newline='').write(covid_data.to_csv(index=False))


def check_input_files(filename):
    """
    Check input files if they exist.
    """
    try:
        f = open(filename,'r')
        f.close()
    except:
        save_data_to_csv()


def read_csv_datasets():
    # check if input files exist
    check_input_files(source_folder + 'train.csv')
    check_input_files(source_folder + 'dev.csv')
    check_input_files(source_folder + 'test.csv')
    check_input_files(source_folder + 'covid.csv')

    # read datasets
    train_df = pd.read_csv(source_folder + 'train.csv')
    dev_df = pd.read_csv(source_folder + 'dev.csv')
    test_df = pd.read_csv(source_folder + 'test.csv')

    return train_df, dev_df, test_df


class TweetDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='bert-base-uncased'):

        self.data = data
        self.with_labels = with_labels 
        
        # Initialize BERT tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  
        self.maxlen = maxlen
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent1 = str(self.data.loc[index, 'text'])
        sent2 = str(self.data.loc[index, 'comments'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,  
                                      return_tensors='pt')  # Return torch.Tensor objects
        
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels (train and validation dataset)
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label  
        else:  # for test set that has no labels
            return token_ids, attn_masks, token_type_ids


class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="bert-base-uncased", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model)
        
        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification layer
        # input dimension is 768 because [CLS] embedding has a dimension of 768
        # output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        output = self.bert_layer(input_ids, attn_masks, token_type_ids)
        
        # the last layer hidden-state of the first token of the sequence (classification token) 
        # further processed by a Linear layer and a Tanh activation function.
        logits = self.dropout(output['pooler_output'])
        
        # Feeding to the classifier layer 
        logits = self.cls_layer(logits)

        return logits


def set_seed(seed):
    """ 
    Set all seeds to make results reproducible.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


def compute_class_weights(train_df):
    #compute the class weights
    class_weights = compute_class_weight(class_weight='balanced', 
                                        classes=np.unique(train_df.label.values), 
                                        y=train_df.label.values)

    # converting list of class weights to a tensor
    weights = torch.tensor(class_weights[1]/class_weights[0], dtype=torch.float)

    # push to GPU
    weights = weights.to(device)

    return weights


def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc


def evaluate(net, device, criterion, dataloader):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for (seq, attn_masks, token_type_ids, labels) in dataloader:
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels).item()
            count += 1

    return mean_loss / count, mean_acc / count


def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_acc = 0
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    
    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        total_loss = 0
        total_acc = 0
        iter = 0
        for (seq, attn_masks, token_type_ids, labels) in train_loader:
            iter += 1
            #Converting these to tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

                # Computing accuracy
                acc = get_accuracy_from_logits(logits, labels)
                total_acc += acc

            # Backpropagating the gradients
            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if iter % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()
            
            if iter % print_every == 0 and iter != 0:  # Print training loss information
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(iter, nb_iterations, ep+1, running_loss / print_every))
                
                total_loss += running_loss
                
                running_loss = 0.0

        train_losses.append(total_loss / len(train_loader))
        train_accuracies.append(total_acc / len(train_loader))
        
        val_loss, val_acc = evaluate(net, device, criterion, val_loader)  # Compute validation loss and accuracy
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)
        print("\nEpoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}\n".format(best_loss, val_loss))
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_acc = val_acc
            best_ep = ep + 1

    # Saving the model
    path_to_model = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/models/{}_lr_{}_val_loss_{}_acc_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), round(best_acc, 5), best_ep)
    best_model = '{}_lr_{}_val_loss_{}_acc_{}_ep_{}'.format(bert_model, lr, round(best_loss, 5), round(best_acc, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("Finished training!")
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

    # Plot performance of model on each epoch
    plot_accuracy(train_losses, val_losses, train_accuracies, val_accuracies, best_model)

    return path_to_model


def plot_accuracy(train_losses, val_losses, train_accuracies, val_accuracies, best_model):
    """
    Create a plot analysis of model loss and accuracy across training epochs.
    """
    acc = train_accuracies
    val_acc = val_accuracies
    loss = train_losses
    val_loss = val_losses

    epochs = range(1, len(acc) + 1)
    fig = plt.figure(figsize=(10, 8))
    fig.tight_layout()

    plt.subplot(2, 1, 1)
    plt.plot(epochs, loss, 'r', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(2, 1, 2)
    plt.plot(epochs, acc, 'r', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    path_to_fig = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/models/accuracy-' + best_model + '.png'
    fig.savefig(path_to_fig,dpi=300)
    fig.show()
    plt.close()


def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))

    return probs.detach().cpu().numpy()


def test_prediction(net, device, dataloader, with_labels=True, result_file=source_folder + "output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in dataloader:
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in dataloader:
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob)+'\n' for prob in probs_all)
    w.close()


def save_result(data, path_to_output_file=source_folder + "output.txt"):
    """
    Save predictions on the test data to json file.
    """
    probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # read prediction probabilities from file
    preds_test=(probs_test>=0.5).astype('uint8') # predicted labels using the fixed threshold of 0.5

    labels = extract_class_labels()
    pred_label = {}
    for i in range(len(preds_test)):
        code = str(preds_test[i])
        text_id = str(data.iloc[i]['id'])
        pred_label[text_id] = labels[code]
        
    # write predicted labels to json file
    with open(source_folder + 'test-output.json', 'w') as f:
        json.dump(pred_label, f, separators=(',', ':'))
    f.close()

    print("Predictions for test data are available in : {}".format(source_folder + 'test-output.json'))
    

if __name__ == "__main__":

    destination_path = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/models'
    try:
        os.makedirs(destination_path)
        print("Directory:", destination_path, "created.")
    except:
        print("Directory:", destination_path, "already exists.")

    # retrieve train, dev, and test datasets
    train_df, dev_df, test_df = read_csv_datasets()

    # training parameters
    bert_model = "bert-base-uncased"
    freeze_bert = False  # update encoder weights and classification layer weights
    maxlen = 64  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
    bs = 16  # batch size
    iters_to_accumulate = 2  # the gradient accumulation adds gradients over an effective batch of size : bs * iters_to_accumulate. If set to "1", you get the usual batch size
    lr = 3e-5  # learning rate
    epochs = 4  # number of training epochs

    #  Set all seeds to make reproducible results
    set_seed(1)

    # Read train and validation datasets
    print("Reading training data...")
    train_set = TweetDataset(train_df, maxlen, bert_model)
    print("Reading validation data...")
    val_set = TweetDataset(dev_df, maxlen, bert_model)
    
    # Create instances of training and validation dataloaders
    train_loader = DataLoader(train_set, batch_size=bs, num_workers=2)
    val_loader = DataLoader(val_set, batch_size=bs, num_workers=2)
    print("Done preprocessing training and development data.")

    # Use gpu if available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)
    net.to(device)

    # model parameters for fine-tuning
    weights = compute_class_weights(train_df)
    criterion = nn.BCEWithLogitsLoss(weight=weights)
    opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
    num_warmup_steps = 0 # The number of steps for the warmup phase.
    # num_training_steps = epochs * len(train_loader)  # The total number of training steps
    num_training_steps = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
    lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    # start training for downstream task
    path_to_model = train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

    # load the best model for classification task
    model = SentencePairClassifier(bert_model)
    print("\nLoading the weights of the model...")
    model.load_state_dict(torch.load(path_to_model))
    model.to(device)

    # use the trained model to predict class labels for the test set
    print("Reading test data...")
    test_set = TweetDataset(test_df, maxlen, False, bert_model)
    test_loader = DataLoader(test_set, batch_size=bs, num_workers=2)
    print("Done preprocessing test data.")

    print("Predicting on test data...")
    path_to_output_file = source_folder + 'test-output-probabilities.txt'
    test_prediction(net=model, device=device, dataloader=test_loader, with_labels=False,
                    result_file=path_to_output_file)
    print("\nTest classification probabilities are available in : {}".format(path_to_output_file))

    save_result(test_df, path_to_output_file)


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 18.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 51.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 53.6MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting tweet-preprocessor
  Downloading https://files.pythonh

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…


Reading validation data...
Done preprocessing training and development data.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Iteration 58/291 of epoch 1 complete. Loss : 0.5676862176122337 
Iteration 116/291 of epoch 1 complete. Loss : 0.4282150173495556 
Iteration 174/291 of epoch 1 complete. Loss : 0.38165400588306886 
Iteration 232/291 of epoch 1 complete. Loss : 0.3509611385906565 
Iteration 290/291 of epoch 1 complete. Loss : 0.3654688457990515 

Epoch 1 complete! Validation Loss : 0.6043922401763298
Best validation loss improved from inf to 0.6043922401763298

Iteration 58/291 of epoch 2 complete. Loss : 0.3304579137214299 
Iteration 116/291 of epoch 2 complete. Loss : 0.24677554607905192 
Iteration 174/291 of epoch 2 complete. Loss : 0.23875260301705065 
Iteration 232/291 of epoch 2 complete. Loss : 0.24365975887610994 
Iteration 290/291 of epoch 2 complete. Loss : 0.218277288857719 

Epoch 2 complete! Validation Loss : 0.5398962449383091
Best validation loss improved from 0.6043922401763298 to 0.5398962449383091

Iteration 58/291 of epoch 3 complete. Loss : 0.20075176559902472 
Iteration 116/291 of 

In [None]:
from google.colab import drive
drive.mount('/content/drive')