<a href="https://colab.research.google.com/github/jabanto22/NLP-Project/blob/main/project_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch 
!pip install torchtext 
!pip install torchvision 
!pip install transformers
!pip install tweet-preprocessor



In [2]:
# Libraries
import pandas as pd
import preprocessor as p
import numpy as np
import random
import copy
import os
import time, datetime
import json
import torch

# Models
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

# Training
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

# Evaluation
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# Global definitions
source_folder = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/project-data/'

In [3]:
def read_data(filename):
    """
    Read twitter datasets.
    """
    data = pd.DataFrame()
    with open(filename, 'r', encoding="utf8") as f:
        for line in f:
            line = json.loads(line)
            tweet_id = line[0]["id_str"]
            tweet = p.clean(line[0]["text"])
            comments = ""
            for row in line:
                # use tweet preprocessor to clean text
                comments += " " + p.clean(row["text"]) + "."
            data = data.append({"id":tweet_id,"text":tweet,"comments":comments}, ignore_index=True)
    f.close()

    return data

In [4]:
def read_label(filename):
    """
    Read class labels.
    """
    label = pd.DataFrame()

    with open(filename, 'r', encoding="utf8") as f:
        label = pd.DataFrame.from_dict(json.load(f), orient="index").reset_index()
        label.columns = ["id", "label"]
    f.close()

    return label

In [5]:
def merge_data_label(data, label, extract_label=False):
    """
    Merge train data with class labels and class label codes for prediction.
    """
    data = pd.merge(data, label, on="id", how="outer")
    data.label = pd.Categorical(data.label)
    class_labels = dict(enumerate(data.label.cat.categories))
    data['label'] = data.label.cat.codes

    # write predicted labels to json file
    with open(source_folder + 'labels.json', 'w') as f:
        json.dump(class_labels, f, separators=(',', ':'))
    f.close()

    return data     

In [6]:
def extract_class_labels():
    # read class labels from json file
    label = pd.DataFrame()
    with open(source_folder + 'labels.json', 'r', encoding="utf8") as f:
        label = json.load(f)
    f.close()
    return label

In [7]:
def save_data_to_csv():
    """
    Read and extract datasets from files.
    """
    # read data (jsonl files)
    train_data = read_data(source_folder + 'train.data.jsonl')
    dev_data = read_data(source_folder + 'dev.data.jsonl')
    test_data = read_data(source_folder + 'test.data.jsonl')
    covid_data = read_data(source_folder + 'covid.data.jsonl')

    # read labels (json files)
    train_label = read_label(source_folder + 'train.label.json')
    dev_label = read_label(source_folder + 'dev.label.json')

    # merge data with class labels
    train_data = merge_data_label(train_data, train_label)
    dev_data = merge_data_label(dev_data, dev_label)

    # write filetered data to csv
    open(source_folder + 'train.csv','w', newline='').write(train_data.to_csv(index=False))
    open(source_folder + 'dev.csv','w', newline='').write(dev_data.to_csv(index=False))
    open(source_folder + 'test.csv','w', newline='').write(test_data.to_csv(index=False))
    open(source_folder + 'covid.csv','w', newline='').write(covid_data.to_csv(index=False))


def check_input_files(filename):
    """
    Check input files if they exist.
    """
    try:
        f = open(filename,'r')
        f.close()
    except:
        save_data_to_csv()


def read_csv_datasets():
    # check if input files exist
    check_input_files(source_folder + 'train.csv')
    check_input_files(source_folder + 'dev.csv')
    check_input_files(source_folder + 'test.csv')
    check_input_files(source_folder + 'covid.csv')

    # read datasets
    train_df = pd.read_csv(source_folder + 'train.csv')
    dev_df = pd.read_csv(source_folder + 'dev.csv')
    test_df = pd.read_csv(source_folder + 'test.csv')

    return train_df, dev_df, test_df

In [8]:
class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='bert-base-uncased'):

        self.data = data  # pandas dataframe
        #Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  

        self.maxlen = maxlen
        self.with_labels = with_labels 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent1 = str(self.data.loc[index, 'text'])
        sent2 = str(self.data.loc[index, 'comments'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,  
                                      return_tensors='pt')  # Return torch.Tensor objects
        
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label  
        else:
            return token_ids, attn_masks, token_type_ids

In [9]:
class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="bert-base-uncased", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        #  Fix the hidden-state size of the encoder outputs (If you want to add other pre-trained models here, search for the encoder output size)
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 1)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        output = self.bert_layer(input_ids, attn_masks, token_type_ids)
        
        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(output['pooler_output']))

        return logits

In [10]:
def set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


def evaluate_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1

    return mean_loss / count

In [11]:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()
            train_losses.append(loss)
        
            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        val_losses.append(val_loss)
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='/content/drive/MyDrive/Colab Notebooks/NLP_Project/models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

In [12]:
def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))

    return probs.detach().cpu().numpy()


def test_prediction(net, device, dataloader, with_labels=True, result_file="/content/drive/MyDrive/Colab Notebooks/NLP_Project/results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob)+'\n' for prob in probs_all)
    w.close()

In [13]:
model = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/models'
try:
    os.makedirs(model)
    print("Directory:", model, "created.")
except:
    print("Directory:", model, "already exists.")

Directory: /content/drive/MyDrive/Colab Notebooks/NLP_Project/models already exists.


In [14]:
# retrieve train, dev, and test datasets
train_df, dev_df, test_df = read_csv_datasets()

In [15]:
# training parameters
bert_model = "bert-base-uncased"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', ...
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
maxlen = 64  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
bs = 16  # batch size
iters_to_accumulate = 2  # the gradient accumulation adds gradients over an effective batch of size : bs * iters_to_accumulate. If set to "1", you get the usual batch size
lr = 2e-5  # learning rate
epochs = 4  # number of training epochs

In [16]:
#  Set all seeds to make reproducible results
set_seed(1)

In [17]:
# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(train_df, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(dev_df, maxlen, bert_model)

# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

Reading training data...
Reading validation data...


  cpuset_checked))


In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

SentencePairClassifier(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [19]:
# model parameters
criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

  cpuset_checked))
 20%|█▉        | 58/291 [00:05<00:20, 11.58it/s]


Iteration 58/291 of epoch 1 complete. Loss : 0.31356283842489635 


 41%|████      | 118/291 [00:10<00:15, 10.82it/s]


Iteration 116/291 of epoch 1 complete. Loss : 0.24180220208805184 


 60%|██████    | 176/291 [00:15<00:09, 11.52it/s]


Iteration 174/291 of epoch 1 complete. Loss : 0.209068580819615 


 80%|████████  | 234/291 [00:20<00:05, 11.40it/s]


Iteration 232/291 of epoch 1 complete. Loss : 0.19216701844385986 


100%|██████████| 291/291 [00:26<00:00, 11.19it/s]
  0%|          | 0/37 [00:00<?, ?it/s]


Iteration 290/291 of epoch 1 complete. Loss : 0.1867364129886545 


100%|██████████| 37/37 [00:01<00:00, 24.16it/s]
  0%|          | 0/291 [00:00<?, ?it/s]


Epoch 1 complete! Validation Loss : 0.32860503969965754
Best validation loss improved from inf to 0.32860503969965754



 21%|██        | 60/291 [00:05<00:20, 11.41it/s]


Iteration 58/291 of epoch 2 complete. Loss : 0.181085374848596 


 41%|████      | 118/291 [00:10<00:15, 11.39it/s]


Iteration 116/291 of epoch 2 complete. Loss : 0.14206970499121938 


 60%|██████    | 176/291 [00:15<00:10, 11.29it/s]


Iteration 174/291 of epoch 2 complete. Loss : 0.13594611580001897 


 80%|████████  | 234/291 [00:20<00:05, 11.19it/s]


Iteration 232/291 of epoch 2 complete. Loss : 0.1376969989324952 


100%|██████████| 291/291 [00:26<00:00, 11.15it/s]
  0%|          | 0/37 [00:00<?, ?it/s]


Iteration 290/291 of epoch 2 complete. Loss : 0.1201564109852088 


100%|██████████| 37/37 [00:01<00:00, 23.72it/s]
  0%|          | 0/291 [00:00<?, ?it/s]


Epoch 2 complete! Validation Loss : 0.2985529512972445
Best validation loss improved from 0.32860503969965754 to 0.2985529512972445



 21%|██        | 60/291 [00:05<00:20, 11.26it/s]


Iteration 58/291 of epoch 3 complete. Loss : 0.12133852931959875 


 41%|████      | 118/291 [00:10<00:15, 11.15it/s]


Iteration 116/291 of epoch 3 complete. Loss : 0.09134575413090401 


 60%|██████    | 176/291 [00:16<00:10, 11.20it/s]


Iteration 174/291 of epoch 3 complete. Loss : 0.07610811721854682 


 80%|████████  | 234/291 [00:21<00:05, 11.22it/s]


Iteration 232/291 of epoch 3 complete. Loss : 0.09529936355378094 


100%|██████████| 291/291 [00:26<00:00, 11.05it/s]
  0%|          | 0/37 [00:00<?, ?it/s]


Iteration 290/291 of epoch 3 complete. Loss : 0.08517805183434794 


100%|██████████| 37/37 [00:01<00:00, 23.98it/s]
  0%|          | 0/291 [00:00<?, ?it/s]


Epoch 3 complete! Validation Loss : 0.311886233252448


 21%|██        | 60/291 [00:05<00:20, 11.16it/s]


Iteration 58/291 of epoch 4 complete. Loss : 0.07865392224028192 


 40%|███▉      | 116/291 [00:10<00:16, 10.89it/s]


Iteration 116/291 of epoch 4 complete. Loss : 0.059146523202673114 


 60%|██████    | 176/291 [00:16<00:10, 11.08it/s]


Iteration 174/291 of epoch 4 complete. Loss : 0.04629435878375481 


 80%|████████  | 234/291 [00:21<00:05, 11.14it/s]


Iteration 232/291 of epoch 4 complete. Loss : 0.05762256585575383 


100%|██████████| 291/291 [00:26<00:00, 10.94it/s]
  0%|          | 0/37 [00:00<?, ?it/s]


Iteration 290/291 of epoch 4 complete. Loss : 0.05528651693322022 


100%|██████████| 37/37 [00:01<00:00, 24.38it/s]



Epoch 4 complete! Validation Loss : 0.3257690700324806
The model has been saved in /content/drive/MyDrive/Colab Notebooks/NLP_Project/models/bert-base-uncased_lr_2e-05_val_loss_0.29855_ep_2.pt


In [20]:
results = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/results'
try:
    os.makedirs(results)
    print("Directory:", results, "created.")
except:
    print("Directory:", results, "already exists.")

Directory: /content/drive/MyDrive/Colab Notebooks/NLP_Project/results already exists.


In [21]:
path_to_model = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/models/bert-base-uncased_lr_2e-05_val_loss_0.29855_ep_2.pt'  

path_to_output_file = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/results/output.txt'

print("Reading test data...")
test_set = CustomDataset(test_df, maxlen, False, bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)

print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=False,
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

Reading test data...


  cpuset_checked))



Loading the weights of the model...


  0%|          | 0/37 [00:00<?, ?it/s]

Predicting on test data...


100%|██████████| 37/37 [00:01<00:00, 22.90it/s]


Predictions are available in : /content/drive/MyDrive/Colab Notebooks/NLP_Project/results/output.txt





In [22]:
def extract_class_labels():
    # read class labels from json file
    label = pd.DataFrame()
    with open(source_folder + 'labels.json', 'r', encoding="utf8") as f:
        label = json.load(f)
    f.close()
    return label

labels = extract_class_labels()

In [23]:
probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities
threshold = 0.5   # you can adjust this threshold for your own dataset
preds_test=(probs_test>=threshold).astype('uint8') # predicted labels using the above fixed threshold

preds_test

pred_label = {}
for i in range(len(preds_test)):
    code = str(preds_test[i])
    text_id = str(test_df.iloc[i]['id'])
    pred_label[text_id] = labels[code]

# write predicted labels to json file
path_to_results = source_folder + 'test-output.json'
with open(source_folder + 'test-output.json', 'w') as f:
    json.dump(pred_label, f, separators=(',', ':'))
f.close()

print("Predictions are available in : {}".format(path_to_results))

Predictions are available in : /content/drive/MyDrive/Colab Notebooks/NLP_Project/project-data/test-output.json
