In [1]:
import os
os.getpid()

23682

In [2]:
import json
f = open('ESConv.json')
data = json.load(f)
f.close()
processed_data = []
for i in data:
    pt = i['problem_type']
    for j in i['dialog']:
        processed_data.append([pt, j['content'], j['speaker'], j['annotation']])

In [3]:
processed_data[4005]

['ongoing depression',
 'Sometimes we get in a funk, when we do, we can pull ourselves out, I do suggest you write a list of things to do and put on the frige.',
 'supporter',
 {'strategy': 'Others'}]

In [4]:
sl = []
for i in processed_data:
    sl.append(len(i[1].split()))
## Safely, we remove length 1,2,3
to_remove = []
for i in range(len(sl)):
    if sl[i] <= 3:
        to_remove.append(i)
for index in sorted(to_remove, reverse=True):
    del processed_data[index]

In [5]:
from collections import Counter
cpd = []
for i in processed_data:
    cpd.append(i[0])
Counter(cpd)

Counter({'job crisis': 7475,
         'problems with friends': 4702,
         'ongoing depression': 9462,
         'breakup with partner': 6697,
         'academic pressure': 4009,
         'conflict with parents': 255,
         'Procrastination': 352,
         'Alcohol Abuse': 360,
         'Issues with Parents': 239,
         'Sleep Problems': 765,
         'Appearance Anxiety': 346,
         'School Bullying': 46,
         'Issues with Children': 371})

In [6]:
# MERGE Issue with parents
for i in range(len(processed_data)):
    if processed_data[i][0] == 'conflict with parents':
        processed_data[i][0] = 'Issues with Parents'

In [7]:
# MERGE Into Family Issue
for i in range(len(processed_data)):
    if processed_data[i][0] == 'Issues with Parents':
        processed_data[i][0] = 'Family Issues'
    if processed_data[i][0] == 'Issues with Children':
        processed_data[i][0] = 'Family Issues'

In [8]:
cpd = []
for i in processed_data:
    cpd.append(i[0])
Counter(cpd)

Counter({'job crisis': 7475,
         'problems with friends': 4702,
         'ongoing depression': 9462,
         'breakup with partner': 6697,
         'academic pressure': 4009,
         'Family Issues': 865,
         'Procrastination': 352,
         'Alcohol Abuse': 360,
         'Sleep Problems': 765,
         'Appearance Anxiety': 346,
         'School Bullying': 46})

In [9]:
# For now, we care the all 7 categories.
kept = ['problems with friends', 'ongoing depression','breakup with partner', 'academic pressure', 'job crisis', 'Family Issues','Sleep Problems']

filtered_processed_data = []
for i in processed_data:
    if i[0] in kept:
        filtered_processed_data.append(list(i))
        filtered_processed_data[-1][0] = kept.index(filtered_processed_data[-1][0])

# Split the training and test

In [10]:
import random

random.seed(10)
random.shuffle(filtered_processed_data)
lenth = len(filtered_processed_data)
train_list = filtered_processed_data[0:int(lenth*0.8)]
test_list = filtered_processed_data[int(lenth*0.8):]

In [11]:
# Count labels in training
lt = []
for i in train_list:
    lt.append(i[0])
Counter(lt)

Counter({2: 5364, 1: 7583, 6: 628, 4: 5986, 3: 3144, 0: 3767, 5: 708})

In [12]:
# Count labels in testing
lts = []
for i in test_list:
    lts.append(i[0])
Counter(lts)

Counter({2: 1333, 1: 1879, 3: 865, 4: 1489, 0: 935, 5: 157, 6: 137})

# This would be a classification into 7 categories.

In [13]:
import torch
from torch.utils.data.sampler import BatchSampler, Sampler, RandomSampler, SequentialSampler
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertModel, AdamW, BertConfig
from tqdm import tqdm

class Corpora():
    """
        A class that holds the texts and class labels.
    """
    
    def __init__(self, l):
        """
            file_path: on MAGIC, it is either
                        data-badassnlp/cola_public/raw/in_domain_train.tsv or
                        data-badassnlp/cola_public/raw/in_domain_dev.tsv
                    on Google Colab, change them to the corresponding paths.
        """

        self.sentences = [s[1] for s in tqdm(l)]
        self.labels = [int(s[0]) for s in tqdm(l)]
        
        
class TextClassificationDataSet(Dataset):
    """
        Define a dataset consisting of pairs of (sequence_of_word_indices, class_label).
        class_label is either 0 or 1.
    """
    def __init__(self, corpora, tokenizer: BertTokenizer):
        """
            This function will tokenize the sentences in self.corpora,
            and pad the word indices properly.
            The tokenization and padding should be done using BERT's API.

                See https://huggingface.co/docs/transformers/internal/tokenization_utils
                    for the API of tokenizer
                See https://huggingface.co/docs/transformers/preprocessing
                    for examples of using the API
                Also see the BERT paper
                    "BERT: Pre-training of Deep Bidirectional Transformers forLanguage Understanding"
                regarding how paddings are done.

            We turn the entire corpora into a big tensor in a batch, rather than
                doing that in the __getitem__ function, as batch processing can speed up tokenization
        
            Then self.input_ids, self.attention_mask and self.labels
            will be created as tensors of shape
                torch.Size([num_sentences, max_padded_sentence_length])
                torch.Size([num_sentences, max_padded_sentence_length])
                torch.Size([num_sentences])

            corpora: an object of the Corpora class representing some raw classified texts.
            tokenizer: must be the BERTTokenizer loaded from the BERT/download folder
                        in order to have the same ids for the words in any vocabulary.
                        See the loading codes below.
        """
        self.corpora = corpora
        self.tokenizer = tokenizer
        d =  tokenizer.batch_encode_plus(corpora.sentences)
        padded_encoded_inputs = tokenizer.pad(d)
        self.input_ids = torch.tensor(padded_encoded_inputs.input_ids)
        self.attention_mask = torch.tensor(padded_encoded_inputs.attention_mask)
        self.labels = torch.tensor(corpora.labels)
        
    def __len__(self):
        return len(self.corpora.sentences)

    def __getitem__(self, idx):
        """
            Return the idx-th of the rows the self.input_ids, self.attention_mask, self.labels in this order.
            Don't do BERT tokenization here as that will be slow.
        """
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

In [14]:
from torch import nn

class BERTClassifier(nn.Module):
    
    def __init__(self, BERT_model, hidden_layer_size, num_classes):
        
        super(BERTClassifier, self).__init__()

        # loaded pretrained model
        self.bert = BERT_model
        
        # simple neural network that convert embedding of the first token to a class
        self.classifier = nn.Sequential(
            nn.Linear(BERT_hidden_size, hidden_layer_size),
            nn.ReLU(),
            nn.Linear(hidden_layer_size, num_classes)
        )
        
    def forward(self, input_ids, attention_mask):
        """
        The following two arguments are tensors from a mini-batch of the input_ids
            and attention_mask returned by the BERT tokenizer.
            
            input_ids: a tensor of shape [batch_size, max_length]
            attention_mask: a tensor of shape [batch_size, max_length]
            
            return: the logits of the sentences in the batch tensor of shape [batch_size, 1, 2]
        """
        # see https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
        # and https://huggingface.co/docs/transformers/main_classes/output
        # for the return value of the forward function of BERT

        ### Your codes go here (30 points) ###
        z = self.bert.forward(input_ids, attention_mask)
        return torch.softmax(self.classifier(z[1]).unsqueeze(1), dim=-1) 
        
        return logits

In [23]:
# make sure you check the availability of GPU when you set the device ID.
device = torch.device(0)

BATCH_SIZE = 8
num_classes = 7
classifier_hidden_size = 128

## if using MAGIC
BERT_PATH = '/data/badassnlp/bert-base-uncased/'
## if using Google Colab, you need to load the bert model after it downloads the model files.
## 

BERT_hidden_size = 768

N_EPOCHS = 1
CLIP = 1

In [24]:
## 'uncased' means all words are lowered-cased before tokenization
## 'base' means the smaller version of BERT (12 layers, 16 heads)
## un-comment one of the following two options.

# if using MAGIC, load from local BERT folder
tokenizer = BertTokenizer.from_pretrained(BERT_PATH, local_files_only=True)
BERT_model = BertModel.from_pretrained(BERT_PATH, num_labels = 2, output_attentions = False, output_hidden_states = False
).to(device)

## if using Colab, load from automatically downloaded files. Downloading can take half a minute
# tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# BERT_model = BertModel.from_pretrained("bert-base-cased", num_labels = 2, output_attentions = False, output_hidden_states = False).to(device)


## if using MAGIC
print("creating training corpora ... ")
training_corpora = Corpora(train_list)
print("creating training dataset ... ")
training_dataset = TextClassificationDataSet(training_corpora, tokenizer)
print("creating training iterator ... ")
training_iterator = DataLoader(training_dataset, sampler = RandomSampler(training_dataset), batch_size=BATCH_SIZE)

## if using MAGIC
print("creating test corpora ... ")
dev_corpora = Corpora(test_list)
print("creating test dataset ... ")
dev_dataset = TextClassificationDataSet(dev_corpora, tokenizer)
print("creating test iterator ... ")
dev_iterator = DataLoader(dev_dataset, sampler = SequentialSampler(dev_dataset), batch_size=BATCH_SIZE)

classifier = BERTClassifier(BERT_model, classifier_hidden_size, num_classes).to(device)

Some weights of the model checkpoint at /data/badassnlp/bert-base-uncased/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


creating training corpora ... 


100%|██████████| 27180/27180 [00:00<00:00, 1865660.47it/s]
100%|██████████| 27180/27180 [00:00<00:00, 2482333.86it/s]

creating training dataset ... 





creating training iterator ... 
creating test corpora ... 


100%|██████████| 6795/6795 [00:00<00:00, 1801649.64it/s]
100%|██████████| 6795/6795 [00:00<00:00, 2253344.06it/s]

creating test dataset ... 





creating test iterator ... 


In [25]:
list(training_iterator)[0][0].shape

torch.Size([8, 206])

In [26]:
list(dev_iterator)[0][0].shape

torch.Size([8, 163])

In [27]:
from torch import optim
optimizer = optim.Adam(classifier.parameters())
criterion = nn.CrossEntropyLoss(reduction = 'sum')

# Global variable that count the number of training epoch
num_epochs_train = 0

from tqdm import tqdm

In [28]:
def train(model, iterator, optimizer, criterion, clip):
    """
        model: an BERTClassifier object
        iterator: a DataLoader object
        optimizer: torch optimizer
        criterion: the crossentropyloss
        clip: to be used with torch.nn.utils.clip_grad_norm_

        return: average loss over the training instances (sentences) in the DataLoader.
    """
    model.train()
    
    epoch_loss = 0
    num_batchs = 0
    total_instances = 0
    global num_epochs_train
    num_epochs_train += 1
    for i, batch in tqdm(enumerate(iterator)):
        num_batchs += 1
        optimizer.zero_grad() 
        tmp = optimizer.state_dict()
        tmp["param_groups"][0]["lr"] = 0.00002/(num_epochs_train)
        optimizer.load_state_dict(tmp)

        ### Your codes go here (10 points) ###

        # Step 1 (5 points): get the tensors from this mini-batch and increase the total_instances
        # make sure tensors are moved to GPU.
        ids = batch[0].to(device)
        msk = batch[1].to(device)
        y = (batch[2]).to(device)
        
        # Step 2 (5 points): call the forward function of the model and find the output logits
        # then calculate the loss. Then cumulate the epoch_loss
        logits = model.forward(ids,msk)
        loss = criterion(logits.squeeze(1), y) # if squeeze here, why unsqueeze previous
        loss.backward()

        # Clips gradient norm of an iterable of parameters.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        total_instances += BATCH_SIZE

    return epoch_loss / total_instances

confusion_matrix = []
num_epochs = 0

def evaluate(model, iterator, criterion):
    """
        Find the loss (criterion) of the model over instances in this DataLoader (iterator)
        Same logic as the above train function, except no gradient calculation
        and no update of the parameter. 
        
        return: average loss over the training instances (sentences) in the DataLoader.
    """
    model.eval()

    epoch_loss = 0
    total_instances = 0
    
    confusion_matrix.append(torch.zeros(num_classes,num_classes))
    global num_epochs
    
    for i, batch in tqdm(enumerate(iterator)):
        
        ### Your codes go here (5 points) ###
        ids = batch[0].to(device)
        msk = batch[1].to(device)
        y = (batch[2]).to(device)
        logits = model.forward(ids,msk)
        loss = criterion(logits.squeeze(1), y) # if squeeze here, why unsqueeze previous
        
        epoch_loss += loss.item()
        total_instances += BATCH_SIZE
        

        for i in range(len(y)):
            row = y[i]
            col = torch.argmax((logits.squeeze(1))[i])
            confusion_matrix[num_epochs][row][col] += 1
    num_epochs += 1

    return epoch_loss / total_instances

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
import time

best_test_loss = float('inf')
training_losses = []
test_losses = []

In [30]:
for epoch in range(N_EPOCHS):  
    print("epoch start: ", epoch)  
    start_time = time.time()
    training_loss = train(classifier, training_iterator, optimizer, criterion, CLIP)
    training_losses.append(training_loss)
    test_loss = evaluate(classifier, dev_iterator, criterion)
    test_losses.append(test_loss)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if test_loss < best_test_loss:
        best_test_loss = test_loss 
        torch.save(classifier.state_dict(), 'best_model_7classes.pt')
        

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s', end='')
    print(f'\tTrain Loss: {training_loss:.3f} | Test Loss: {test_loss:.3f}')

epoch start:  0


3398it [08:25,  6.72it/s]
850it [00:26, 32.55it/s]


Epoch: 01 | Time: 8m 52s	Train Loss: 1.753 | Test Loss: 1.703


In [34]:
list(confusion_matrix[0])

[tensor([292., 451., 112.,  19.,  61.,   0.,   0.]),
 tensor([ 112., 1348.,  131.,   50.,  238.,    0.,    0.]),
 tensor([139., 669., 453.,  15.,  57.,   0.,   0.]),
 tensor([ 29., 378.,  10., 317., 131.,   0.,   0.]),
 tensor([ 43., 715.,  46.,  35., 650.,   0.,   0.]),
 tensor([35., 86., 17.,  3., 16.,  0.,  0.]),
 tensor([  8., 114.,   3.,   1.,  11.,   0.,   0.])]