In [None]:
import pandas as pd
import torch
import sklearn
from sklearn.metrics import cohen_kappa_score as kappa
from sklearn.metrics import accuracy_score as accuracy

In [3]:
### WRITE CODE TO LOAD ANNOTATIONS AND 
df = pd.read_table('DataFiles/augmented_data.tsv')

In [13]:
import torch
torch.cuda.empty_cache()

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


## Installing Hugging Face's Transformers library
We will use Hugging Face's Transformers (https://github.com/huggingface/transformers), an open-source library that provides general-purpose architectures for natural language understanding and generation with a collection of various pretrained models made by the NLP community.

In [14]:
!pip install transformers
!pip install -U -q PyDrive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


The cell below imports some helper functions we wrote to demonstrate the task on the sample dataset.

In [15]:
import pandas as pd
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, random_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import sys
import numpy as np
import time
import datetime

def tokenize_and_format(sentences):
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)

  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  attention_masks = []

  # For every sentence...
  for sentence in sentences:
      # `encode_plus` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length`
      #   (6) Create attention masks for [PAD] tokens.
      encoded_dict = tokenizer.encode_plus(
                          sentence,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 64,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          truncation = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )

      # Add the encoded sentence to the list.
      input_ids.append(encoded_dict['input_ids'])

      # And its attention mask (simply differentiates padding from non-padding).
      attention_masks.append(encoded_dict['attention_mask'])
  return input_ids, attention_masks

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


In [None]:
df = df.sample(frac=1).reset_index(drop=True)

texts = df.sentence.values
labels = df.label_ID.values
labels2 = df.label_ID2.values

input_ids, attention_masks = tokenize_and_format(texts)


label_list = []
for l,m in zip(labels, labels2):
  label_array = np.zeros(len(set(labels)))
  label_array[int(l)-1] += 0.5
  label_array[int(m)-1] += 0.5
  label_list.append(label_array)



# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(np.array(label_list))

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

In [18]:
total = len(df)
num_train = int(len(df)*0.8)
num_val = int(len(df)*0.1)
num_test = total - num_val

# make lists of 3-tuples (already shuffled the dataframe in cell above)

train_set = [(input_ids[i], attention_masks[i], labels[i]) for i in range(num_train)]
val_set = [(input_ids[i], attention_masks[i], labels[i]) for i in range(num_train, num_val)]
test_set = [(input_ids[i], attention_masks[i], labels[i]) for i in range(num_val, total)]


train_text = [texts[i] for i in range(num_train)]
val_text = [texts_test[i] for i in range(num_train, num_val)]
test_text = [texts_test[i] for i in range(num_val, total)]


In [19]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased", # Use the 12-layer English BERT model, with an uncased vocab.
    num_labels = 15, # The number of output labels.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model.config.hidden_dropout_prob = 0.2

# Tell pytorch to run this model on the GPU.
model.cuda()
print()

Downloading pytorch_model.bin:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 




In [20]:

batch_size = 32
optimizer = AdamW(model.parameters(), lr = 5e-5) #with default values of learning rate and epsilon value
epochs = 7



In [21]:
def validation(val_set):
    # Put the model in evaluation mode
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0

    num_batches = int(len(val_set)/batch_size) + 1

    total_correct = 0
    incorrect_values = []
    for i in range(num_batches):

      end_index = min(batch_size * (i+1), len(val_set))

      batch = val_set[i*batch_size:end_index]
      
      if len(batch) == 0: continue

      input_id_tensors = torch.stack([data[0] for data in batch])
      input_mask_tensors = torch.stack([data[1] for data in batch])
      label_tensors = torch.stack([data[2] for data in batch])
      
      # Move tensors to the GPU
      b_input_ids = input_id_tensors.to(device)
      b_input_mask = input_mask_tensors.to(device)
      b_labels = label_tensors.to(device)
        
      # Tell pytorch not to bother with constructing the compute graph during
      # the forward pass, since this is only needed for backprop (training).
      with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask,
                                labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits   
        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        
        # Move logits and labels to CPU
        logits = (logits).detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()


        # Calculate the number of correctly labeled examples in batch
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = np.argmax(label_ids, axis=1).flatten()

        num_correct = np.sum(pred_flat == labels_flat)
        labels_incorrect = (pred_flat != labels_flat)
        incorrect_values.append([pred_flat+1, labels_flat+1])
        total_correct += num_correct
        
    # Report the final accuracy for this validation run.
    print("Num of correct predictions =", total_correct)
    avg_val_accuracy = total_correct / len(val_set)

    return avg_val_accuracy, incorrect_values[0]



In [22]:
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_train_loss = 0

    model.train()

    num_batches = int(len(train_set)/batch_size) + 1

    for i in range(num_batches):
      end_index = min(batch_size * (i+1), len(train_set))

      batch = train_set[i*batch_size:end_index]

      if len(batch) == 0: continue

      input_id_tensors = torch.stack([data[0] for data in batch])
      input_mask_tensors = torch.stack([data[1] for data in batch])
      label_tensors = torch.stack([data[2] for data in batch])

      b_input_ids = input_id_tensors.to(device)
      b_input_mask = input_mask_tensors.to(device)
      b_labels = label_tensors.to(device) 

      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss = outputs.loss
      logits = outputs.logits

      total_train_loss += loss.item()

      model.zero_grad()     

      loss.backward()

      optimizer.step()
        
    print(f"Total loss: {total_train_loss}")
    val_acc = validation(val_set)
    print(f"Validation accuracy: {val_acc}")
    
print("")
print("Training complete!")



Training...
Total loss: 15.141500369205195
Num of correct predictions = 7
Validation accuracy: (0.1346153846153846, [array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), array([ 6, 13,  7,  1,  4,  2, 11, 13,  9,  8,  1,  2,  5, 12,  1, 13,  1,
       13, 15,  5,  2, 12, 11,  4,  7, 13,  9,  1,  4,  9, 12,  7])])

Training...
Total loss: 10.477692454923975
Num of correct predictions = 7
Validation accuracy: (0.1346153846153846, [array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), array([ 6, 13,  7,  1,  4,  2, 11, 13,  9,  8,  1,  2,  5, 12,  1, 13,  1,
       13, 15,  5,  2, 12, 11,  4,  7, 13,  9,  1,  4,  9, 12,  7])])

Training...
Total loss: 9.930332305583741
Num of correct predictions = 21
Validation accuracy: (0.40384615384615385, [array([12, 13,  7,  1, 11,  1,  3, 13,  2,  5,  1,  1,  5, 11, 13, 13,  1,
       13,  3,  5,  7, 12, 13, 11,  7,  5,  7,  1,  3,  

In [24]:
Out = get_validation_performance(test_set)

Num of correct predictions = 22


In [25]:
Labels = np.vstack(Out[1])
Labels

array([[15,  2,  9, 15,  6,  7, 15,  4,  3, 12, 13, 11,  9,  2,  7,  5,
         7,  1,  6,  6, 13,  5,  3, 14,  7, 15,  7, 12,  9,  9, 13, 13],
       [ 5,  1, 15, 15,  7,  9, 11,  1, 11,  4,  8,  4,  9, 12,  7,  5,
         7,  1,  1,  4, 13,  5, 10,  5,  7,  1,  6,  4,  9, 10,  5,  4]])

The following is for creating the labels for the hidden dataset

In [100]:
## YOUR ERROR ANALYSIS CODE HERE
listofLabels = [0, 3, 5, 6, 10 ]
for i in listofLabels:
  if Labels[0][i] != Labels[1][i]:
    print(test_text[i])
    print('Predicted Label :', res[Labels[0][i]])
    print('Acutal Label :' ,res[Labels[1][i]] )
    print('*************************************')

Excluded from the action will be some students and others with residency status, along with people applying for immigration status, such as those carrying "green cards."
Predicted Label : Crime and Punishment
Acutal Label : Legality, Constitutionality, Jurisdiction
*************************************
The reasons for migration are important. Migrating for work purposes are different to migration to embark in higher education.
Predicted Label : Capacity and Resources
Acutal Label : Other
*************************************
Pakistani man arrested by FBI dies in jail cell
Predicted Label : Crime and Punishment
Acutal Label : Health and Safety
*************************************
IMMIGRANTS POUR INTO GEORGIA, HEARTLAND STATES, STUDY FINDS
Predicted Label : Quality of Life
Acutal Label : Cultural Identity
*************************************
 but Ms. Trump said that the temporary plan was necessary for national security reasons.
Predicted Label : Political
Acutal Label : Security and D