In [16]:
#@title Use GPU for training

#use colab GPU for training
import tensorflow as tf
import torch
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [17]:
#@title Installations

!pip install transformers
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer,BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset,DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import f1_score

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub("@user", "", tweet)
    tweet = re.sub(r"@[\w\-]+", "", tweet)
    tweet = re.sub(r"[^A-Za-z]", " ", tweet)
    # remove url
    tweet = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|''[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", tweet)
    tokens = tweet.split(" ")
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [token for token in tokens if not token in stop_words]
    return " ".join(tokens)

In [19]:
#@title Load Data

train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')

# Creating the training corpus
train_df['tweet'] = train_df['tweet'].apply(lambda x: preprocess(x))
# Creating the development corpus
dev_df['tweet'] = dev_df['tweet'].apply(lambda x: preprocess(x))
# Creating the testing corpus
test_df['tweet'] = test_df['tweet'].apply(lambda x: preprocess(x))

train_sentences = train_df.tweet.values
train_labels = train_df['class'].values
dev_sentences = dev_df.tweet.values
dev_labels = dev_df['class'].values
test_sentences = test_df.tweet.values
test_labels = test_df['class'].values

In [None]:
#@title Helper Functions

# # Print the original sentence.
# print(' Original: ', train_sentences[0])
# # Print the sentence split into tokens.
# print('Tokenized: ', tokenizer.tokenize(train_sentences[0]))
# # Print the sentence mapped to token ids.
# print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_sentences[0])))

# max_len = 0
# # For every sentence...
# for sent in train_sentences:
#     # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
#     input_ids = tokenizer.encode(sent, add_special_tokens=True)
#     # Update the maximum sentence length.
#     max_len = max(max_len, len(input_ids))
# print('Max sentence length: ', max_len)

In [23]:
def tokenizer(data,labels):
	# Load the BERT tokenizer.
	tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
	# Tokenize all of the sentences and map the tokens to thier word IDs.
	input_ids = []
	attention_masks = []
	# For every sentence: Add '[CLS]' and '[SEP]', Pad & truncate all sentences, Construct attn. masks. and Return pytorch tensors.)
	for sent in data:
		encoded_dict = tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 84, pad_to_max_length = True, return_attention_mask = True, return_tensors = 'pt')  
		input_ids.append(encoded_dict['input_ids']) # Add the encoded sentence to the list.    
		attention_masks.append(encoded_dict['attention_mask']) # And its attention mask (simply differentiates padding from non-padding).
	# Convert the lists into tensors.
	input_ids = torch.cat(input_ids, dim=0)
	attention_masks = torch.cat(attention_masks, dim=0)
	labels = torch.tensor(labels)
	return input_ids, attention_masks, labels

def data_loader(data, labels, flag):
	# flag == 1 means the input dataset is training dataset
	batch_size = 32
	input_ids, attention_masks, labels = tokenizer(data, labels)
	dataset = TensorDataset(input_ids, attention_masks, labels)
	if flag == 1:# Select batches randomly 
		dataloader = DataLoader(dataset,  sampler = RandomSampler(dataset), batch_size = batch_size)
	else: # Pull out batches sequentially.
		dataloader = DataLoader(dataset, sampler = SequentialSampler(dataset), batch_size = batch_size)
	return dataloader

In [24]:
train_dataloader = data_loader(train_sentences, train_labels,1)
validation_dataloader = data_loader(dev_sentences, dev_labels,0)
test_dataloader = data_loader(test_sentences,test_labels,0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

# 4. Train Our Classification Model

In [25]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
# Use the 12-layer BERT model, with an uncased vocab.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [26]:
optimizer = AdamW(model.parameters(),lr = 2e-5, eps = 1e-8)

In [27]:
#tried 4, but may lead to overfitting
epochs = 1

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [28]:
# Function to calculate the accuracy of our predictions vs labels
def flat_f1(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat,average='macro')

We're ready to kick off the training!

In [29]:
import random
import numpy as np
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# For each epoch...
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Reset the total loss for this epoch.
    total_train_loss = 0
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()        
        result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels, return_dict=True)
        loss = result.loss
        logits = result.logits
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_f1 = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device) #   [0]: input ids 
        b_input_mask = batch[1].to(device) #   [1]: attention masks
        b_labels = batch[2].to(device) #   [2]: labels 
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            result = model(b_input_ids,token_type_ids=None,  attention_mask=b_input_mask,labels=b_labels, return_dict=True)

        # Get the loss and "logits" output by the model. The "logits" are the 
        # output values prior to applying an activation function like the 
        # softmax.
        loss = result.loss
        logits = result.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_f1 += flat_f1(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print("  f1: {0:.2f}".format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
   # print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append( {'epoch': epoch_i + 1,'Training Loss': avg_train_loss, 'Valid. Loss': avg_val_loss, 'Valid. F1.': avg_val_f1})
print("")
print("Training complete!")


Training...

  Average training loss: 0.50
  f1: 0.74
  Validation Loss: 0.45

Training complete!


In [30]:
#Evaluation on Test
# Put model in evaluation mode
model.eval()
# Tracking variables 
predictions , true_labels = [], []
total_eval_f1 = 0
# Predict 
for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,return_dict=True)
  logits = result.logits
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)
print('    DONE.')

    DONE.


In [31]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)
# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)
# Calculate the F1-score
f1_test = f1_score(flat_true_labels, flat_predictions,average ='macro')
print('f1 score for test: %.3f' % f1_test)

f1 score for test: 0.772
