In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoModelForMaskedLM
import numpy as np
from torch.utils.data import DataLoader, Dataset
from torch import nn, cuda, optim
import pandas as pd
import argparse
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
import mlflow

In [12]:
from sklearn.model_selection import train_test_split

In [5]:
# Read the raw train and test CSV files from the sibling data directory.
training_input_df = pd.read_csv("../data/train.csv")
testing_input_df = pd.read_csv("../data/test.csv")

In [9]:
# Drop the 'id', 'keyword' and 'location' columns.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
training_input_df = training_input_df.drop(
    columns=['id', 'keyword', 'location'],
    errors="ignore",
)
# NOTE: normalise_text is defined in a cell that appears further down in this
# notebook; on a fresh kernel that cell has to be executed before this one.
training_input_df["text"] = normalise_text(training_input_df["text"])

  text = text.str.replace(r"\#","") # replaces hashtags
  text = text.str.replace(r"http\S+","URL")  # remove URL addresses
  text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ")
  text = text.str.replace("\s{2,}", " ")


In [8]:
# to clean data
def normalise_text(text):
    """Clean a pandas Series of raw strings.

    The steps are applied in this order:
      1. lowercase everything;
      2. remove '#' characters;
      3. replace every ``http`` + non-whitespace run with the literal token ``URL``;
      4. remove '@' characters;
      5. replace every character outside ``A-Za-z0-9()!?'`"`` with a space;
      6. collapse runs of two or more whitespace characters into one space.

    ``regex=`` is passed explicitly on every call: without it the result depends
    on the installed pandas version (older releases emit a FutureWarning, newer
    ones treat the patterns as literal text and silently skip the cleaning).

    Args:
        text: pandas Series of strings (accessed through ``.str``).

    Returns:
        A new Series holding the cleaned strings.
    """
    text = text.str.lower()
    text = text.str.replace("#", "", regex=False)            # strip hashtag markers
    text = text.str.replace(r"http\S+", "URL", regex=True)   # mask URL addresses
    text = text.str.replace("@", "", regex=False)            # strip mention markers
    text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ", regex=True)
    text = text.str.replace(r"\s{2,}", " ", regex=True)
    return text

  text = text.str.replace("\s{2,}", " ")


In [13]:
# Split the labelled data into train and validation subsets.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; stratifying on 'target' keeps the class
# ratio the same in both subsets.
train_df, valid_df = train_test_split(
    training_input_df,
    random_state=42,
    stratify=training_input_df["target"],
)
# Last expression of the cell, so the preview is actually rendered.
training_input_df.head()

Initialize the tokenizer

In [15]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Tokenizer that belongs to the 'roberta-base' checkpoint (any other tokenizer
# could be substituted here).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize every training sentence once; the per-sentence token lists can be
# used to inspect sequence lengths, e.g. max(len(s) for s in sentences_tokenized).
sentences = training_input_df["text"]
sentences_tokenized = list(map(tokenizer.tokenize, sentences))

In [17]:
# tokenizer = torchtext.data.get_tokenizer("basic_english")

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes text lazily and pairs it with its label.

    Args:
        dataframe: DataFrame providing a 'text' column and a 'target' column.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')`` and returning a mapping with
            'input_ids' and 'attention_mask' tensors whose first dimension is 1.
        max_length: Length every sequence is padded/truncated to (default 76,
            the value previously hard-coded in ``__getitem__``).
    """

    def __init__(self, dataframe, tokenizer, max_length=76):
        # Assigned up front (not inside the row loop) so the attribute exists
        # even when the frame has no rows.
        self.tokenizer = tokenizer
        self.max_length = max_length
        # (text, label) pairs; iterating the two columns avoids the per-row
        # Series construction that iterrows() performs.
        self.data = [
            (text, int(target))
            for text, target in zip(dataframe['text'], dataframe['target'])
        ]

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, target)`` for sample ``idx``."""
        text, target = self.data[idx]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading dimension of size 1; drop it so
        # the DataLoader can stack samples into a batch.
        inputs_ids = inputs['input_ids'].squeeze(0)
        attention_masks = inputs['attention_mask'].squeeze(0)
        return inputs_ids, attention_masks, target

In [18]:
# Wrap both splits so the DataLoaders can serve tokenized batches.
train_dataset = CustomDataset(train_df, tokenizer)
val_dataset = CustomDataset(valid_df, tokenizer)

batch_size = 32
# Training batches are reshuffled on every pass.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation is not shuffled: the validation loss is averaged per batch, so a
# fixed batch composition makes it comparable from one epoch to the next.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# Quick sanity check of one batch:
# inputs_ids, attention_masks, label = next(iter(train_dataloader))
# print(label.shape)

In [19]:
from transformers import AdamW, RobertaConfig

In [20]:
# Load RobertaForSequenceClassification from the 'roberta-base' checkpoint.
# Both instances below are created with the same settings.
_classifier_kwargs = dict(
    num_labels=2,                # two output labels (binary classification)
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

# Model that is trained.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', **_classifier_kwargs)

# Second instance that receives the weights of the best epoch (by validation loss).
best_model = RobertaForSequenceClassification.from_pretrained('roberta-base', **_classifier_kwargs)

Downloading pytorch_model.bin: 100%|██████████| 501M/501M [00:27<00:00, 18.0MB/s] 
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoin

In [21]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [22]:
# Move the model to `device` (CUDA when available, otherwise the CPU).
model = model.to(device)

In [23]:
# AdamW = Adam with decoupled weight decay.
# The PyTorch implementation is used here: the AdamW class exported by
# `transformers` is deprecated upstream in favour of torch.optim.AdamW.
optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-6,            # learning rate used for this run
    eps=1e-8,           # Adam epsilon
    weight_decay=0.05,  # decoupled weight-decay coefficient
)



In [24]:
from transformers import get_linear_schedule_with_warmup

# Upper bound on the number of training epochs.
epochs = 50
# The scheduler is stepped once per batch, so the schedule has to span
# (batches per epoch) * (epochs) steps.
total_steps = len(train_dataloader) * epochs
# Linear learning-rate schedule without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [25]:
# NOTE(review): this cell repeats the scheduler set-up of the cell directly
# above it (same import, same `epochs`, same arguments). Running it rebinds
# `scheduler` to a freshly created schedule on the same optimizer - confirm
# whether the duplicate is intended or can be deleted.
from transformers import get_linear_schedule_with_warmup

# Maximum number of training epochs.
epochs = 50
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler (linear schedule, no warm-up steps).
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [26]:
import time
import datetime

def format_time(elapsed):
    """Render a duration in seconds as a clock-style string.

    The value is rounded to the nearest whole second and formatted through
    ``datetime.timedelta`` (for example ``3661.4`` becomes ``'1:01:01'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [27]:
def plot_loss_accuracy(train_losses, val_losses, train_accuracies, val_accuracies, path=None):
    """Plot per-epoch losses and accuracies side by side.

    Args:
        train_losses: Sequence of training losses, one value per epoch.
        val_losses: Sequence of validation losses, one value per epoch.
        train_accuracies: Sequence of training accuracies, one value per epoch.
        val_accuracies: Sequence of validation accuracies, one value per epoch.
        path: Optional file path; when given, the figure is also saved there.

    Returns:
        None. The figure is left open on the current matplotlib backend.
    """
    # Imported locally because no cell of this notebook imports matplotlib;
    # without it every reference to `plt` below raises NameError.
    import matplotlib.pyplot as plt

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

    loss_ax.plot(train_losses, label='Train Loss')
    loss_ax.plot(val_losses, label='Validation Loss')
    loss_ax.set_title("Losses over Epochs")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()

    acc_ax.plot(train_accuracies, label='Train Accuracy')
    acc_ax.plot(val_accuracies, label='Validation Accuracy')
    acc_ax.set_title("Accuracies over Epochs")
    acc_ax.set_xlabel("Epoch")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend()

    plt.tight_layout()
    if path is not None:
        fig.savefig(path)

In [28]:
import numpy as np
from torch.nn import functional as F

def compute_acc(preds, labels):
    """Return the percentage of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D tensor of scores; the winning class is taken along dim 1.
        labels: Tensor of class indices, one per row of ``preds``.

    Returns:
        Accuracy as a Python float in percent.
    """
    predicted_classes = preds.data.argmax(dim=1)
    hits = predicted_classes.eq(labels.data).cpu().sum()
    return float(hits) / float(len(labels.data)) * 100.0

In [31]:

def compare_scalar(t_tensor, s):
    """Element-wise test of a tensor against a scalar.

    The scalar is compared directly, so the result lives on the same device as
    ``t_tensor``. This removes the dependency on the notebook-level ``device``
    variable and the numpy-to-tensor round trip that was done on every call.

    Args:
        t_tensor: Tensor to test.
        s: Python scalar to compare each element with.

    Returns:
        Boolean tensor shaped like ``t_tensor``; True where the element equals ``s``.
    """
    return t_tensor.eq(s)

In [30]:
def get_accuracy_stats(predictions, labels):
    """Count the confusion-matrix cells for binary labels (1 = positive, 0 = negative).

    Args:
        predictions: Tensor of predicted class ids.
        labels: Tensor of true class ids, aligned with ``predictions``.

    Returns:
        Tuple ``(true_positives, false_positives, false_negatives, true_negatives)``
        of Python numbers.
    """
    def _count(label_value, predicted_value):
        # Number of positions where label and prediction take the given values.
        both = torch.logical_and(
            compare_scalar(labels, label_value),
            compare_scalar(predictions, predicted_value),
        )
        return torch.sum(both.int()).item()

    true_positives = _count(1, 1)
    false_positives = _count(0, 1)
    false_negatives = _count(1, 0)
    true_negatives = _count(0, 0)
    return true_positives, false_positives, false_negatives, true_negatives


In [29]:
# ! alternative and prefered evals
def eval_model(model, loader):
    """Compute accuracy (in percent) and F1 of the positive class over ``loader``.

    Args:
        model: Callable invoked as ``model(input_ids, masks)``. Its result may be
            a logits tensor, an object exposing ``.logits``, or a sequence whose
            first item holds the logits.
        loader: Iterable of ``(input_ids, attention_masks, labels)`` batches.

    Returns:
        ``(accuracy, f1_score)``. Each value is 0.0 when its denominator is zero
        (no samples at all, or no positive label/prediction for F1).
    """
    print("Evaluating Model")
    true_positives = false_positives = false_negatives = true_negatives = 0

    # Evaluate with dropout disabled, then put the model back into the mode the
    # caller left it in.
    was_training = getattr(model, "training", False)
    if hasattr(model, "eval"):
        model.eval()
    try:
        with torch.no_grad():
            for a in tqdm(loader):
                input_ids = torch.as_tensor(a[0]).to(device)
                masks = torch.as_tensor(a[1]).to(device)
                labels = torch.as_tensor(a[2]).to(device)
                output = model(input_ids, masks)
                # torch.argmax needs the raw logits tensor, not a wrapper object.
                if isinstance(output, torch.Tensor):
                    logits = output
                elif hasattr(output, "logits"):
                    logits = output.logits
                else:
                    logits = output[0]
                predictions = torch.argmax(logits, dim=1).int()
                tp, fp, fn, tn = get_accuracy_stats(predictions, labels)
                true_positives += tp
                false_positives += fp
                false_negatives += fn
                true_negatives += tn
    finally:
        if was_training:
            model.train()

    f1_denominator = true_positives + 0.5 * (false_positives + false_negatives)
    f1_score = true_positives / f1_denominator if f1_denominator else 0.0
    total = true_positives + false_positives + false_negatives + true_negatives
    accuracy = (true_positives + true_negatives) / total * 100 if total else 0.0
    return accuracy, f1_score

In [33]:
# Early stopping: give up after `patience` consecutive epochs without a new
# best validation loss.
patience = 4
num_no_improvement = 0

# Per-epoch averages, kept so they can be plotted afterwards.
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []
# Print a progress line every `show_every` training batches.
show_every = 20

# Best (lowest) mean validation loss seen so far.
best_val_loss = float('inf')


for epoch_i in range(0, epochs):

    # Per-batch statistics of the current epoch.
    store_train_loss = []
    store_train_acc = []
    store_val_loss = []
    store_val_acc = []

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Move the batch to the compute device.
        batch = tuple(t.to(device) for t in batch)
        inputs_ids, attention_masks, labels = batch
        model.zero_grad()

        # No `labels` are passed to the model, so the first output item holds
        # the logits and the loss is computed explicitly below.
        outputs = model(inputs_ids,
                        attention_mask=attention_masks)

        loss = F.cross_entropy(outputs[0], labels)
        train_acc = compute_acc(outputs[0], labels)

        store_train_loss.append(loss.item())
        store_train_acc.append(train_acc)

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

        # Progress update every `show_every` batches (skipping batch 0).
        if step > 0 and step % show_every == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {} / {}.  Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('Training loss: %.3f  Training acc: %.3f'%(np.mean(store_train_loss[-show_every:]), np.mean(store_train_acc[-show_every:])) )

    # Epoch-level training loss and accuracy (mean over batches).
    train_losses.append(np.mean(store_train_loss))
    train_accuracies.append(np.mean(store_train_acc))

    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than during training.
    model.eval()

    for batch in val_dataloader:

        batch = tuple(t.to(device) for t in batch)
        val_inputs_ids, val_attention_masks, val_labels = batch

        # No gradients are needed for validation; skipping them saves memory
        # and time.
        with torch.no_grad():
            val_outputs = model(val_inputs_ids,
                                attention_mask=val_attention_masks)

        # Logits: the model output before any softmax is applied.
        val_logits = val_outputs[0]

        val_loss = F.cross_entropy(val_logits, val_labels)
        val_acc = compute_acc(val_logits, val_labels)

        store_val_loss.append(val_loss.item())
        store_val_acc.append(val_acc)

    # Epoch-level validation loss and accuracy (mean over batches).
    mean_val_loss = np.mean(store_val_loss)
    val_losses.append(mean_val_loss)
    val_accuracies.append(np.mean(store_val_acc))

    # Epoch summary; the epoch number is 1-based to match the header above.
    print("Epoch {}: Train Loss: {:.4f}, Validation Loss: {:.4f}, Train Accuracy: {:.2f}%, Validation Accuracy: {:.2f}%".format
    (epoch_i + 1, train_losses[-1], val_losses[-1], train_accuracies[-1], val_accuracies[-1]))

    # Keep a copy of the weights whenever the validation loss improves.
    if mean_val_loss < best_val_loss:
        best_val_loss = mean_val_loss
        best_model.load_state_dict(model.state_dict())
        num_no_improvement = 0
    else:
        num_no_improvement += 1
    if num_no_improvement >= patience:
        print("Early stopping: no validation improvement for {} epochs.".format(patience))
        break


print("")
print("Training complete!")
    


Training...
  Batch 20 / 179.  Elapsed: 0:01:35.
Training loss: 0.689  Training acc: 56.094


KeyboardInterrupt: 

In [None]:
# Plot the per-epoch curves collected by the training loop.
plot_loss_accuracy(
    train_losses=train_losses,
    val_losses=val_losses,
    train_accuracies=train_accuracies,
    val_accuracies=val_accuracies,
)

In [None]:
# Read the test split again and drop the columns that are not used.
test = pd.read_csv('../data/test.csv').drop(columns=['keyword', 'location'])

# Apply the same text cleaning as for the training data.
test["text"] = normalise_text(test["text"])

# Placeholder label so the frame has the 'target' column CustomDataset reads;
# it is replaced by the predictions in the inference cell.
test['target'] = 0

batch_size = 8
test_dataset = CustomDataset(test, tokenizer)
# Order must be preserved so predictions line up with the rows of `test`.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Run the best model (weights of the best validation epoch) over the test set
# and write the predictions to 'submission.csv'.
best_model.to(device)
best_model.eval()

store_test_acc = []

store_predictions = torch.zeros(len(test_dataset), dtype=torch.int)

# Running write position inside `store_predictions`. The final batch can be
# shorter than the others, so the slice start must be the number of samples
# already written - not `step * len(predictions)`, which misplaces a short last
# batch (it overwrites earlier predictions and leaves the tail at zero).
offset = 0

for step, batch in enumerate(test_dataloader):

    # Move the batch to the compute device.
    batch = tuple(t.to(device) for t in batch)
    inputs_ids, attention_masks, labels = batch
    with torch.no_grad():
        outputs = best_model(inputs_ids,
                             token_type_ids=None,
                             attention_mask=attention_masks)
    # Index of the highest logit per row = predicted class.
    predictions = torch.argmax(outputs[0], dim=1)
    n_predictions = len(predictions)
    store_predictions[offset: offset + n_predictions] = predictions.cpu()
    offset += n_predictions


store_predictions = store_predictions.detach().cpu().numpy()

test['target'] = store_predictions

# Reassign with errors='ignore' so re-running the cell does not raise KeyError
# once 'text' has already been removed.
test = test.drop(columns=['text'], errors='ignore')

test.to_csv('submission.csv', index=False)


print("Finished testing")

In [None]:
# Preview the first rows of the frame that was written to 'submission.csv'.
test.head()
