<a href="https://colab.research.google.com/github/j84m9/NLP-with-Disaster-Tweets/blob/main/Code/Submission_2_Fine_tuned_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive to Access Files in Current Notebook
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required libraries
%%capture
!pip install transformers
!pip install datasets
!pip install evaluate
#!pip install pyarrow==15.0.2

In [3]:
# Hugging Face Libraries
import transformers
import datasets
import evaluate

# Standard Libraries
import numpy as np
import pandas as pd

import os
# Check current Working Directory
os.getcwd()

'/content'

In [4]:
def split_df(df, val_size=0.05,test_size=0.1,seed_=42):
  """
  Splits a pandas DataFrame into training, validation, and test sets.

  Parameters:
  -----------
  df : pandas.DataFrame
      The input dataset to be split.

  val_size : float, optional, default=0.1
      The proportion of the dataset to include in the validation set.

  test_size : float, optional, default=0.1
      The proportion of the dataset to include in the test set.

  seed_ : int, optional, default=42
      Seed for random number generator to ensure reproducibility.

  Returns:
  --------
  tuple of pandas.DataFrame

  Example:
  --------
  >>> train_df, test_df, val_df = split_df(df, val_size=0.2, test_size=0.2, seed_=123)
    """
  from random import choices, seed

  seed(seed_)
  val_idxs = choices(df.index,k=round(len(df)*val_size))
  remainder = [idx for idx in df.index if idx not in val_idxs]

  seed(seed_)
  test_idxs = choices(remainder,k=round(len(df)*test_size))

  train_idxs = list(set(df.index)-set(test_idxs)-set(val_idxs))

  return df.iloc[train_idxs].reset_index(drop=True),df.iloc[test_idxs].reset_index(drop=True),df.iloc[val_idxs].reset_index(drop=True)



In [5]:
# Loading training data
temp = pd.read_csv("/content/drive/MyDrive/train.csv")
display(temp.head(3))

#Split data into train, test and validation sets
train, test, val = split_df(temp)

# Combining datasets into a DatasetDict object appropriate for our tokenizer and model
from datasets import Dataset, DatasetDict
raw_datasets = {'train': Dataset.from_pandas(train),
                'validation': Dataset.from_pandas(val),
                'test': Dataset.from_pandas(test)}
raw_datasets = DatasetDict(raw_datasets)
raw_datasets

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 6516
    })
    validation: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 381
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 761
    })
})

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Initialize tokenizer --- use distilbert here since there is limited GPU memory on colab
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Define a tokenization function
def tokenize_function(raw_data):
    return tokenizer(raw_data["text"], truncation=True)

# Tokenize the raw data
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Initialize a data collator with our tokenizer so that we cann dynamically pad by batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/6516 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Map:   0%|          | 0/761 [00:00<?, ? examples/s]

In [11]:
tokenized_datasets = tokenized_datasets.rename_column("target","labels")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 6516
    })
    validation: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 381
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 761
    })
})

In [12]:
#Here we define a compute metrics function that will be used to assess the model performance during training. For MRPC we use accuracy and F1-score.
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
#Initialize training arguments for the Trainer API --- only argument we need here is to provide directory where trained model will be saved. Default values should work well for basic fine-tuning
from transformers import TrainingArguments
training_args = TrainingArguments("distilbert-finetuned", evaluation_strategy="epoch") #evaluation_strategy="epoch" will compute our metrics after each epoch during training

# Instantiating the Model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4716,0.467481,0.818898,0.788991
2,0.317,0.586014,0.790026,0.766082
3,0.2331,0.66879,0.826772,0.79375


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

TrainOutput(global_step=2445, training_loss=0.3223454855702406, metrics={'train_runtime': 178.993, 'train_samples_per_second': 109.211, 'train_steps_per_second': 13.66, 'total_flos': 254460558704208.0, 'train_loss': 0.3223454855702406, 'epoch': 3.0})

In [15]:
# Zipping model checkpoints for download
!zip -r /content/distilbert-finetuning-tweets.zip /content/distilbert-finetuned

  adding: content/distilbert-finetuned/ (stored 0%)
  adding: content/distilbert-finetuned/checkpoint-2000/ (stored 0%)
  adding: content/distilbert-finetuned/checkpoint-2000/config.json (deflated 46%)
  adding: content/distilbert-finetuned/checkpoint-2000/scheduler.pt (deflated 55%)
  adding: content/distilbert-finetuned/checkpoint-2000/special_tokens_map.json (deflated 42%)
  adding: content/distilbert-finetuned/checkpoint-2000/trainer_state.json (deflated 63%)
  adding: content/distilbert-finetuned/checkpoint-2000/training_args.bin (deflated 51%)
  adding: content/distilbert-finetuned/checkpoint-2000/rng_state.pth (deflated 25%)
  adding: content/distilbert-finetuned/checkpoint-2000/vocab.txt (deflated 53%)
  adding: content/distilbert-finetuned/checkpoint-2000/model.safetensors (deflated 8%)
  adding: content/distilbert-finetuned/checkpoint-2000/optimizer.pt (deflated 28%)
  adding: content/distilbert-finetuned/checkpoint-2000/tokenizer_config.json (deflated 76%)
  adding: content/

In [26]:
#Get predictions from our trained model on test set
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

#To transform logits output by model into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis:
preds = np.argmax(predictions.predictions, axis=-1)

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=test.target.to_list())

(761, 2) (761,)


{'accuracy': 0.8081471747700394, 'f1': 0.762987012987013}

In [27]:
# Creating a Kaggle Submission
kaggle_sample = pd.read_csv("/content/drive/MyDrive/sample_submission.csv")

kaggle_sample.head(3)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0


In [29]:
kaggle_test = pd.read_csv("/content/drive/MyDrive/test.csv")
display(kaggle_test.head(3))

raw_datasets = {'kaggle_test': Dataset.from_pandas(kaggle_test)}
raw_datasets = DatasetDict(raw_datasets)
raw_datasets

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."


DatasetDict({
    kaggle_test: Dataset({
        features: ['id', 'keyword', 'location', 'text'],
        num_rows: 3263
    })
})

In [31]:
# Tokenize the raw test data
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

DatasetDict({
    kaggle_test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3263
    })
})

In [33]:
# Getting model predictions
predictions = trainer.predict(tokenized_datasets["kaggle_test"])
preds = np.argmax(predictions.predictions, axis=-1)

print(len(preds))

# Updating the target column of the sample submission using the model predictions
kaggle_sample['target'] = preds

3263


In [41]:
# Saving files locally from colab
from google.colab import files
kaggle_sample.to_csv('Submission 2.csv',index=False)
files.download('Submission 2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>