In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

## Preprocessing

In [2]:
events = [
    "canada_wildfires_2016", "cyclone_idai_2019", "ecuador_earthquake_2016",
    "hurricane_harvey_2017", "hurricane_irma_2017", "hurricane_maria_2017",
    "hurricane_matthew_2016", "italy_earthquake_aug_2016", "kaikoura_earthquake_2016",
    "puebla_mexico_earthquake_2017", "srilanka_floods_2017"
]

dataframes = {event: {} for event in events}

# Loop through each event and load train, dev, and test sets
for event in events:
    for set_type in ['train', 'dev', 'test']:
        file_path = f'../data/HumAID/{event}/{event}_{set_type}.tsv'
        # Load the dataset and store it in the dictionary under the appropriate event and set type
        dataframes[event][set_type] = pd.read_csv(file_path, sep='\t')

In [3]:
dataframes['canada_wildfires_2016']['test']

Unnamed: 0,tweet_id,tweet_text,class_label
0,728674116773904384,RT @FoothillsFCU23: In response the to the #Fo...,rescue_volunteering_or_donation_effort
1,729787427829612544,Redcross is offering charitable donation recei...,rescue_volunteering_or_donation_effort
2,730510385544085505,RT @globeandmail: Red Cross to transfer $50-mi...,rescue_volunteering_or_donation_effort
3,733705874594746368,Live: Emergency operations briefing on north A...,other_relevant_information
4,730606066023665665,"$9bn fire damage to Fort McMurray, ‘the beast’...",infrastructure_and_utility_damage
...,...,...,...
440,729062171993374720,I feel sad Mom &amp; I r donating money to the...,rescue_volunteering_or_donation_effort
441,728733841230311425,This is the best way to help. The Red Cross wi...,rescue_volunteering_or_donation_effort
442,730078294259961856,RT @TheGrayGroup: Donations for Fort McMurray ...,rescue_volunteering_or_donation_effort
443,730002964035723264,Local volleyball team members raise money to h...,rescue_volunteering_or_donation_effort


In [4]:
train_dfs = []
dev_dfs = []
test_dfs = []

# Iterate over each event and set type, adding the event name as a column
for event, sets in dataframes.items():
    for set_type, df in sets.items():
        df['event'] = event  # Add the event name as a column
        
        # Append the DataFrame to the appropriate list based on the set type
        if set_type == 'train':
            train_dfs.append(df)
        elif set_type == 'dev':
            dev_dfs.append(df)
        elif set_type == 'test':
            test_dfs.append(df)

# Concatenate the lists of DataFrames into three large DataFrames
train_df = pd.concat(train_dfs, ignore_index=True)
val_df = pd.concat(dev_dfs, ignore_index=True)
test_df = pd.concat(test_dfs, ignore_index=True)

In [5]:
# train_df = train_df.drop(columns=['tweet_id'])

## Topic Classification

In [6]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

  torch.utils._pytree._register_pytree_node(


In [7]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938650727272034, 0.003273798618465662, 0.0028610429726541042]}

In [8]:
sequence_to_classify = "This pole is tilted a little"
candidate_labels = ["requests_or_urgent_needs", "infrastructure_and_utility_damage"]
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'This pole is tilted a little',
 'labels': ['infrastructure_and_utility_damage', 'requests_or_urgent_needs'],
 'scores': [0.7147158980369568, 0.2852840721607208]}

# Fine tuning

In [9]:
from transformers import BartTokenizerFast
import random
import torch
from datasets import Dataset, load_metric

In [10]:
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

In [11]:
tokenizer

BartTokenizerFast(name_or_path='facebook/bart-large-mnli', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [12]:
## create list of unique classes
label_to_int = list(train_df['class_label'].unique())
template = "This is {}"

In [13]:
train_ds = Dataset.from_pandas(train_df, split="train")
test_ds = Dataset.from_pandas(test_df, split="test")

In [14]:
train_ds

Dataset({
    features: ['tweet_id', 'tweet_text', 'class_label', 'event'],
    num_rows: 28802
})

In [15]:
def create_input_sequence(sample):
  text = sample["tweet_text"]
  label = sample["class_label"][0]
  contradiction_label = random.choice([x for x in label_to_int if x != label])
  encoded_sequence = tokenizer(text * 2, [template.format(label), template.format(contradiction_label)], truncation = True, padding = 'max_length')
  encoded_sequence["labels"] = [2, 0]
  encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
  return encoded_sequence


train_dataset = train_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class_label", "tweet_text", 'tweet_id', 'event'])
test_dataset = test_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class_label", "tweet_text", 'tweet_id', 'event'])



  0%|          | 0/28802 [00:00<?, ?ba/s]

  0%|          | 0/8157 [00:00<?, ?ba/s]

In [16]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
    num_rows: 57604
})

In [17]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

In [18]:
# training
def compute_metrics(p: EvalPrediction):
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(preds, axis = 1)
  result = {}
  result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
  result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
  return result

In [19]:
model_directory = '/Users/brandonluong/Pole-Validation/notebooks'
training_args = TrainingArguments(
  output_dir = model_directory,      # Output directory
  num_train_epochs = 32,             # Total number of training epochs
  per_device_train_batch_size = 16,  # Batch size per device during training
  per_device_eval_batch_size = 64,   # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.01,               # Strength of weight decay
)

model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(label_to_int), ignore_mismatched_sizes = True)

trainer = Trainer(
  model = model,                     # The instantiated model to be trained
  args = training_args,              # Training arguments, defined above
  compute_metrics = compute_metrics, # A function to compute the metrics
  train_dataset = train_dataset,     # Training dataset
  eval_dataset = test_dataset,       # Evaluation dataset
  tokenizer = tokenizer              # The tokenizer that was used
)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([10]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([10, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
trainer.train()

  0%|          | 0/115232 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 8.39 GB, other allocations: 1.27 GB, max allowed: 9.07 GB). Tried to allocate 1024.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).