In [1]:
#Connect with GDrive (Do not run if not used in Colab)
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/RAG Disinformation/models')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install wandb -q
!pip install datasets -q
!pip install accelerate -U
!pip install scikit-learn
!pip install evaluate



In [3]:
import os

import wandb
import pandas as pd
import numpy as np
from datasets import (load_metric, Dataset, DatasetDict)
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TextClassificationPipeline, DataCollatorForLanguageModeling)
import torch
import evaluate

torch.manual_seed(0)

<torch._C.Generator at 0x7b7e9052d370>

In [15]:
# model = 'BERT'
model_name = "google-bert/bert-base-uncased"

# model = 'roBERTa'
# model_name = "cardiffnlp/twitter-roberta-base-sentiment"

model = 'MiniLM'
model_name = "microsoft/MiniLM-L12-H384-uncased"

out_dir = f'./{model}'

In [16]:
# # Save environment variables
# %env WANDB_PROJECT= 'RAG Disinformation'
# %env WANDB_NOTEBOOK_NAME= 'RAG Disinformation'
# %env WANDB_WATCH=all

# # Log in to wandb
# wandb.login()

# # Initialize wandb
# wandb.init(project="RAG Disinformation", name=model, tags=[model_name], group="Transformers")

In [17]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

train['Text'] = train['Text'].astype(str)
test['Text'] = test['Text'].astype(str)
test['Disinformation'] = test['Disinformation'].astype(str)
train['Disinformation'] = train['Disinformation'].astype(str)

In [18]:
cats = train['Disinformation'].unique()
n_labels = len(cats)
label2id = {}
id2label = {}
for i in range(len(cats)):
  label2id[cats[i]] = i
  id2label[i] = cats[i]

train = train.rename(columns={'Text': 'text', 'Disinformation': 'label'})
test = test.rename(columns={'Text': 'text', 'Disinformation': 'label'})
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train['label'] = train['label'].map(label2id)
test['label'] = test['label'].map(label2id)

train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)
my_dataset_dict = DatasetDict({"train":train_dataset,"test":test_dataset})

In [19]:
torch.cuda.empty_cache()

In [20]:
# # Tokenize the dataset (FOR BERT)
# tokenizer = AutoTokenizer.from_pretrained(model_name, id2label = id2label, label2id = label2id)

# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)


# Tokenize the dataset (FOR roBERTa and MiniLM)
tokenizer = AutoTokenizer.from_pretrained(model_name, id2label = id2label, label2id = label2id, max_len = 512)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=tokenizer.model_max_length)

tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=0)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=0)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [21]:
# Define the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, id2label = id2label, label2id = label2id,
                                                           num_labels=n_labels, ignore_mismatched_sizes=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Define the metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy_metric = evaluate.load("accuracy")
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    if n_labels == 2:
        precision_metric = evaluate.load("precision")
        recall_metric = evaluate.load("recall")
        f1_metric = evaluate.load("f1")

        precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
        recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
        f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]

        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    elif n_labels > 2:
        return {"accuracy": accuracy}

In [23]:
# n_epochs = 7
# n_epochs = 7
n_epochs = 20

# training_args = TrainingArguments(output_dir=out_dir, evaluation_strategy="epoch",
#                                   save_strategy = "epoch", save_total_limit = 2,
#                                   logging_strategy = "epoch",
#                                   report_to="wandb", run_name=model,
#                                   seed = 0, num_train_epochs = n_epochs,
#                                   auto_find_batch_size = True)



training_args = TrainingArguments(output_dir=out_dir, evaluation_strategy="epoch",
                                  save_strategy = "epoch", save_total_limit = 2,
                                  logging_strategy = "epoch",
                                  seed = 0, num_train_epochs = n_epochs,
                                  auto_find_batch_size = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [24]:
# for batch in trainer.get_train_dataloader():
#     break

# outputs = trainer.model.cpu()(**batch)

# Ensure the model is moved to the GPU
trainer.model.to('cuda')

for batch in trainer.get_train_dataloader():
    # Move each tensor in the batch to the GPU
    batch = {k: v.to('cuda') for k, v in batch.items()}
    break

# Compute the outputs using the model and batch on GPU
outputs = trainer.model(**batch)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")
batch = {k: v.to(device) for k, v in batch.items()}

outputs = trainer.model.to(device)(**batch)

loss = outputs.loss
loss.backward()

trainer.create_optimizer()
trainer.optimizer.step()

Device: cuda


In [25]:
import pandas as pd
from tqdm.auto import tqdm

# Initialize a list to store results after each epoch.
results_data = []

# Move the model to the correct device.
trainer.model.to(device)

# Iterate over epochs.
for epoch in range(n_epochs):
    # Training phase.
    trainer.model.train()
    total_train_loss = 0
    num_train_batches = 0
    train_progress_bar = tqdm(trainer.get_train_dataloader(), desc=f"Training (Epoch {epoch+1}/{n_epochs})")
    for batch in train_progress_bar:
        # Move the batch to the correct device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = trainer.model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        num_train_batches += 1
        loss.backward()
        trainer.optimizer.step()
        trainer.optimizer.zero_grad()

    avg_train_loss = total_train_loss / num_train_batches

    # Evaluation phase.
    trainer.model.eval()
    total_eval_loss = 0
    num_eval_batches = 0
    all_logits = []
    all_labels = []
    eval_progress_bar = tqdm(trainer.get_eval_dataloader(), desc=f"Evaluating (Epoch {epoch+1}/{n_epochs})")
    for batch in eval_progress_bar:
        with torch.no_grad():
            # Move the batch to the correct device.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = trainer.model(**batch)
            logits = outputs.logits
            loss = outputs.loss
            total_eval_loss += loss.item()
            num_eval_batches += 1
            all_logits.append(logits.cpu().numpy())
            all_labels.append(batch['labels'].cpu().numpy())

    avg_eval_loss = total_eval_loss / num_eval_batches

    # Safely handle the concatenation of logits and labels.
    all_logits = np.vstack(all_logits)
    all_labels = np.concatenate(all_labels)

    # Compute metrics.
    eval_pred = (all_logits, all_labels)
    metrics = compute_metrics(eval_pred)

    # Print metrics.
    print(f"Epoch {epoch+1}/{n_epochs} - Training Loss: {avg_train_loss:.4f}, Evaluation Loss: {avg_eval_loss:.4f}, Metrics: {metrics}")

    # Store results.
    epoch_results = {
        'epoch': epoch + 1,
        'train_loss': avg_train_loss,
        'eval_loss': avg_eval_loss,
    }
    epoch_results.update(metrics)
    results_data.append(epoch_results)

# Convert the results list of dictionaries to a DataFrame.
results_df = pd.DataFrame(results_data)


Training (Epoch 1/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 1/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/20 - Training Loss: 0.7042, Evaluation Loss: 0.6950, Metrics: {'accuracy': 0.5, 'precision': 0.25, 'recall': 0.5, 'f1': 0.3333333333333333}


  _warn_prf(average, modifier, msg_start, len(result))


Training (Epoch 2/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 2/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 2/20 - Training Loss: 0.6828, Evaluation Loss: 0.6600, Metrics: {'accuracy': 0.6, 'precision': 0.7777777777777778, 'recall': 0.6, 'f1': 0.5238095238095238}


Training (Epoch 3/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 3/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 3/20 - Training Loss: 0.5217, Evaluation Loss: 0.4544, Metrics: {'accuracy': 0.8, 'precision': 0.8125, 'recall': 0.8, 'f1': 0.797979797979798}


Training (Epoch 4/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 4/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 4/20 - Training Loss: 0.4354, Evaluation Loss: 0.2215, Metrics: {'accuracy': 0.9, 'precision': 0.9, 'recall': 0.9, 'f1': 0.9}


Training (Epoch 5/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 5/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 5/20 - Training Loss: 0.2573, Evaluation Loss: 0.5473, Metrics: {'accuracy': 0.75, 'precision': 0.8333333333333333, 'recall': 0.75, 'f1': 0.7333333333333334}


Training (Epoch 6/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 6/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 6/20 - Training Loss: 0.4980, Evaluation Loss: 0.8851, Metrics: {'accuracy': 0.7, 'precision': 0.8125, 'recall': 0.7, 'f1': 0.6703296703296704}


Training (Epoch 7/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 7/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 7/20 - Training Loss: 0.4736, Evaluation Loss: 0.3204, Metrics: {'accuracy': 0.85, 'precision': 0.8535353535353536, 'recall': 0.8500000000000001, 'f1': 0.849624060150376}


Training (Epoch 8/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 8/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 8/20 - Training Loss: 0.2422, Evaluation Loss: 0.2795, Metrics: {'accuracy': 0.9, 'precision': 0.9166666666666667, 'recall': 0.9, 'f1': 0.898989898989899}


Training (Epoch 9/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 9/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 9/20 - Training Loss: 0.2065, Evaluation Loss: 0.5603, Metrics: {'accuracy': 0.8, 'precision': 0.8571428571428572, 'recall': 0.8, 'f1': 0.7916666666666665}


Training (Epoch 10/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 10/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 10/20 - Training Loss: 0.2466, Evaluation Loss: 0.1854, Metrics: {'accuracy': 0.95, 'precision': 0.9545454545454546, 'recall': 0.95, 'f1': 0.949874686716792}


Training (Epoch 11/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 11/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 11/20 - Training Loss: 0.1440, Evaluation Loss: 0.2217, Metrics: {'accuracy': 0.9, 'precision': 0.9166666666666667, 'recall': 0.9, 'f1': 0.898989898989899}


Training (Epoch 12/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 12/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 12/20 - Training Loss: 0.1218, Evaluation Loss: 0.2181, Metrics: {'accuracy': 0.9, 'precision': 0.9166666666666667, 'recall': 0.9, 'f1': 0.898989898989899}


Training (Epoch 13/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 13/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 13/20 - Training Loss: 0.0408, Evaluation Loss: 0.0390, Metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Training (Epoch 14/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 14/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 14/20 - Training Loss: 0.0299, Evaluation Loss: 0.0278, Metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Training (Epoch 15/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 15/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 15/20 - Training Loss: 0.0233, Evaluation Loss: 0.0221, Metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Training (Epoch 16/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 16/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 16/20 - Training Loss: 0.0185, Evaluation Loss: 0.0177, Metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Training (Epoch 17/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 17/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 17/20 - Training Loss: 0.0545, Evaluation Loss: 0.9099, Metrics: {'accuracy': 0.8, 'precision': 0.8571428571428572, 'recall': 0.8, 'f1': 0.7916666666666665}


Training (Epoch 18/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 18/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 18/20 - Training Loss: 0.0903, Evaluation Loss: 0.1638, Metrics: {'accuracy': 0.95, 'precision': 0.9545454545454546, 'recall': 0.95, 'f1': 0.949874686716792}


Training (Epoch 19/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 19/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 19/20 - Training Loss: 0.0381, Evaluation Loss: 0.5725, Metrics: {'accuracy': 0.85, 'precision': 0.8846153846153846, 'recall': 0.85, 'f1': 0.8465473145780051}


Training (Epoch 20/20):   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating (Epoch 20/20):   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 20/20 - Training Loss: 0.0555, Evaluation Loss: 0.6913, Metrics: {'accuracy': 0.8, 'precision': 0.8571428571428572, 'recall': 0.8, 'f1': 0.7916666666666665}


In [26]:
# wandb.finish()

In [27]:
trainer.save_model()


In [28]:
# Evaluate the model
results = trainer.evaluate()
trainer.save_metrics('all',results)
print(results)

[34m[1mwandb[0m: Currently logged in as: [33mjosephsf7[0m ([33mstorymodelers[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 0.6225994229316711, 'eval_accuracy': 0.8, 'eval_precision': 0.8571428571428572, 'eval_recall': 0.8, 'eval_f1': 0.7916666666666665, 'eval_runtime': 2.2076, 'eval_samples_per_second': 9.06, 'eval_steps_per_second': 1.359}


# Upload the model to HuggingFace

In [29]:
# Upload the model to huggingface
trainer.push_to_hub(f'storymodelers/{model}-disinformation', private = True)


ValueError: Token is required (write-access action) but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.