In [19]:
import os

import wandb
import pandas as pd
import numpy as np
from datasets import (load_metric, Dataset, DatasetDict)
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TextClassificationPipeline)
import torch

torch.manual_seed(0)

<torch._C.Generator at 0x1d51c60e5b0>

In [21]:
model = 'BERT'
out_dir = f'./{model}' 
model_name = "google-bert/bert-base-uncased"

In [20]:
# Save environment variables
%env WANDB_PROJECT= 'RAG Disinformation'
%env WANDB_NOTEBOOK_NAME= 'RAG Disinformation'
%env WANDB_WATCH=all

# Log in to wandb
wandb.login()

# Initialize wandb
wandb.init(project="RAG Disinformation",  name=model,   tags=["google-bert/bert-base-uncased", "baseline", "uncased"],    group="Transformers")



env: WANDB_PROJECT='RAG Disinformation'
env: WANDB_NOTEBOOK_NAME='RAG Disinformation'
env: WANDB_WATCH=all


0,1
eval/accuracy,▁█▃
eval/f1,▁█▃
eval/loss,█▁▂
eval/precision,▁█▂
eval/recall,▁█▃
eval/runtime,█▁▁
eval/samples_per_second,▁██
eval/steps_per_second,▁██
train/epoch,▁▁▁▅▅██
train/global_step,▁▁▁▅▅██

0,1
eval/accuracy,0.85
eval/f1,0.84655
eval/loss,0.29118
eval/precision,0.88462
eval/recall,0.85
eval/runtime,7.4493
eval/samples_per_second,2.685
eval/steps_per_second,0.403
train/epoch,3.0
train/global_step,30.0


In [22]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

train['Text'] = train['Text'].astype(str)
test['Text'] = test['Text'].astype(str)
test['Disinformation'] = test['Disinformation'].astype(str)
train['Disinformation'] = train['Disinformation'].astype(str)

In [23]:
cats = train['Disinformation'].unique()
n_labels = len(cats)
label2id = {}
id2label = {}
for i in range(len(cats)):
  label2id[cats[i]] = i
  id2label[i] = cats[i]

train = train.rename(columns={'Text': 'text', 'Disinformation': 'label'})
test = test.rename(columns={'Text': 'text', 'Disinformation': 'label'})
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train['label'] = train['label'].map(label2id)
test['label'] = test['label'].map(label2id)

train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)
my_dataset_dict = DatasetDict({"train":train_dataset,"test":test_dataset})

In [24]:
torch.cuda.empty_cache()

In [25]:
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(model_name, id2label = id2label, label2id = label2id)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=0) 
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=0) 

Map: 100%|██████████| 80/80 [00:00<00:00, 837.00 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 329.31 examples/s]


In [26]:
# Define the model 
model = AutoModelForSequenceClassification.from_pretrained(model_name, id2label = id2label, label2id = label2id,
                                                           num_labels=n_labels, ignore_mismatched_sizes=True) 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Define the metrics 

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric1 = load_metric("accuracy")
    accuracy = metric1.compute(predictions=predictions, references=labels)["accuracy"]
    if n_labels == 2:
      metric2 = load_metric("precision")
      metric3 = load_metric("recall")
      metric4 = load_metric("f1")
      precision = metric2.compute(predictions=predictions, references=labels, average = 'macro')["precision"]
      recall = metric3.compute(predictions=predictions, references=labels, average = 'macro')["recall"]
      f1 = metric4.compute(predictions=predictions, references=labels, average = 'macro')["f1"]
      return {"accuracy":accuracy, "precision": precision, "recall": recall, "f1":f1}
    elif n_labels > 2:
      return {"accuracy":accuracy}

In [28]:
n_epochs = 20

training_args = TrainingArguments(output_dir=out_dir, evaluation_strategy="epoch", 
                                  save_strategy = "epoch", save_total_limit = 2, 
                                  logging_strategy = "epoch",
                                  report_to="wandb", run_name=model,
                                  seed = 0, num_train_epochs = n_epochs,
                                  auto_find_batch_size = True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [29]:
for batch in trainer.get_train_dataloader():
    break

outputs = trainer.model.cpu()(**batch)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")
batch = {k: v.to(device) for k, v in batch.items()}

outputs = trainer.model.to(device)(**batch)

loss = outputs.loss
loss.backward()

trainer.create_optimizer()
trainer.optimizer.step()

Device: cpu


In [30]:
trainer.train()
wandb.finish()

  5%|▌         | 10/200 [01:14<22:22,  7.07s/it]

  lambda data: self._console_raw_callback("stderr", data),
                                                
  5%|▌         | 10/200 [01:14<22:22,  7.07s/it]

{'loss': 0.7234, 'grad_norm': 5.417157173156738, 'learning_rate': 4.75e-05, 'epoch': 1.0}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                

  lambda data: self._console_raw_callback("stderr", data),
[A[A                                          
  5%|▌         | 10/200 [01:21<22:22, 

{'eval_loss': 0.6414111256599426, 'eval_accuracy': 0.65, 'eval_precision': 0.7941176470588236, 'eval_recall': 0.65, 'eval_f1': 0.6011396011396011, 'eval_runtime': 6.9859, 'eval_samples_per_second': 2.863, 'eval_steps_per_second': 0.429, 'epoch': 1.0}


AttributeError: Can't pickle local object 'TorchHistory.add_log_parameters_hook.<locals>.<lambda>'

In [None]:
trainer.save_model()


In [None]:
# Evaluate the model
results = trainer.evaluate()
trainer.save_metrics('all',results)
print(results)

# Upload the model to HuggingFace

In [None]:
# Upload the model to huggingface
trainer.push_to_hub(f'storymodelers/{model}-disinformation', private = True)
