In [1]:
import wandb
# wandb.login()
wandb.init(project="PAN", entity="jueri")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjueri[0m. Use [1m`wandb login --relogin`[0m to force relogin


# BERT Sentence Classifyer
## Prepare Dataset

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Dict

In [3]:
def load_dataset(
    fields: List[str], 
    files: Dict[str, str]={"train": "Data/webis-clickbait-22/train.jsonl", "validation":"Data/webis-clickbait-22/validation.jsonl"}) -> pd.DataFrame:
    mapping: Dict[str, str]={'passage': 0, 'phrase':1, 'multi':2}

    dataset = {}
    
    def encode_label(label: str):
        return mapping[label]

    def load_data(file: str):
        df = pd.read_json(file, lines=True)

        data = []
        for _, i in df.iterrows():
            text = ""
            for field in fields:
                if isinstance(i[field], list):
                    text += ' '.join(i[field])
                elif isinstance(field, str):
                    text += i[field]
                else:
                    raise NotImplemented

            data.append({
                "text": text,
                "label": encode_label(i["tags"][0])})
        return data


    for split in list(files.keys()):
        dataset[split] = load_data(files[split])

        
    return dataset

In [4]:
fields=["postText", "targetTitle", "targetParagraphs"]
dataset = load_dataset(fields=fields)

wandb.log({'fields': str(fields)})

In [5]:
from datasets import Dataset

dataset_train = Dataset.from_list(dataset["train"])
dataset_validation = Dataset.from_list(dataset["validation"])

  from .autonotebook import tqdm as notebook_tqdm


## Preprocess Dataset

In [6]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
tokenized_data_train = dataset_train.map(preprocess_function, batched=True)
tokenized_data_validation = dataset_validation.map(preprocess_function, batched=True)

 75%|███████▌  | 3/4 [00:01<00:00,  2.19ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]


In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Prepare Model

In [10]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
import numpy as np
import evaluate

metric = evaluate.load("f1")
clf_metrics = evaluate.combine(["accuracy", "f1"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="wandb",
    logging_steps=10
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_train,
    eval_dataset=tokenized_data_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3200
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 109484547
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.0544
20,1.0306
30,1.0781
40,1.0412
50,1.0301
60,1.0041
70,1.0619
80,1.039
90,1.0455
100,1.0224


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=2000, training_loss=0.5816252614408731, metrics={'train_runtime': 572.0541, 'train_samples_per_second': 27.969, 'train_steps_per_second': 3.496, 'total_flos': 4207948222840992.0, 'train_loss': 0.5816252614408731, 'epoch': 5.0})

In [14]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 8


{'eval_loss': 1.1840167045593262,
 'eval_f1': 0.6696366020170901,
 'eval_runtime': 9.0334,
 'eval_samples_per_second': 88.56,
 'eval_steps_per_second': 11.07,
 'epoch': 5.0}

In [15]:
wandb.finish()

0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▇▇█▇▇▆▇▆▆▇▅▅▇▅▄▄▄▄▃▄▄▄▂▂▂▂▃▃▂▂▂▃▁▁▂▁▁▂
train/total_flos,▁

0,1
eval/f1,0.66964
eval/loss,1.18402
eval/runtime,9.0334
eval/samples_per_second,88.56
eval/steps_per_second,11.07
fields,"['postText', 'target..."
train/epoch,5.0
train/global_step,2000
train/learning_rate,0.0
train/loss,0.3466
