# M2D2 Baseline

## Import libraries

In [1]:
import torch

import numpy as np
import math

import datasets

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import set_seed
from transformers import EarlyStoppingCallback

In [2]:
# Display the installed transformers version so it is recorded next to the
# results produced by this notebook.
import transformers
transformers.__version__

'4.26.1'

## Config

In [3]:
# Seed the run up front so repeated executions start from the same state.
set_seed(718)

# Hub id of the pretrained model. Only the part after the last "/" is kept as
# the short name (used when naming the fine-tuning output directory).
model_checkpoint = "gpt2"
model_name = model_checkpoint.rsplit("/", 1)[-1]

# M2D2 sub-domain configurations available for evaluation.
# Alternative (economics) set kept for reference:
# list_dataset_checkpoints = ["econ.TH", "econ.EM", "econ.GN"]
list_dataset_checkpoints = [
    "Health_and_fitness__Exercise",
    "Health_and_fitness__Health_science",
    "Health_and_fitness__Human_medicine",
    "Health_and_fitness__Nutrition",
    "Health_and_fitness__Public_health",
    "Health_and_fitness__Self_care",
]

In [4]:
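# Legacy workaround, kept for reference only: a TrainingArguments subclass that
# selects the MPS device (Apple M1 Max) when it is available.
# NOTE: while this cell stays commented out, the name
# TrainingArgumentsWithMPSSupport is NOT defined anywhere in this notebook.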
# class TrainingArgumentsWithMPSSupport(TrainingArguments):
#     @property
#     def device(self) -> torch.device:
#         if torch.backends.mps.is_available():
#             return torch.device("mps")  
#         elif torch.cuda.is_available():
#             return torch.device("cuda")
#         else:
#             return torch.device("cpu")

## Tokeniser

In [5]:
# Tokenizer matching the pretrained checkpoint chosen in the config cell.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch.

    Each example is truncated to at most 1024 tokens, so any text beyond that
    limit is discarded before the examples are grouped into blocks.

    Args:
        examples: Batch mapping that provides a ``"text"`` entry.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=1024)

## Preparing the dataset

In [6]:
# TODO
# Number of tokens per training/evaluation block produced by group_texts.
block_size = 1000

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example sequences
            (one inner sequence per example). Must contain an ``"input_ids"``
            column, because the labels are derived from it.

    Returns:
        A dict with the same keys as ``examples`` plus ``"labels"``. Every value
        is a list of chunks holding exactly ``block_size`` items each. Items
        left over after the last full block are dropped. ``"labels"`` is a
        shallow copy of the ``"input_ids"`` chunk list.
    """
    # Flatten every column in one linear pass. The previous ``sum(lists, [])``
    # re-copied the growing accumulator for each example (quadratic time).
    concatenated_examples = {
        k: [item for example in examples[k] for item in example]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Causal LM: the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

## Train or Eval

In [None]:
# Per-dataset outputs, keyed by dataset name:
#   dict_eval_results -> raw metrics dict returned by trainer.evaluate()
#   dict_perplexity   -> exp(eval_loss), or None when no eval loss was reported
dict_eval_results = {}
dict_perplexity = {}

# Only request the MPS backend when this machine actually provides it, so the
# cell also runs on hosts without MPS instead of crashing.
use_mps = torch.backends.mps.is_available()

# To evaluate every sub-domain, iterate over list_dataset_checkpoints instead:
# for dataset_checkpoint in list_dataset_checkpoints:
for dataset_checkpoint in ["Health_and_fitness"]:
    # load dataset
    dataset_name = dataset_checkpoint.replace('.', '__')
    dataset = datasets.load_dataset("machelreid/m2d2", dataset_checkpoint)

    # tokenise dataset
    tokenized_dataset = dataset.map(
        tokenize_function, batched=True, num_proc=8, remove_columns=["text"]
    )

    # group text into fixed-size blocks
    lm_dataset = tokenized_dataset.map(
        group_texts,
        batched=True,
        batch_size=64,  # TODO
        num_proc=8,
    )

    # load model; Trainer moves it to the device selected by training_args,
    # so no manual .to(...) is needed here.
    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

    # set training arguments
    # (TrainingArguments is used directly: the custom
    # TrainingArgumentsWithMPSSupport subclass is not defined in this notebook,
    # and use_mps_device already covers MPS device selection.)
    training_args = TrainingArguments(
        f"{model_name}-finetuned-m2d2-{dataset_name}",
        learning_rate=5e-5,
        max_grad_norm=0.1,
        push_to_hub=False,
        max_steps=1_000_000,
        save_steps=1_000,
        eval_steps=1_000,
        evaluation_strategy="steps",
        warmup_steps=10_000,
        lr_scheduler_type="polynomial",
        adam_beta1=0.9,
        adam_beta2=0.99,
        adam_epsilon=1e-6,
        weight_decay=0.01,
        load_best_model_at_end=True,
        use_mps_device=use_mps,   # TODO
    )

    # NOTE(review): the "test" split drives both checkpoint selection (early
    # stopping + load_best_model_at_end) and the perplexity reported below, so
    # the reported number is optimistic. Consider selecting on a separate
    # held-out split - confirm which splits this dataset provides.
    trainer = Trainer(
        model=model,
        args=training_args,
        callbacks=[EarlyStoppingCallback()],  # default early-stopping settings
        train_dataset=lm_dataset["train"],
        eval_dataset=lm_dataset["test"]   # switched to the test split
    )

    # train model (comment out if not necessary)
    trainer.train()

    eval_results = trainer.evaluate()
    # Perplexity is the exponential of the mean evaluation loss.
    loss = eval_results.get("eval_loss")
    perplexity = math.exp(loss) if loss is not None else None

    dict_eval_results[dataset_name] = eval_results
    dict_perplexity[dataset_name] = perplexity

    print(f"{model_name} = Perplexity: {perplexity}")
    print(eval_results)

Found cached dataset m2d2 (/Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Health_and_fitness/0.0.0/eb235f33a5de3163c10549b7f63c906910539c8a8c0ec5ade1285ccbf5067d00)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Health_and_fitness/0.0.0/eb235f33a5de3163c10549b7f63c906910539c8a8c0ec5ade1285ccbf5067d00/cache-9fc4eeabe2741c88_*_of_00008.arrow
Loading cached processed dataset at /Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Health_and_fitness/0.0.0/eb235f33a5de3163c10549b7f63c906910539c8a8c0ec5ade1285ccbf5067d00/cache-a4b0ecac7bc283c1_*_of_00008.arrow
Loading cached processed dataset at /Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Health_and_fitness/0.0.0/eb235f33a5de3163c10549b7f63c906910539c8a8c0ec5ade1285ccbf5067d00/cache-88293f18cd12c497_*_of_00008.arrow
Loading cached processed dataset at /Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Health_and_fitness/0.0.0/eb235f33a5de3163c10549b7f63c906910539c8a8c0ec5ade1285ccbf5067d00/cache-5295766e498366dc_*_of_00008.arrow
Loading cached processed dataset at /Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Health_and

Step,Training Loss,Validation Loss


# Results

In [9]:
dict_perplexity

{'Health_and_fitness': 24.175176392925994}

In [11]:
dict_perplexity

{'Health_and_fitness__Exercise': 22.791927840699504,
 'Health_and_fitness__Health_science': 23.775267478176303,
 'Health_and_fitness__Human_medicine': 25.71238543363651,
 'Health_and_fitness__Nutrition': 21.19498467338595,
 'Health_and_fitness__Public_health': 25.834674628429834,
 'Health_and_fitness__Self_care': 25.788999492656735}

In [12]:
np.mean(list(dict_perplexity.values()))

24.183039924497475

In [20]:
dict_perplexity

{'Health_and_fitness': 47.986888179688385}

In [13]:
dict_perplexity

{'Health_and_fitness__Exercise': 35.101186384838314,
 'Health_and_fitness__Health_science': 35.904504477851646,
 'Health_and_fitness__Human_medicine': 38.50850571936783,
 'Health_and_fitness__Nutrition': 34.13129777051368,
 'Health_and_fitness__Public_health': 38.75826808053444,
 'Health_and_fitness__Self_care': 38.30255614306639}

In [9]:
dict_perplexity

{'Health_and_fitness__Exercise': 46.30568236233967,
 'Health_and_fitness__Health_science': 46.16138126407978,
 'Health_and_fitness__Human_medicine': 49.52238612622225,
 'Health_and_fitness__Nutrition': 46.18263822333697,
 'Health_and_fitness__Public_health': 50.16925618099956,
 'Health_and_fitness__Self_care': 49.67367623750689}

In [22]:
np.mean(list(dict_perplexity.values()))

48.002503399080844

In [13]:
dict_perplexity

{'Health_and_fitness__Exercise': 46.33183278324835,
 'Health_and_fitness__Health_science': 46.036602150949726,
 'Health_and_fitness__Human_medicine': 48.79269617748292,
 'Health_and_fitness__Nutrition': 45.17124924902259,
 'Health_and_fitness__Public_health': 49.91661428786924,
 'Health_and_fitness__Self_care': 49.98512799517897}

In [14]:
np.mean(list(dict_perplexity.values()))

47.705687107291965