모델은 반드시 gpt-2 데이터 다르게

# SCRIPT

## Import Library and Config

### library

In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
from datasets import load_dataset
from tqdm import tqdm
import json
import torch
import argparse
from datetime import date, datetime
from os import listdir

In [2]:
from itertools import chain

### config

In [3]:
# model_path_format = "/rds/general/user/jj1122/home/projects/m2d2/dataset/{model_id}/models"
# ckpt_path_format = "/checkpoint-{ckpt}"
# cache_dir = "/rds/general/user/jj1122/ephemeral/.cache/huggingface"
# General_referece
# list_of_datasets = [
#     "cs_l1",
#     "nlin_l1",
#     "Health_and_fitness",
#     "Natural_and_physical_sciences",
#     "Religion_and_belief_systems",
#     "Culture_and_the_arts",
#     "General_referece",
#     "econ_l1",
#     "History_and_events",
#     "Human_activites",
#     "Mathematics_and_logic",
# #     "astro-ph_l1",
# #     "cond-mat_l1",
#     "eess_l1",
# #     "math_l1",
# #     "physics_l1 (ERROR)",
#     "q-bio_l1",
#     "q-fin_l1",
#     "stat_l1",
#     "Philosophy",
#     "Philosophy_and_thinking",
#     "Society_and_social_sciences",
#     "Technology_and_applied_sciences",

# ]

device_id = "mps"
today_dt = date.today().strftime("%y%m%d")

model_id = "gpt2"
model_path = "gpt2"
model_type = "zsh"

output_file = f"./output_logs/{today_dt}_{model_type}.json"
ds = "Religion_and_belief_systems"
stride = 1024

## Load Model, Tokeniser, Dataset

In [4]:
model = GPT2LMHeadModel.from_pretrained(model_path).to(device_id)
model.eval()
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

dataset = load_dataset(
    "machelreid/m2d2", ds#, cache_dir=cache_dir
)

Found cached dataset m2d2 (/Users/joon/.cache/huggingface/datasets/machelreid___m2d2/Religion_and_belief_systems/0.0.0/eb235f33a5de3163c10549b7f63c906910539c8a8c0ec5ade1285ccbf5067d00)


  0%|          | 0/3 [00:00<?, ?it/s]

## Tokenisation & Grouping Text

In [5]:
dict_input_ids = {}

for gubun in ["validation"]: # ["train", "validation", "test"]:
    if gubun == "train": continue

    # TOKENISATION
    encodings = tokenizer("\n".join(dataset[gubun]["text"]), return_tensors="pt")

    # GROUPING TEXT
    with torch.no_grad():
        max_length = model.config.n_positions
        nlls = []
        list_input_ids = []
        for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, encodings.input_ids.size(1))
            trg_len = end_loc - i  # may be different from stride on last loop
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device_id)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100
            list_input_ids.append(input_ids)


Token indices sequence length is longer than the specified maximum sequence length for this model (5516181 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████| 5387/5387 [00:02<00:00, 2377.10it/s]


# Results
## Raw Dataset

In [6]:
dataset["validation"]["text"][:5]

['',
 'William Henry Willimon',
 '',
 'William Henry Willimon (born May 15, 1946) is an American theologian and bishop in the United Methodist Church, retired, who for eight years served the North Alabama Conference. He is currently Professor of the Practice of Christian Ministry and Director of the Doctor of Ministry program at Duke Divinity School. He is former Dean of the Chapel at Duke University and is considered by many as one of America\'s best-known and most influential preachers. A Pulpit & Pew Research on Pastoral Leadership survey determined that he was one of the two most frequently read writers by pastors in mainline Protestantism alongside the Roman Catholic writer Henri Nouwen. His books have sold over a million copies. He is also Editor-At-Large of "The Christian Century". His 2019 memoir "Accidental Preacher" was released to wide acclaim, described by Justo L. Gonzalez as "An exceptional example of theology at its best."',
 'Biography.']

## Tokenisation

In [7]:
encodings['input_ids'][0][:10]

tensor([  198, 17121,  8616,  2561, 20473,   198,   198, 17121,  8616,  2561])

In [8]:
print(tokenizer.convert_ids_to_tokens(encodings['input_ids'][0][:10]))

['Ċ', 'William', 'ĠHenry', 'ĠWill', 'imon', 'Ċ', 'Ċ', 'William', 'ĠHenry', 'ĠWill']


In [9]:
len(list(encodings['input_ids'][0]))

5516181

## Grouping Text

In [14]:
list_input_ids[0][:10]

tensor([[  198, 17121,  8616,  ..., 32390,  5535,    11]], device='mps:0')

In [16]:
print(tokenizer.convert_ids_to_tokens(list_input_ids[0][0][:10]))

['Ċ', 'William', 'ĠHenry', 'ĠWill', 'imon', 'Ċ', 'Ċ', 'William', 'ĠHenry', 'ĠWill']


In [28]:
len(list_input_ids)

5387

# Script to Trainer pre-processing

## Tokenisation

In [22]:
def tokenize_function(examples):
    output = tokenizer(examples['text'])
    # clm input could be much much longer than block_size
    return output
    
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=8,
    remove_columns=['text'],
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)


Running tokenizer on dataset (num_proc=8):   0%|          | 0/1378270 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=8):   0%|          | 0/108776 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=8):   0%|          | 0/108776 [00:00<?, ? examples/s]

In [36]:
len(list(chain(*(tokenized_datasets['validation']['input_ids']))))

5414308

## group_texts

In [70]:
block_size = tokenizer.model_max_length
if block_size > 1024:
    block_size = 1024

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=8,
    load_from_cache_file=True,
    desc=f"Grouping texts in chunks of {block_size}",
)

                                                                                                       

## Results

### Raw dataset

In [71]:
dataset["validation"]["text"][:5]

['',
 'William Henry Willimon',
 '',
 'William Henry Willimon (born May 15, 1946) is an American theologian and bishop in the United Methodist Church, retired, who for eight years served the North Alabama Conference. He is currently Professor of the Practice of Christian Ministry and Director of the Doctor of Ministry program at Duke Divinity School. He is former Dean of the Chapel at Duke University and is considered by many as one of America\'s best-known and most influential preachers. A Pulpit & Pew Research on Pastoral Leadership survey determined that he was one of the two most frequently read writers by pastors in mainline Protestantism alongside the Roman Catholic writer Henri Nouwen. His books have sold over a million copies. He is also Editor-At-Large of "The Christian Century". His 2019 memoir "Accidental Preacher" was released to wide acclaim, described by Justo L. Gonzalez as "An exceptional example of theology at its best."',
 'Biography.']

### Tokenisation

In [72]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1378270
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 108776
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 108776
    })
})

In [73]:
print(tokenized_datasets["validation"]["input_ids"][:5])

[[], [17121, 8616, 2561, 20473], [], [17121, 8616, 2561, 20473, 357, 6286, 1737, 1315, 11, 22717, 8, 318, 281, 1605, 37275, 666, 290, 24233, 287, 262, 1578, 38029, 4564, 11, 9880, 11, 508, 329, 3624, 812, 4983, 262, 2258, 9266, 8785, 13, 679, 318, 3058, 8129, 286, 262, 19939, 286, 4302, 9475, 290, 5890, 286, 262, 9356, 286, 9475, 1430, 379, 11083, 33170, 3961, 13, 679, 318, 1966, 11325, 286, 262, 32939, 379, 11083, 2059, 290, 318, 3177, 416, 867, 355, 530, 286, 2253, 338, 1266, 12, 4002, 290, 749, 14212, 662, 17892, 13, 317, 21624, 15544, 1222, 21805, 4992, 319, 11303, 6864, 26935, 5526, 5295, 326, 339, 373, 530, 286, 262, 734, 749, 6777, 1100, 8786, 416, 43978, 287, 50114, 25310, 1042, 7848, 262, 7993, 7835, 6260, 44485, 44599, 21006, 13, 2399, 3835, 423, 2702, 625, 257, 1510, 9088, 13, 679, 318, 635, 12058, 12, 2953, 12, 21968, 286, 366, 464, 4302, 13641, 1911, 2399, 13130, 24649, 366, 17320, 35182, 3771, 3493, 1, 373, 2716, 284, 3094, 21684, 11, 3417, 416, 2329, 78, 406, 13, 24416, 

In [74]:
print(tokenizer.convert_ids_to_tokens(chain(*tokenized_datasets["validation"]["input_ids"][:5])))

['William', 'ĠHenry', 'ĠWill', 'imon', 'William', 'ĠHenry', 'ĠWill', 'imon', 'Ġ(', 'born', 'ĠMay', 'Ġ15', ',', 'Ġ1946', ')', 'Ġis', 'Ġan', 'ĠAmerican', 'Ġtheolog', 'ian', 'Ġand', 'Ġbishop', 'Ġin', 'Ġthe', 'ĠUnited', 'ĠMethodist', 'ĠChurch', ',', 'Ġretired', ',', 'Ġwho', 'Ġfor', 'Ġeight', 'Ġyears', 'Ġserved', 'Ġthe', 'ĠNorth', 'ĠAlabama', 'ĠConference', '.', 'ĠHe', 'Ġis', 'Ġcurrently', 'ĠProfessor', 'Ġof', 'Ġthe', 'ĠPractice', 'Ġof', 'ĠChristian', 'ĠMinistry', 'Ġand', 'ĠDirector', 'Ġof', 'Ġthe', 'ĠDoctor', 'Ġof', 'ĠMinistry', 'Ġprogram', 'Ġat', 'ĠDuke', 'ĠDivinity', 'ĠSchool', '.', 'ĠHe', 'Ġis', 'Ġformer', 'ĠDean', 'Ġof', 'Ġthe', 'ĠChapel', 'Ġat', 'ĠDuke', 'ĠUniversity', 'Ġand', 'Ġis', 'Ġconsidered', 'Ġby', 'Ġmany', 'Ġas', 'Ġone', 'Ġof', 'ĠAmerica', "'s", 'Ġbest', '-', 'known', 'Ġand', 'Ġmost', 'Ġinfluential', 'Ġpre', 'achers', '.', 'ĠA', 'ĠPul', 'pit', 'Ġ&', 'ĠPew', 'ĠResearch', 'Ġon', 'ĠPast', 'oral', 'ĠLeadership', 'Ġsurvey', 'Ġdetermined', 'Ġthat', 'Ġhe', 'Ġwas', 'Ġone', 'Ġof', 'Ġth

In [75]:
tokenizer.convert_ids_to_tokens([198])

['Ċ']

In [76]:
len(list(chain(*tokenized_datasets["validation"]["input_ids"])))

5414308

### Grouping Text

In [77]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 86291
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5230
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5256
    })
})

In [78]:
lm_datasets['validation'][0]['input_ids'][:5]

[17121, 8616, 2561, 20473, 17121]

In [79]:
print(tokenizer.convert_ids_to_tokens(lm_datasets['validation'][0]['input_ids'][:5]))


['William', 'ĠHenry', 'ĠWill', 'imon', 'William']


In [80]:
len(lm_datasets['validation'])

5230

# Perplexity with Trainer pre-processing

In [37]:
def eval_ppl_v3(model, tokenized_datasets, stride, device=device_id):
    max_length = model.config.n_positions
    nlls = []
    encodings = list(chain(*(tokenized_datasets['validation']['input_ids'])))
    n_words = len(encodings)

    encodings = torch.tensor([encodings])
    
    for i in tqdm(range(0, n_words, stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, n_words)
        trg_len = end_loc - i  # may be different from stride on last loop
        # input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        input_ids = encodings[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
    return ppl

In [38]:
with torch.no_grad():
    ppl = eval_ppl_v3(model, tokenized_datasets, stride=1024).item()


100%|███████████████████████████████████████████████████████████████████████████████████| 5288/5288 [07:52<00:00, 11.19it/s]


In [39]:
ppl

27.71837043762207

In [None]:
# encodings format 한번더 확 인
encodings = list(chain(*tokenized_datasets["validation"]["input_ids"]))

In [1]:
!pwd

/Users/joon/PycharmProjects/imperial/m2d2/utils
