In [45]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)


In [46]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")


'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [47]:
text = "This is a great [MASK]."


In [48]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [49]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [50]:
import torch

inputs = tokenizer(text, return_tensors="pt")
print(inputs)
print(inputs["input_ids"])
print(inputs.tokens())
print(f"mask_token_id: {tokenizer.mask_token_id}")
token_logits = model(**inputs).logits
# [MASK]의 위치를 찾고, 해당 logits을 추출합니다.
print(token_logits.shape)
print(torch.where(inputs["input_ids"] == tokenizer.mask_token_id)) # tuple (tensor([0]), tensor([5]))
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] #mask_token_id = 103
print(mask_token_index)
mask_token_logits = token_logits[0, mask_token_index, :]
print(mask_token_logits.shape)
print(torch.topk(mask_token_logits, 5, dim=1).values[0].tolist())
print(torch.topk(mask_token_logits, 5, dim=1).indices) # torch.Size([1,5])
# 가장 큰 logits값을 가지는 [MASK] 후보를 선택합니다.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
print(top_5_tokens)
print(tokenizer.mask_token) # [MASK]
print(tokenizer.decode(top_5_tokens))
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'") # text = "This is a great [MASK]."


{'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]])
['[CLS]', 'this', 'is', 'a', 'great', '[MASK]', '.', '[SEP]']
mask_token_id: 103
torch.Size([1, 8, 30522])
(tensor([0]), tensor([5]))
tensor([5])
torch.Size([1, 30522])
[7.07273530960083, 6.651431083679199, 6.642458915710449, 6.252985954284668, 5.861802101135254]
tensor([[3066, 3112, 6172, 2801, 8658]])
[3066, 3112, 6172, 2801, 8658]
[MASK]
deal success adventure idea feat
'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [51]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [52]:
print(imdb_dataset.column_names)
print(imdb_dataset["train"].column_names)

{'train': ['text', 'label'], 'test': ['text', 'label'], 'unsupervised': ['text', 'label']}
['text', 'label']


In [53]:
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [54]:
sample = imdb_dataset["unsupervised"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: If you've seen the classic Roger Corman version starring Vincent Price it's hard to put it out of your head, but you probably should do because this one is totally different. Subtlety has been abandoned in favour of gross-out horror - nudity, gore and all-round unpleasantness. OK it's ridiculous, trashy, sensationalised and historically dubious (did any members of the Inquisition really wear horn-rimmed glasses?), but despite all this it is strangely compelling. I literally couldn't tear myself away from the screen until the end of the movie. If there's a bigger compliment you can pay to a film I don't know what it is.'
'>>> Label: -1'

'>>> Review: For me, this was the most moving film of the decade. Samira Makhmalbaf shows pure bravery and vision in the making. She has an intelligence and gift for speaking to the people, regardless of their nationality or beliefs. I am inspired and touched by her humanity and can only hope that she has touched many people the same way. 

In [55]:
import torch

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    #print(torch.tensor(result["input_ids"]).shape) 
    #print([len(result["input_ids"][i]) for i in range(len(result["input_ids"]))])
    #[363, 304, 133, 185, 495, 154,.....] [..1000개..]
    if tokenizer.is_fast:
        #print(len(result["input_ids"])) # when batched = True, batch size = 1000
        #1000 1000 1000 1000 ....
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        #result["word_ids"] = [result.word_ids(i) for i, sample in enumerate(result["input_ids"])]
    return result


# 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
print(tokenized_datasets)
#print(tokenized_datasets["train"]["input_ids"][:2])
print(tokenized_datasets["train"]["word_ids"][:2])

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})
[[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 143,

In [56]:
imdb_dataset.column_names

{'train': ['text', 'label'],
 'test': ['text', 'label'],
 'unsupervised': ['text', 'label']}

In [57]:
import torch

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    #print(torch.tensor(result["input_ids"]).shape) # dealing with row by row, not batched
#torch.Size([363])
#torch.Size([304])
#torch.Size([133])
#torch.Size([185])
#torch.Size([495]),....
    if tokenizer.is_fast:
        #print(len(result["input_ids"])) # when batched != True, result["inputs_ids"] = the length of review 
        #result["word_ids"] = [ i for i in range(len(result["input_ids"]))]
        result["word_ids"] = result.word_ids() #no batched
    return result


# 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
tokenized_datasets_no_batch = imdb_dataset.map(
    #tokenize_function, batched=True, remove_columns=["text", "label"]
    tokenize_function, remove_columns=imdb_dataset["train"].column_names,
)
print(tokenized_datasets_no_batch)
#print(tokenized_datasets_no_batch["train"]["input_ids"][:2])
print(tokenized_datasets_no_batch["train"]["word_ids"][:2]) # batch sizes are 363, 304, 133, ...

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})
[[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 143,

In [58]:
tokenizer.model_max_length

512

In [59]:
chunk_size = 128

In [60]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")


'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'


In [61]:
print(tokenized_samples)

{'input_ids': [[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107, 2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 38

In [62]:
#print(sum([1, 2, 3], [4]))
print(sum([[1, 2, 3]], []))
print(sum([[1, 2, 3],[4, 5, 6, 7]], []))
print(sum([[1, 2, 3],[4, 5, 6, 7]], [8, 9]))
print(sum([[1, 2, 3]], [[4, 5]]))
#print(sum([1, 2, 3], [])) #TypeError: can only concatenate list (not "int") to list

[1, 2, 3]
[1, 2, 3, 4, 5, 6, 7]
[8, 9, 1, 2, 3, 4, 5, 6, 7]
[[4, 5], 1, 2, 3]


In [63]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
} 
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'") # 363 + 304 + 133

'>>> Concatenated reviews length: 800'


In [64]:
print(concatenated_examples)

{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107, 2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 383

In [65]:
print(tokenized_samples.keys())
print(list(tokenized_samples.keys()))
print(list(tokenized_samples.keys())[0])

dict_keys(['input_ids', 'attention_mask', 'word_ids'])
['input_ids', 'attention_mask', 'word_ids']
input_ids


In [66]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [67]:
print(chunks)

{'input_ids': [[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107], [2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 

In [68]:
def group_texts(examples):
    # 모든 텍스트들을 결합한다.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # 결합된 텍스트들에 대한 길이를 구한다.
    total_length = len(concatenated_examples["input_ids"])
    #total_length = len(concatenated_examples[list(examples.keys())[0]]) # len(concatenated_examples["input_ids"]) 
    # `chunk_size`보다 작은 경우 마지막 청크를 삭제
    total_length = (total_length // chunk_size) * chunk_size
    # max_len 길이를 가지는 chunk 단위로 슬라이스
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # 새로운 레이블 컬럼을 생성
    result["labels"] = result["input_ids"].copy()
    return result                            

In [69]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [70]:
# concantenate the fist 1000 tokenized_datasets itmes 
train_dataset_sample = tokenized_datasets["train"][:1000]
concat_dataset_sample = {k: sum(v, []) for k, v in train_dataset_sample.items()}  
print(len(concat_dataset_sample["input_ids"]))
print(len(concat_dataset_sample["word_ids"]))
print(concat_dataset_sample["input_ids"][:100])

315091
315091
[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166]


In [71]:
print(lm_datasets["train"][:2])

{'input_ids': [[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107], [2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 

In [72]:
tokenizer.decode(lm_datasets["train"]["input_ids"][1])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [73]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [74]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


In [75]:
samples = [lm_datasets["train"][i] for i in range(2)]

#for sample in samples:
#    _ = sample.pop("word_ids")

#for chunk in data_collator(samples)["input_ids"]:
#    print(f"\n'>>> {tokenizer.decode(chunk)}'")


In [76]:
print(samples)
print(len(samples[0]["input_ids"]))
print(len(samples[0]["word_ids"]))

[{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [77]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [78]:
tokenizer.mask_token_id

103

In [79]:
samples = [lm_datasets["train"][i] for i in range(2)]
#the data_collator DataCollatorForLanguageModeling doesn't understand word_ids, 
# the word_ids value with "None" causes error when calling the data collator(). 
# so, it needs to be popped up.
for sample in samples:
    _ = sample.pop("word_ids") 
    #_ = sample.pop("labels")

#print(samples)
#print(type(samples[0]["word_ids"][0])) # None, <class 'NoneType'>, data_collator Error
#ValueError: Unable to create tensor, you should probably activate truncation 
#and/or padding with 'padding=True' 'truncation=True' to have batched tensors 
#with the same length. Perhaps your features (`word_ids` in this case) have 
#excessive nesting (inputs type `list` where type `int` is expected).
#samples[0]["word_ids"][0] = 0 # assign it to O integer 
#print(type(samples[0]["word_ids"][0]))

#print(type(samples)) # <class, 'list'>
print(samples)
data_collator_samples = data_collator(samples) # list of dictionary data type 

print(data_collator_samples)
#print(data_collator_samples) # batched tensor with MASK 

#for chunk in data_collator(samples)["input_ids"]:
for chunk in data_collator_samples["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [80]:
for chunk in data_collator_samples["labels"]:
    print(f"\n'>>> {chunk}'")


'>>> tensor([-100, -100, -100, 1045, -100, -100, -100, 3756, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, 2008, -100, -100, -100, -100, 2001, -100,
        -100, -100, -100, -100, -100, -100, -100, 2008, -100, -100, -100, -100,
        -100, -100, -100, -100, 1055, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, 2870, -100, -100,
        -100, -100, -100, 1026, -100, 1013, -100, 1996, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, 2000, -100, -100,
        -100, -100, -100, 2166, -100, -100, -100, -100, 4122, -100, -100, 2014,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, 3314, -100])'

'>>> tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  1996,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -

In [81]:
samples = lm_datasets["train"][0:2]
#print(samples)
samples = { k:v for k, v in samples.items() if k != "word_ids"}
print(samples) # remove the word_ids
#print(*samples.keys())
#print(zip(*samples.keys()))
print(*samples.values())
#print([v for v in zip(*samples.values())]) # list of tuples
print([dict(zip(samples,v)) for v in zip(*samples.values())])
data_collator_samples = data_collator([dict(zip(samples,v)) for v in zip(*samples.values())])
print(data_collator_samples)
#data_collator([for i in lm_datasets["train"][1:3]])

{'input_ids': [[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107], [2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 

In [82]:
samples = [lm_datasets["train"][i] for i in range(2)]
# the data_collator DataCollatorForLanguageModeling doesn't understand word_ids, so it needs to be poped up.
for sample in samples:
    _ = sample.pop("word_ids")  

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")



'>>> ['[CLS]', 'i', 'rented', '[MASK]', '##rol', 'curious', '-', 'yellow', 'from', 'my', 'video', 'store', 'because', '[MASK]', 'all', 'the', 'controversy', 'that', 'surrounded', '[MASK]', 'when', '[MASK]', '[MASK]', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'ordinance', 'at', '[MASK]', 'it', 'was', '[MASK]', 'by', 'u', '.', 's', '.', '[MASK]', 'if', 'it', 'ever', 'tried', 'to', 'enter', 'this', 'country', ',', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', '"', 'controversial', '"', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself', '.', '<', '[MASK]', '/', '>', '<', '[MASK]', '/', '>', '[MASK]', 'plot', '##bber', 'centered', 'around', 'a', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', '[MASK]', '[MASK]', 'life', '.', 'in', 'luxury', '[MASK]', 'wants', 'to', 'focus', 'her', 'attention', '[MASK]', 'to', 'making', 'some', 'sort', 'of', 'documentary', 'on', 'what', '[MASK]', 'aver

In [83]:
print(samples)

[{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [84]:
#print(samples[0]["word_ids"])
print(samples[0]["input_ids"])
#mapping = collections.defaultdict(list)


[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107]


In [85]:
import numpy as np
wwm_probability = 0.2
mask = np.random.binomial(1, wwm_probability, 125)
print(len(mask)) # 125
print(type(mask)) # <class 'numpy.ndarray'>
print(mask.shape) # (125,) 
print(mask)
print(np.where(mask)) 
print(type(np.where(mask))) # <class 'tuple'>
print(np.where(mask)[0])
print(type(np.where(mask)[0])) # <class 'numpy.ndarray'>
#for m  in np.where(mask)[0]:
#    print(type(m)) # <class 'numpy.int64'>
#    print(m)  # 12
#    print(type(m.item())) # <class 'int'>
#    print(m.item()) #12

125
<class 'numpy.ndarray'>
(125,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
(array([ 23,  24,  25,  32,  38,  49,  60,  61,  66,  69,  84,  85,  92,
       102, 103, 105, 118, 122]),)
<class 'tuple'>
[ 23  24  25  32  38  49  60  61  66  69  84  85  92 102 103 105 118 122]
<class 'numpy.ndarray'>


In [86]:
import collections

# Create a defaultdict with a list as the default factory
d = collections.defaultdict(list)

# Add elements to the dictionary
d['a'].append(1)
d['a'].append(2)
d['b'].append(3)

# Accessing a non-existent key creates a new list
print(d['c'])  # Output: []

# Print the contents of the dictionary
print(d)  # Output: defaultdict(<class 'list'>, {'a': [1, 2], 'b': [3], 'c': []})


[]
defaultdict(<class 'list'>, {'a': [1, 2], 'b': [3], 'c': []})


In [87]:
import collections
import numpy as np

from pprint import pprint
from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    #print(features)
    for feature in features:
        word_ids = feature.pop("word_ids")

        # 단어와 해당 토큰 인덱스 간의 map 생성
        #print(f"word_ids{word_ids}")
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        #pprint(f"mapping length: {len(mapping)}, mapping{mapping}")
        # 무작위로 단어 마스킹
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        #print(f"mask{mask}")
        input_ids = feature["input_ids"]
        #print(f"@input_ids{input_ids}")
        labels = feature["labels"]
        #print(f"*labels{labels}")
        new_labels = [-100] * len(labels)
        #print(f"*{new_labels}")
        for word_id in np.where(mask)[0]: # np.where(mask) is tuple (i.e., (mask_array)) and np.where(mask)[0] is numpy.ndarray
            word_id = word_id.item()
            #print(word_id)
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id # mask_token_id: 103 [MASK]
        #print(f"@@@new input_ids{input_ids}")
        #print(f"***new labels{new_labels}")
    #print(f"features: {features}")
    #results = default_data_collator(features) 
    #print(f"results: {results}") # Check the contents of the batched tensors
    return default_data_collator(features)
    #return results



In [119]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)
print(batch)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

for chunk in batch["labels"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


{'input_ids': tensor([[  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,   103,
           103,  3573,  2138,  1997,   103,  1996,  6704,   103,   103,   103,
          2043,  2009,  2001,   103,   103,  1999,  3476,  1012,   103,   103,
          2657,  2008,  2012,   103,   103,   103,  8243,  2011,  1057,  1012,
           103,   103,   103,   103,  2009,  2412,   103,   103,  4607,  2023,
          2406,   103,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
          6801,  1000,   103,  2428,  2018,   103,  2156,  2023,  2005,  2870,
          1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,   103,
          5436,  2003,  8857,  2105,  1037,   103,   103,  3689,   103,  2315,
         14229,  2040,   103,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
          1012,  1999,  3327,  2016,  4122,   103,  3579,  2014,  3086,  2015,
          2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
         25430, 14728,  2245,  2055,  

In [89]:
tokenizer.mask_token_id

103

In [90]:
mask = np.random.binomial(1, wwm_probability, 125)
mask

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0])

In [91]:
print(mask)

[0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 0 0 0 0]


In [92]:
for i in np.where(mask):
    print(i)

[  2   8  10  16  22  26  27  37  41  45  57  63  64  66  67  68  78  84
  86  99 116 119 120]


In [93]:
for i in np.where(mask)[0]:
    print(i)

2
8
10
16
22
26
27
37
41
45
57
63
64
66
67
68
78
84
86
99
116
119
120


In [94]:
train_size = 10_000
test_size = int(0.1 * train_size)

print(lm_datasets)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [95]:
print("aa/bb/cc/dd".split("/")[-1])
model_name = model_checkpoint.split("/")[-1]
print(model_name)
print(model_checkpoint)

dd
distilbert-base-uncased
distilbert-base-uncased


In [96]:
len(downsampled_dataset["train"]) // 64

156

In [97]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
    #fp16=True,
    logging_steps=logging_steps,
)


In [98]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer = tokenizer,
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [99]:
import math

eval_results = trainer.evaluate()
print(eval_results)
print(math.exp(3))
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


{'eval_loss': 3.088183641433716, 'eval_runtime': 2.1295, 'eval_samples_per_second': 469.592, 'eval_steps_per_second': 7.513}
20.085536923187668
>>> Perplexity: 21.94


In [100]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.6749,2.507911
2,2.59,2.448393
3,2.5298,2.481698


TrainOutput(global_step=471, training_loss=2.5984599362513063, metrics={'train_runtime': 111.1221, 'train_samples_per_second': 269.973, 'train_steps_per_second': 4.239, 'total_flos': 994208670720000.0, 'train_loss': 2.5984599362513063, 'epoch': 3.0})

In [101]:
eval_results = trainer.evaluate()
#print(eval_results)
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


>>> Perplexity: 12.06


In [102]:
batch = {'col1': [1, 2, 3], 'col2': [4, 5, 6], 'col3': [7, 8, 9]}
features = [dict(zip(batch, t)) for t in zip(*batch.values())]
print(features)


[{'col1': 1, 'col2': 4, 'col3': 7}, {'col1': 2, 'col2': 5, 'col3': 8}, {'col1': 3, 'col2': 6, 'col3': 9}]


In [103]:
batch = {'col1': [[1, 2, 3],[11,12,13]], 'col2': [[4, 5, 6],[41,45,46]], 'col3': [[7, 8, 9],[77,78,79]]}
features = [dict(zip(batch, t)) for t in zip(*batch.values())]
print(features)

[{'col1': [1, 2, 3], 'col2': [4, 5, 6], 'col3': [7, 8, 9]}, {'col1': [11, 12, 13], 'col2': [41, 45, 46], 'col3': [77, 78, 79]}]


In [104]:
def insert_random_mask(batch):
    #print(len(batch["input_ids"])) # 1000
    #print(type(batch))
    #print(batch.values())
    #for t in zip(*batch.values()):
    #    print(dict(zip(batch,t)))
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    #print(features)
    masked_inputs = data_collator(features)
    #print(masked_inputs)
    # 데이터셋의 각 컬럼에 대해서 새로운 "masked" 컬럼을 생성
    #return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}
    return {"masked_" + k: v for k, v in masked_inputs.items()}


In [105]:
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [106]:
#downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

In [107]:
print(downsampled_dataset["train"][:2])

{'input_ids': [[2060, 2084, 4438, 3185, 23176, 2015, 1011, 2113, 2505, 2055, 2009, 1012, 2002, 2001, 1037, 5016, 1999, 1996, 2449, 2005, 2012, 2560, 1037, 5476, 1012, 2002, 2071, 2031, 2180, 1996, 2914, 2400, 2005, 2023, 2836, 1010, 2029, 2052, 2031, 2445, 2032, 2048, 1999, 1037, 5216, 1010, 2004, 2002, 2180, 2009, 2005, 2652, 3434, 19351, 3126, 1996, 2095, 2077, 1012, 2026, 2219, 5448, 2003, 2008, 2096, 2002, 11121, 2000, 2058, 18908, 1037, 2978, 1010, 1045, 2145, 2228, 2002, 2001, 2028, 1997, 1996, 2307, 5889, 1997, 1996, 1000, 3585, 2287, 1012, 1000, 3649, 2112, 2002, 2209, 1025, 2017, 2020, 15544, 19510, 2098, 2000, 1996, 3898, 3666, 2032, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 4406, 1996, 19351, 3126, 2535, 1010, 1045, 2245, 2023, 2466, 19203, 1997, 1037, 2210], [2147, 1997, 2396, 1999, 2049, 2219, 2157, 1012, 1037, 2143, 2472, 2003, 3929, 4709, 2000, 2224, 3787, 2013, 4507, 1998, 2381, 2000, 3443, 1037, 2088, 1997, 2010, 2219, 1012, 19063, 2401, 2003, 1037, 6919, 2

In [108]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset

Dataset({
    features: ['masked_input_ids', 'masked_attention_mask', 'masked_labels'],
    num_rows: 1000
})

In [109]:
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [110]:
#print(type(eval_dataset["input_ids"][0]))
#print(eval_dataset[:2])

In [111]:
#len(downsampled_dataset["train"]["input_ids"])

In [112]:
#len(eval_dataset["input_ids"])

In [113]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator, #[MASK} inserted randomly and batched tensor
)
eval_dataloader = DataLoader(
    eval_dataset,  # fixed dataset with [MASK} 
    batch_size=batch_size, 
    collate_fn=default_data_collator # batched tensor
)


In [116]:
#for i, batch in enumerate(eval_dataloader):
#    print(batch)
#    if i > 0:
#        break

{'input_ids': tensor([[ 1012,  2138,  2017,  ...,   103,  2619,  2478],
        [ 2007,  2010,   103,  ...,  1013,   103,  1026],
        [ 1997,  1996,  2678,  ...,  5875,  3602,  1024],
        ...,
        [ 2001,   103,   103,  ...,  1010,   103,  1996],
        [  103, 13819,  1997,  ...,  2005,  1996, 14392],
        [ 2178, 22822,   103,  ...,  1996,  3422,  2144]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  2005,  -100,  -100],
        [ -100,  -100,  2155,  ...,  -100,  1028,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        ...,
        [ -100,  2014,  1999,  ...,  -100,  2431,  -100],
        [ 3811,  -100,  1997,  ...,  -100,  -100,  -100],
        [ -100,  -100, 12356,  ...,  -100,  -100,  -100]])}
{'input_ids':

In [90]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [91]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [92]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [93]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [94]:
output_dir = "distilbert-base-uncased-finetuned-imdb-accelerate"

In [95]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # 학습
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # 평가
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        #print(loss)
        #print(len(loss.repeat(batch_size))) # 64
        losses.append(accelerator.gather(loss.repeat(batch_size)))
        #print(len(losses))
        #print(losses)

    losses = torch.cat(losses)
    #print(len(losses)) # 64 * 16
    #print(losses)
    #print(len(eval_dataset)) #1000
    losses = losses[: len(eval_dataset)] #1024개중에 앞 1000개만 사용
    try:
        perplexity = math.exp(torch.mean(losses)) # 1000개의 평균
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        #repo.push_to_hub(
        #    commit_message = f"Training in progress epoch {epoch}", blocking=False
        #)



  0%|          | 0/471 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 12.133413542746464
[2024-03-23 16:16:59,839] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
>>> Epoch 1: Perplexity: 11.628669620753206
>>> Epoch 2: Perplexity: 11.436385726679568


In [6]:
## Trainer()  
## data collator /w token masking or whole word masking 
## DataCollatorForLanguageModeling or DataCollatorForWholeWordMask  
## Note: Choose DataCollatorForLanguageModeling or DataCollatorForWholeWordMask
import numpy as np
import math
import collections

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import default_data_collator, DataCollatorForLanguageModeling
from transformers import DataCollatorForWholeWordMask
from transformers import TrainingArguments, Trainer


from datasets import load_dataset

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

imdb_dataset = load_dataset("imdb")

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)

chunk_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result                            

lm_datasets = tokenized_datasets.map(group_texts, batched=True)


# token maskiing
data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm_probability=0.15)

# whole word masking
#data_collator = DataCollatorForWholeWordMask(
#            tokenizer=tokenizer, mlm_probability=0.15)

wwm_probability = 0.2

train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    #fp16=True,
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

trainer.train()

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Perplexity: 21.94


Epoch,Training Loss,Validation Loss
1,2.6749,2.507911
2,2.59,2.448393
3,2.5298,2.481698


>>> Perplexity: 12.06


In [3]:
## Accelerator() w/o Trainer()
## fixed maksed eval dataset 
## token masking /w DataCollatorForLanguageModeling
## note: insert_random_mask() for token masking

import numpy as np
import math
import collections
import torch

from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import default_data_collator, DataCollatorForLanguageModeling

from torch.optim import AdamW
from accelerate import Accelerator

from transformers import get_scheduler

from tqdm.auto import tqdm

from datasets import load_dataset

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

imdb_dataset = load_dataset("imdb")

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)

chunk_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result                            

lm_datasets = tokenized_datasets.map(group_texts, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
wwm_probability = 0.2

train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # 데이터셋의 각 컬럼에 대해서 새로운 "masked" 컬럼을 생성
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)

eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size, 
    collate_fn=default_data_collator
)

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

optimizer = AdamW(model.parameters(), lr=5e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
output_dir = model_name

for epoch in range(num_train_epochs):
    # 학습
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # 평가
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        #repo.push_to_hub(
        #    commit_message = f"Training in progress epoch {epoch}", blocking=False
        #)



Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  0%|          | 0/471 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Epoch 0: Perplexity: 11.84233567197074
>>> Epoch 1: Perplexity: 11.30760600844206
>>> Epoch 2: Perplexity: 11.098844897972281


In [5]:
## Accelerator() w/o Trainer()
## fixed maksed eval dataset 
## whole word masking /w DataCollatorForWholeWordMask, 
## note: whole_word_masking_data_collator()

import numpy as np
import math
import collections
import torch

from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import default_data_collator, DataCollatorForLanguageModeling

from torch.optim import AdamW
from accelerate import Accelerator

from transformers import get_scheduler

from tqdm.auto import tqdm

from datasets import load_dataset

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

imdb_dataset = load_dataset("imdb")

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)

chunk_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result                            

lm_datasets = tokenized_datasets.map(group_texts, batched=True)

#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=0.15)
wwm_probability = 0.2

def whole_word_masking_data_collator(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    for feature in features:
        word_ids = feature.pop("word_ids")

        # 단어와 해당 토큰 인덱스 간의 map 생성
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # 무작위로 단어 마스킹
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
    features = default_data_collator(features) # batched tensor
    #print(features)
    #return default_data_collator(features)
    return(features)

train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # 데이터셋의 각 컬럼에 대해서 새로운 "masked" 컬럼을 생성
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

#downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

eval_dataset = downsampled_dataset["test"].map(
    #insert_random_mask,
    whole_word_masking_data_collator,
    batched=True,
    #remove_columns=downsampled_dataset["test"].column_names,
)

#print(eval_dataset)
#Dataset({
#    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
#    num_rows: 1000
#})

eval_dataset = eval_dataset.remove_columns(["word_ids"])

#print(eval_dataset)
#Dataset({
#    features: ['input_ids', 'attention_mask', 'labels'],
#    num_rows: 1000
#})

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

eval_dataloader = DataLoader(
    eval_dataset, #fixed masked dataset
    batch_size=batch_size, 
    collate_fn=default_data_collator # batched tensor
)

#eval_dataloader = DataLoader(
#    downsampled_dataset["test"],
#    batch_size=batch_size, 
#    collate_fn=data_collator,
#)
accelerator = Accelerator()

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

optimizer = AdamW(model.parameters(), lr=5e-5)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
output_dir = model_name

for epoch in range(num_train_epochs):
    # 학습
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # 평가
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        #repo.push_to_hub(
        #    commit_message = f"Training in progress epoch {epoch}", blocking=False
        #)



Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  0%|          | 0/471 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 2.34792219470663
>>> Epoch 1: Perplexity: 2.3093322381638712
>>> Epoch 2: Perplexity: 2.299610522178341


In [7]:
## Accelerator() w/o Trainer()
## fixed maksed eval dataset 
## token masking /w DataCollatorForLanguageModeling
## note: insert_random_mask() for token masking

import numpy as np
import math
import collections
import torch

from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import default_data_collator, DataCollatorForLanguageModeling

from torch.optim import AdamW
from accelerate import Accelerator

from transformers import get_scheduler
from tqdm.auto import tqdm
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")

model_checkpoint = "distilbert-base-uncased"

def get_dataloaders(accelerator: Accelerator, batch_size: int = 64):

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def tokenize_function(examples):
        result = tokenizer(examples["text"])
        if tokenizer.is_fast:
            result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        return result

    # 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
    # Apply the method we just defined to all the examples in all the splits of the dataset
    # starting with the main process first:
    with accelerator.main_process_first():
        tokenized_datasets = imdb_dataset.map(
            tokenize_function, batched=True, remove_columns=["text", "label"]
        )

    chunk_size = 128

    def group_texts(examples):
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        total_length = (total_length // chunk_size) * chunk_size
        result = {
            k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result                            

    lm_datasets = tokenized_datasets.map(group_texts, batched=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    wwm_probability = 0.2

    train_size = 10_000
    test_size = int(0.1 * train_size)

    downsampled_dataset = lm_datasets["train"].train_test_split(
        train_size=train_size, test_size=test_size, seed=42
    )

    def insert_random_mask(batch):
        features = [dict(zip(batch, t)) for t in zip(*batch.values())]
        masked_inputs = data_collator(features)
        # 데이터셋의 각 컬럼에 대해서 새로운 "masked" 컬럼을 생성
        return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
    #print(downsampled_dataset)
    #DatasetDict({
    #train: Dataset({
    #    features: ['input_ids', 'attention_mask', 'labels'],
    #    num_rows: 10000
    #})
    #test: Dataset({
    #    features: ['input_ids', 'attention_mask', 'labels'],
    #    num_rows: 1000
    #})
    #})

    batch_size = batch_size
    train_dataloader = DataLoader(
        downsampled_dataset["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    
    eval_dataset = downsampled_dataset["test"].map(
        insert_random_mask,
        batched=True,
        remove_columns=downsampled_dataset["test"].column_names,
    )

    eval_dataset = eval_dataset.rename_columns(
        {
            "masked_input_ids": "input_ids",
            "masked_attention_mask": "attention_mask",
            "masked_labels": "labels",
        }
    )
    
    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=batch_size, 
        collate_fn=default_data_collator
    )

    return train_dataloader, eval_dataloader, tokenizer, eval_dataset

#def training_function(config, args):
def training_function():

    batch_size = 64
    
    accelerator = Accelerator()

    train_dataloader, eval_dataloader, tokenizer, eval_dataset = get_dataloaders(accelerator, batch_size=batch_size)
    
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

    optimizer = AdamW(model.parameters(), lr=5e-5)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    num_train_epochs = 3
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))
    model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
    output_dir = model_name

    for epoch in range(num_train_epochs):
        # 학습
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # 평가
        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            losses.append(accelerator.gather(loss.repeat(batch_size)))

        losses = torch.cat(losses)
        losses = losses[: len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")

        #print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
        accelerator.print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
        
        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)
            #repo.push_to_hub(
            #    commit_message = f"Training in progress epoch {epoch}", blocking=False
            #)

#model_name = "bert-finetuned-ner-accelerate"
#repo_name = get_full_repo_name(model_name)
#print(repo_name)
#repo = Repository(output_dir, clone_from=repo_name) # git lfs 설치 에러러

#notebook_launcher(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)
from accelerate import notebook_launcher

notebook_launcher(training_function, num_processes=4)
        
#def main():
#    parser = argparse.ArgumentParser(description="Simple example of training script.")
#    parser.add_argument(
#        "--mixed_precision",
#        type=str,
#        default=None,
#        choices=["no", "fp16", "bf16", "fp8"],
#        help="Whether to use mixed precision. Choose"
#        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
#        "and an Nvidia Ampere GPU.",
#    )
#    parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
#    args = parser.parse_args()
#    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
#    training_function(config, args)
#
#
#if __name__ == "__main__":
#    main()

Launching training on 4 GPUs.


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  0%|          | 0/120 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Epoch 0: Perplexity: 12.087873485461591
>>> Epoch 1: Perplexity: 11.503850315751773
>>> Epoch 2: Perplexity: 11.377066481088654
