In [1]:
# !pip install sentence-transformers
# !pip install datasets
# !pip install accelerate

In [2]:
from transformers import AutoModelForMaskedLM
from torch.utils.data import DataLoader
from transformers import AutoTokenizer


In [3]:
# Candidate checkpoints; keep exactly one `model_checkpoint` assignment active.
# model_checkpoint = "roberta-base"
# model_checkpoint = "bert-base-uncased"
model_checkpoint = "distilbert-base-uncased"
# Load the selected checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [4]:
# Parameter count of the loaded model, expressed in millions.
distilbert_num_parameters = model.num_parameters() / 1_000_000
# Fixed reference line for comparison (no interpolation needed, so a plain string).
print("'>>> BERT number of parameters: 110M'")
# Report the active checkpoint next to its rounded parameter count.
print(f"{model_checkpoint} : {round(distilbert_num_parameters)}M")

'>>> BERT number of parameters: 110M'
distilbert-base-uncased : 67M


In [5]:
# Load the tokenizer published with the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
from datasets import load_dataset

import os

# Folder holding the plain-text user-story files. Set the US_DATASET_DIR
# environment variable to point at your own copy; the fallback is the original
# author's local path and only resolves on that machine.
data_dir = os.environ.get(
    "US_DATASET_DIR",
    r"C:\Users\porch\OneDrive - Universite de Liege\dataset SE\User Stories\Ready to Go\Further Pre-training\no_duplicate",
)
# The "text" loader is pointed at the folder and yields a single 'text' feature.
us_dataset = load_dataset("text", data_dir=data_dir)
us_dataset

Using custom data configuration default-fe029f803c31063a
Reusing dataset text (C:\Users\porch\.cache\huggingface\datasets\text\default-fe029f803c31063a\0.0.0\acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10673
    })
})

In [7]:
# Preview three rows; the fixed shuffle seed keeps the selection repeatable.
sample = us_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")


Loading cached shuffled indices for dataset at C:\Users\porch\.cache\huggingface\datasets\text\default-fe029f803c31063a\0.0.0\acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08\cache-a0c7705194d6e0d1.arrow



'>>> Review: i want my door cam needs to notify me when this is unusual activity outside, example more than 3-4 person approaching the house'

'>>> Review: I don't have to'

'>>> Review: i want to know how long it will take to archive'


In [8]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output for the batch. When the tokenizer is a fast one,
        a "word_ids" entry is added holding, for every row of the batch, the
        result of ``word_ids(row_index)``.
    """
    encoded = tokenizer(examples["text"])
    # Only fast tokenizers expose word_ids(); otherwise return the encoding as is.
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded


# batched=True hands tokenize_function many rows per call.
# NOTE(review): the original note says this activates the tokenizer's fast multithreading — confirm.
# The raw "text" column is removed so only the tokenizer outputs remain.
tokenized_datasets = us_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets



  0%|          | 0/11 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 10673
    })
})

In [9]:
# Number of tokens per chunk; read by group_texts and by the chunking demo cells.
chunk_size = 128

In [10]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:20]

# Print the token count of each of the first 20 tokenized rows.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 16'
'>>> Review 1 length: 9'
'>>> Review 2 length: 9'
'>>> Review 3 length: 12'
'>>> Review 4 length: 15'
'>>> Review 5 length: 20'
'>>> Review 6 length: 9'
'>>> Review 7 length: 15'
'>>> Review 8 length: 16'
'>>> Review 9 length: 10'
'>>> Review 10 length: 12'
'>>> Review 11 length: 22'
'>>> Review 12 length: 12'
'>>> Review 13 length: 13'
'>>> Review 14 length: 16'
'>>> Review 15 length: 17'
'>>> Review 16 length: 20'
'>>> Review 17 length: 20'
'>>> Review 18 length: 10'
'>>> Review 19 length: 12'


In [11]:
# Flatten each feature's per-row lists into one long list
# (sum(..., []) concatenates the inner lists in order).
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
# Total number of tokens across the 20 sampled rows.
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 285'


In [12]:
# Inspect the flattened features, then list the feature names.
# NOTE(review): this prints every concatenated value, so the cell output is very long.
print(concatenated_examples)
for k in tokenized_samples.keys():
    print(k) 

{'input_ids': [101, 1045, 2215, 2000, 2031, 1996, 2260, 1011, 2539, 1011, 2418, 3972, 20624, 5644, 13995, 102, 101, 1045, 2215, 2000, 25136, 1996, 4219, 3931, 102, 101, 2009, 3503, 1996, 2047, 20138, 2640, 6782, 102, 101, 1045, 2215, 2000, 3189, 2000, 1996, 6736, 2055, 5310, 5604, 102, 101, 2027, 2024, 5204, 1997, 2037, 5857, 2000, 2437, 20138, 1037, 2488, 1057, 2595, 102, 101, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 4830, 5910, 2030, 6904, 5910, 4899, 3931, 10086, 2015, 102, 101, 1045, 2064, 2131, 6226, 2015, 2013, 4105, 102, 101, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 2188, 13704, 10086, 2015, 102, 101, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1017, 1997, 1996, 2393, 3931, 10086, 2015, 102, 101, 1045, 2215, 2000, 2022, 2583, 2000, 8833, 2488, 102, 101, 1045, 2064, 13460, 23416, 3314, 2007, 3327, 27842, 1998, 4972, 102, 101, 1045, 2215, 2000, 5587, 1996, 14409, 2006, 1037, 6904, 5910, 12339, 2000, 2022, 6310, 2043, 1996, 10172, 9153, 5809, 3431, 102, 101,

In [13]:
# Cut every flattened feature into consecutive slices of chunk_size tokens.
# The final slice keeps whatever remains, so it can be shorter than chunk_size.
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 29'


In [14]:
# Scratch check of the arithmetic group_texts uses to drop the last partial chunk.
# NOTE(review): neither printed value depends on `i`, so the same pair is
# printed once per chunk start.
for i in range(0, total_length, chunk_size):
    print(total_length)
    # print(i, i + chunk_size)
    # total_length = (total_length // chunk_size) * chunk_size
    # Largest multiple of chunk_size that does not exceed total_length.
    print((total_length // chunk_size) * chunk_size)
    # print(total_length // chunk_size)



285
256
285
256
285
256


In [15]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized rows and re-split it into equal-length chunks.

    Args:
        examples: Mapping of feature name to a list of per-row lists. It must
            contain an "input_ids" entry, and every feature is expected to
            flatten to the same total length.
        block_size: Length of every produced chunk. When None (the default,
            which is what a plain ``map(group_texts, batched=True)`` call
            uses) the notebook-level ``chunk_size`` is used.

    Returns:
        A dict with the same keys as ``examples`` plus "labels". Every value is
        a list of chunks of exactly ``block_size`` items; trailing items that do
        not fill a whole chunk are dropped. "labels" holds independent copies
        of the "input_ids" chunks.
    """
    from itertools import chain

    size = chunk_size if block_size is None else block_size

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the accumulated list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first feature)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than the chunk size
    total_length = (total_length // size) * size
    # Split by chunks of `size` items
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied so that mutating an
    # "input_ids" chunk in place (e.g. when masking) cannot alter its labels.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [16]:
# Apply group_texts batch-wise: rows are concatenated and re-cut into
# chunk_size-token chunks, and a "labels" column is added.
us_datasets = tokenized_datasets.map(group_texts, batched=True)
us_datasets

  0%|          | 0/11 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1254
    })
})

In [17]:
from transformers import DataCollatorForLanguageModeling

# Language-modeling collator configured with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [18]:
# Preview the collator's masking on the first two chunks.
samples = [us_datasets["train"][i] for i in range(2)]
for sample in samples:
    # Drop word_ids before collation.
    # NOTE(review): presumably this collator cannot batch that column — confirm.
    _ = sample.pop("word_ids")

# Decode the collated input_ids so the inserted mask tokens are visible.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i want to have the 12 - [MASK] - 2017 del [MASK]ons processed [SEP] [CLS] i want to redesign the resources page [SEP] [CLS] it matches the new broker design [MASK] [SEP] [CLS] i want to report to the agencies about user testing [SEP] [CLS] they are aware of their [MASK] to making broker a better [MASK] [MASK] [SEP] [CLS] i [MASK] to move on to round 2 of [MASK]bs or fabs landing page edits [SEP] [CLS] i can get [MASK]s from leadership [SEP] [CLS] i want [MASK] move on to round 2 of homepage edits [SEP] [CLS] i want [MASK] move on to round 3 of the help page edits [SEP] [CLS] i want to be able to'

'>>> log better [SEP] [CLS] i can troubleshoot issues with particular submissions and [MASK] [SEP] [CLS] i want to add the [MASK] on a fabs submission [MASK] be modified when the publishstatus [MASK] [SEP] [CLS] [MASK] know when the status of the submission has changed [SEP] [CLS] i want new relic to provide useful data across all applications [SEP] [CLS] i want to move on [MASK] 

In [19]:
import collections
import numpy as np

from transformers import default_data_collator

# Per-word probability used by whole_word_masking_data_collator when drawing the mask.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Mask whole words in each feature, then collate with default_data_collator.

    Args:
        features: List of dicts, each with "word_ids", "input_ids" and "labels".
            The dicts are modified in place: "word_ids" is removed, masked
            positions in "input_ids" are overwritten with the tokenizer's mask
            token id, and "labels" is replaced.

    Returns:
        The result of ``default_data_collator(features)``. In the replaced
        labels, every position that was not masked is set to -100.
    """
    for feature in features:
        token_word_ids = feature.pop("word_ids")

        # Group token positions by word: a new group starts whenever the word
        # id changes; positions whose word id is None belong to no group.
        word_to_positions = []
        previous_word = None
        for position, word_id in enumerate(token_word_ids):
            if word_id is None:
                continue
            if word_id != previous_word:
                previous_word = word_id
                word_to_positions.append([])
            word_to_positions[-1].append(position)

        # One Bernoulli draw per word decides whether that word is masked.
        selected = np.random.binomial(1, wwm_probability, (len(word_to_positions),))
        token_ids = feature["input_ids"]
        original_labels = feature["labels"]
        masked_labels = [-100] * len(original_labels)
        for word_index in np.where(selected)[0].tolist():
            for position in word_to_positions[word_index]:
                # Keep the true token as the target and hide it in the input.
                masked_labels[position] = original_labels[position]
                token_ids[position] = tokenizer.mask_token_id
        feature["labels"] = masked_labels

    return default_data_collator(features)

In [20]:
# Preview whole-word masking on the first two chunks (fresh copies are fetched
# because the collator modifies the feature dicts it receives).
samples = [us_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i want to have the 12 - [MASK] - 2017 [MASK] [MASK] [MASK] processed [SEP] [CLS] i want to redesign the resources page [SEP] [CLS] it [MASK] the new broker design styles [SEP] [CLS] i [MASK] [MASK] report to the agencies about user [MASK] [SEP] [CLS] they are aware [MASK] their contributions [MASK] making broker a [MASK] ux [SEP] [CLS] i want to move on to [MASK] 2 of dabs or fabs landing [MASK] edits [SEP] [CLS] i can get approvals from leadership [SEP] [CLS] i want to move [MASK] to round 2 of homepage edits [SEP] [CLS] i [MASK] to move on to round 3 of the help [MASK] edits [SEP] [CLS] [MASK] want [MASK] be able to'

'>>> log [MASK] [SEP] [CLS] i can troubleshoot [MASK] with particular [MASK] and functions [SEP] [CLS] [MASK] want to [MASK] the updates on a [MASK] [MASK] submission to be [MASK] [MASK] the [MASK] [MASK] [MASK] changes [SEP] [CLS] i know when [MASK] status of the submission has [MASK] [SEP] [CLS] i want new relic [MASK] provide useful data [MASK] all applic

In [21]:
# Hold out 10% of the chunks (rounded down) as a test split; the fixed seed
# keeps the split repeatable.
train_size = len(us_datasets['train'])
test_size = int(0.1 * train_size)
print(test_size)
downsampled_dataset = us_datasets["train"].train_test_split(
    train_size=train_size - test_size, test_size=test_size, seed=42
)
downsampled_dataset

125


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1129
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 125
    })
})

In [22]:
# from transformers import TrainingArguments

# batch_size = 32
# # Show the training loss with every epoch
# logging_steps = len(downsampled_dataset["train"]) // batch_size
# model_name = model_checkpoint.split("/")[-1]

# training_args = TrainingArguments(
#     output_dir=f"{model_name}-finetuned-imdb",
#     overwrite_output_dir=True,
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     push_to_hub=False,
#     fp16=True,
#     logging_steps=logging_steps,
# )

In [23]:
# from transformers import Trainer

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=downsampled_dataset["train"],
#     eval_dataset=downsampled_dataset["test"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )

In [24]:
# import math

# eval_results = trainer.evaluate()
# print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [25]:
# trainer.train()

In [26]:
# eval_results = trainer.evaluate()
# print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [27]:
# import torch
# torch.cuda.empty_cache()
# del trainer

In [28]:
def insert_random_mask(batch):
    """Run the notebook's data_collator over a column-oriented batch.

    Args:
        batch: Mapping of column name to a list of per-row values.

    Returns:
        A dict with one "masked_<column>" entry per key of the collator output,
        each converted to a NumPy array.
    """
    # Turn the column-oriented batch into one dict per row.
    column_names = list(batch)
    rows = []
    for row_values in zip(*batch.values()):
        rows.append(dict(zip(column_names, row_values)))

    collated = data_collator(rows)

    # Create a new "masked" column for each column returned by the collator.
    masked_columns = {}
    for name, tensor in collated.items():
        masked_columns["masked_" + name] = tensor.numpy()
    return masked_columns

In [29]:
# Drop the word_ids column before building the evaluation set. The guard keeps
# this cell re-runnable: once the column is gone, an unconditional
# remove_columns(["word_ids"]) fails on the second execution.
if "word_ids" in downsampled_dataset["train"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

# Mask the test split once, up front, so every epoch is evaluated on the same
# masked inputs. The original columns are replaced by their "masked_*" versions.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the names the model expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
# Display the training split to confirm which columns remain.
downsampled_dataset["train"]
# downsampled_dataset = downsampled_dataset.remove_columns(["token_type_ids"])

# downsampled_dataset["train"]['token_type_ids']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1129
})

In [32]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Batch size shared by both loaders (also read by the evaluation loop).
batch_size = 16
# Training batches are shuffled and passed through data_collator on every draw.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# eval_dataset was already masked by insert_random_mask, so it is collated
# with default_data_collator instead of data_collator.
eval_dataloader = DataLoader(
    eval_dataset, 
    batch_size=batch_size, 
    collate_fn=default_data_collator
)

In [33]:
from accelerate import Accelerator


In [34]:
# Reload the checkpoint so this cell always starts from a fresh copy of the model.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)


# Hand the model, optimizer and both dataloaders to Accelerate.
# NOTE(review): re-running this cell passes the already-prepared dataloaders to
# prepare() again — confirm that is harmless or rebuild them first.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [35]:
from transformers import get_scheduler

num_train_epochs = 3
# The training loop takes one optimizer step per batch, so the number of
# update steps per epoch equals the number of training batches.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [36]:
from tqdm.auto import tqdm
import torch
import math
# Directory that receives the model and tokenizer at the end of every epoch.
output_dir = model_checkpoint + '_us'
# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # The batch loss is repeated batch_size times before being gathered.
        # NOTE(review): presumably so each example slot contributes one entry
        # across processes — confirm for multi-process runs.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Keep at most one entry per evaluation example; this trims the surplus
    # entries created by repeating the loss of a final, smaller batch.
    losses = losses[: len(eval_dataset)]
    try:
        # Perplexity is the exponential of the mean evaluation loss.
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp overflowed: report an infinite perplexity instead of failing.
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    # NOTE(review): only local saving happens here; nothing is uploaded.
    accelerator.wait_for_everyone()
    # Save the underlying model (without the accelerator wrapper) into
    # output_dir; each epoch writes to the same directory.
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/213 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 9.80658280496515
>>> Epoch 1: Perplexity: 8.848528035560083
>>> Epoch 2: Perplexity: 8.64938196503863
