## Data Tokenization and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import datasets

In [2]:
articles = datasets.load_dataset('online_news_popularity_data')

Found cached dataset online_news_popularity_data (/home/leeparkuky/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/63eb244b62e86df6ad3ae3034fcbddd6ed2840885e607a97d5e8f49afab926e0)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
from transformers import AutoTokenizer
ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(ckpt)

In [4]:
def concatenate_fernandes_variables(examples):
    fernandes = [val for key, val in examples.items() if key not in ['title','content','shares','shares_class']]
    fernandes = np.array(fernandes).T.tolist()
    return {'fernandes': fernandes}

In [5]:
articles_concat = articles.map(concatenate_fernandes_variables, batched = True, batch_size = 64, num_proc = 16,
                              remove_columns = [x for x in articles.column_names['train'] if x not in ['title','content','shares','shares_class']] )

Loading cached processed dataset at /home/leeparkuky/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/63eb244b62e86df6ad3ae3034fcbddd6ed2840885e607a97d5e8f49afab926e0/cache-d040c91994313900_*_of_00016.arrow


In [6]:
def tokenize(examples):
    text = [title + content for title, content in zip(examples['title'], examples['content'])]
    return tokenizer(text, max_length = 512, truncation = True, padding = True)

In [7]:
articles_tokenized = articles_concat.map(tokenize, batched = True, batch_size = 64, num_proc = 16,
                   remove_columns = ['shares','shares_class','title','content'])

Loading cached processed dataset at /home/leeparkuky/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/63eb244b62e86df6ad3ae3034fcbddd6ed2840885e607a97d5e8f49afab926e0/cache-5d17139988fb0a89_*_of_00016.arrow


In [8]:
articles_tokenized.set_format('pt')

# Tesing if "MashableBertForMaskedLM" works

In [10]:
from Transformer_Models import MashableBertForMaskedLM

model_ckpt = 'bert-base-uncased'
model = MashableBertForMaskedLM(model_ckpt)

In [11]:
model(**{k:v for k,v in articles_tokenized['train'][:3].items()})

MaskedLMOutput(loss=None, logits=tensor([[[ 0.0000, -0.1032,  0.2310,  ..., -0.2509,  0.3616, -0.3281],
         [ 0.0000, -0.1271,  0.6196,  ...,  0.4624, -0.5777, -0.2988],
         [ 0.0000, -0.1600,  0.7666,  ..., -0.0365,  0.0541,  0.2170],
         ...,
         [ 0.0000,  0.0435,  0.6025,  ..., -0.1321,  0.3602, -0.1723],
         [ 0.0000, -1.0553,  1.2979,  ...,  0.0959, -0.4221, -0.0721],
         [ 0.0000, -0.9967,  0.8056,  ...,  0.5052, -0.7550,  0.2571]],

        [[ 0.0000, -0.4462,  0.0446,  ..., -0.1923, -0.0159,  0.3493],
         [ 0.0000, -0.4520,  0.7410,  ...,  0.0731,  0.1499,  0.2078],
         [ 0.0000,  0.6072,  0.7652,  ..., -0.3357, -0.5743, -0.6377],
         ...,
         [ 0.0000,  0.3018,  0.8404,  ...,  0.4595, -0.0907,  0.0118],
         [ 0.0000, -0.5260,  0.8660,  ...,  0.5451, -0.1357,  0.0369],
         [ 0.0000, -0.4300,  0.7983,  ...,  0.5482, -0.3280,  0.1211]],

        [[ 0.0000, -0.2728,  0.1408,  ..., -0.1378,  0.0690, -0.1533],
         [ 0

# Organizing Training Job for MaskedLM

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [10]:
from Transformer_Models import MashableBertForMaskedLM

model_ckpt = 'bert-base-uncased'
model = MashableBertForMaskedLM(model_ckpt)

In [11]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [12]:
print_gpu_utilization()

GPU memory occupied: 880 MB.


In [13]:
from transformers import TrainingArguments, Trainer
import torch

training_args = TrainingArguments(
    output_dir="pretraining-mashablebert",
    overwrite_output_dir = True,
    evaluation_strategy="epoch",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 4,
    fp16 = True,
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim = 'adafactor'
)


trainer = Trainer(
    model=model.to(torch.device('cuda')),
    args=training_args,
    train_dataset=articles_tokenized["train"],
    eval_dataset=articles_tokenized["train"].shuffle().select(range(64)),
    data_collator=data_collator,
)



In [14]:
print_gpu_utilization()

GPU memory occupied: 1908 MB.


In [15]:
trainer.train(resume_from_checkpoint = True)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
8,6.4075,6.452059
9,6.3981,6.355864


TrainOutput(global_step=12370, training_loss=1.2272211270652285, metrics={'train_runtime': 3554.9628, 'train_samples_per_second': 111.416, 'train_steps_per_second': 3.48, 'total_flos': 1.3346207328043008e+17, 'train_loss': 1.2272211270652285, 'epoch': 10.0})

In [21]:
trainer.eval_dataset = articles_tokenized['train'].train_test_split(.2)['test']

In [22]:
trainer.evaluate()

{'eval_loss': 6.367098331451416,
 'eval_runtime': 95.5804,
 'eval_samples_per_second': 82.883,
 'eval_steps_per_second': 20.726,
 'epoch': 10.0}

In [26]:
trainer.args.num_train_epochs = 13

In [27]:
trainer.train(resume_from_checkpoint = True)

Epoch,Training Loss,Validation Loss
9,6.3981,6.366686
10,6.3991,6.37288
11,6.3982,6.375506
12,6.3911,6.371051


TrainOutput(global_step=16081, training_loss=1.6235503040211954, metrics={'train_runtime': 6458.4946, 'train_samples_per_second': 79.725, 'train_steps_per_second': 2.49, 'total_flos': 1.735006952645591e+17, 'train_loss': 1.6235503040211954, 'epoch': 13.0})

In [31]:
trainer

<transformers.trainer.Trainer at 0x7f159fce64d0>

In [32]:
trainer.save_model('MashableBertForMaskedLM_Pretrained')

In [44]:
from Transformer_Models import MashableBertModel

bertmodel = MashableBertModel('MashableBertForMaskedLM_Pretrained')
# bertmodel.load_state_dict(torch.load('sample_weight.pth'))

In [61]:
weights = torch.load('MashableBertForMaskedLM_Pretrained/pytorch_model.bin')

In [62]:
# rename keys, drop unused keys for basemodel, then save the weights
for key in [key for key in weights.keys() if key[5:] not in bertmodel.state_dict().keys()]:
    del weights[key]
for key in [key for key in weights.keys() if key[5:] in bertmodel.state_dict().keys()]:
    weights[key[5:]] = weights[key]
    del weights[key]


In [63]:
bertmodel.load_state_dict(weights)

<All keys matched successfully>

In [57]:
#Finally save the basemodel weight in Model Weights folder
torch.save(bertmodel.state_dict(), 'Model Weights/MashableBertModel_Pretrained.pth')