In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): the Trainer further down sets push_to_hub=False, so this login
# is presumably only needed if the dataset repo is private -- confirm.
notebook_login()

In [None]:
from datasets import load_dataset

# Load the news-article dataset. Later cells read its 'title' and 'content'
# columns and index its "train" split.
articles = load_dataset('online_news_popularity_data')

In [None]:
from random import shuffle

def duplicate_shuffle_concatenate_texts(batch, n_copies=3, chunk_size=800):
    """Turn a batch of articles into fixed-size text chunks.

    Every article is rendered as its title, a space plus two newlines, and
    its content. The rendered articles are shuffled ``n_copies`` times; each
    shuffled pass is joined with a space plus three newlines, and the passes
    are joined with that same separator. The resulting string is then cut
    into consecutive, non-overlapping pieces of ``chunk_size`` characters.

    Args:
        batch: Mapping with equally long 'title' and 'content' sequences of
            strings (the shape a batched ``Dataset.map`` call passes in).
        n_copies: How many independently shuffled passes over the batch are
            concatenated. Defaults to 3.
        chunk_size: Length in characters of every returned chunk.
            Defaults to 800.

    Returns:
        ``{'text': [...]}`` where every string has exactly ``chunk_size``
        characters. A trailing remainder shorter than ``chunk_size`` is
        dropped, so the list is empty when the batch is too small.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    article_separator = ' \n\n\n'
    full_text = [t + ' \n\n' + c for t, c in zip(batch['title'], batch['content'])]

    shuffled_passes = []
    for _ in range(n_copies):
        # shuffle() reorders in place; join() snapshots the order of this pass.
        shuffle(full_text)
        shuffled_passes.append(article_separator.join(full_text))

    # Joining the passes (rather than appending a separator after each one and
    # slicing it off afterwards) guarantees no partial separator is left at
    # the end of the text.
    texts = article_separator.join(shuffled_passes)

    # Stop before the last incomplete window so every chunk is full-sized.
    last_start = len(texts) - chunk_size
    return {
        'text': [
            texts[start:start + chunk_size]
            for start in range(0, last_start + 1, chunk_size)
        ]
    }

In [None]:
# Shuffle the articles, then map every batch of 64 articles through
# duplicate_shuffle_concatenate_texts. All original columns are removed, so the
# result only holds the 'text' column returned by that function.
article_texts = articles.shuffle().map(duplicate_shuffle_concatenate_texts, batched = True,
            batch_size = 64, remove_columns=articles["train"].column_names)

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the masked-LM model loaded
# in a later cell.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def preprocess_function(batch):
    """Tokenize the 'text' entries of a batch with the notebook-level tokenizer.

    Over-long inputs are truncated and no padding is added at this stage.
    """
    raw_texts = batch['text']
    encoded = tokenizer(raw_texts, truncation=True, padding=False)
    return encoded

In [None]:
# Shuffle the text chunks and tokenize them in batches of 64 using 10 worker
# processes. The raw 'text' column is removed, so only the columns returned by
# preprocess_function remain.
# NOTE(review): shuffle() is called without a seed, so the row order presumably
# differs between runs -- pass a seed if reproducibility matters.
tokenized_article_texts = article_texts.shuffle().map(
    preprocess_function,
    batched=True,
    num_proc=10,
    batch_size = 64,
    remove_columns= ['text'],
)

In [None]:
# Show the tokenized dataset as this cell's output.
tokenized_article_texts

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modelling: mlm_probability=0.15 is the fraction
# of tokens selected for masking in each batch. It is handed to the Trainer
# below as its data_collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
from transformers import AutoModelForMaskedLM

# Load the pretrained checkpoint (same name as the tokenizer) with a
# masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_ckpt)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
from transformers import TrainingArguments, Trainer

# Per-device batch size for both training and evaluation. No earlier cell
# defines this name, so without this assignment a fresh-kernel run stops with
# a NameError. 64 is a starting point; lower it if the device runs out of
# memory.
batch_size = 64

training_args = TrainingArguments(
    output_dir="to_delete",
    # NOTE(review): newer transformers releases spell this `eval_strategy` --
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    disable_tqdm=False,
    save_strategy='epoch',
    # Mixed precision is only requested when the model actually runs on CUDA;
    # `device` falls back to the CPU when no GPU is present.
    fp16=(device.type == "cuda"),
    # NOTE(review): per the TrainingArguments docs this field is not consumed
    # by Trainer itself; resuming is controlled by the argument of
    # trainer.train(). Kept unchanged -- confirm it can be dropped.
    resume_from_checkpoint=False,
)

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_article_texts["train"],
    # NOTE(review): assumes the dataset ships a "validation" split -- confirm.
    eval_dataset=tokenized_article_texts["validation"],
    data_collator=data_collator,
)

# Checkpoints are written under "to_delete" once per epoch (save_strategy).
# NOTE(review): a later cell loads 'domain_adaptation_final', but nothing in
# this notebook writes that directory -- presumably it was copied from one of
# these checkpoints; consider an explicit trainer.save_model(...) call.
trainer.train()

In [1]:
from transformers import DistilBertModel

In [2]:
# Load the domain-adapted checkpoint into the bare DistilBERT encoder. The
# checkpoint also stores masked-LM head tensors (vocab_transform,
# vocab_layer_norm, vocab_projector); DistilBertModel does not use them, which
# is what the warning printed below reports.
model = DistilBertModel.from_pretrained('domain_adaptation_final')

Some weights of the model checkpoint at domain_adaptation_final were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
