In [None]:
from transformers import BertConfig, BertModel

# Create a configuration with no overrides, then instantiate a model from it.
# No checkpoint is loaded in this cell.
config = BertConfig()
model = BertModel(config=config)


In [None]:
# Show the configuration object created in the first cell.
print(config)

In [None]:
# Print the textual representation of the model built from that configuration.
print(model)

In [None]:
from transformers import BertModel

# Replace the config-built model with one loaded from a named checkpoint.
checkpoint = "bert-base-cased"
model = BertModel.from_pretrained(checkpoint)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classification model are both loaded from the same checkpoint
# identifier (an SST-2 sentiment checkpoint, judging by its name).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Display the model's `classifier` attribute.
model.classifier

### Model call

In [None]:
# Two example sentences reused by the following cells.
sentenses = [
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Tokenize each sentence separately, keeping both the individual results and
# a list holding them in sentence order.
token0, token1 = (tokenizer.tokenize(text) for text in sentenses)
tokens = [token0, token1]

In [None]:
# Display the per-sentence token lists built in the previous cell.
tokens

In [None]:
# Convert every token list into the tokenizer's ids, one result per sentence.
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))

In [None]:
# Call the tokenizer directly on the first sentence and show the type of
# object it returns.
input0 = tokenizer(sentenses[0])
type(input0)

In [None]:
# Display the result of `word_ids()` for the single-sentence encoding.
input0.word_ids()

In [None]:
# Encode both sentences in one call, with padding enabled and "pt" tensors
# requested. The trailing comments record options the author left disabled.
inputs = tokenizer(sentenses, padding=True, return_tensors="pt")  # truncation=True, max_length=4)
inputs # = inputs.to(device)


In [None]:
# Run the model on the batch, additionally requesting hidden states and
# attentions in the output.
output = model(**inputs, output_hidden_states=True, output_attentions=True)

In [None]:
# List which entries the model output contains.
output.keys()

In [None]:
# Print the id <-> label mappings stored on the model config, then display
# the logits from the forward pass above.
print(model.config.id2label)
print(model.config.label2id)

output.logits

In [None]:
# Container type, number of entries, and shape of the first hidden-state entry.
type(output.hidden_states), len(output.hidden_states), output.hidden_states[0].shape

In [None]:
# Container type, number of entries, and shape of the first attention entry.
type(output.attentions), len(output.attentions), output.attentions[0].shape

In [None]:
import torch

# Gold labels in batch order. The first sentence is positive and the second
# is negative; with this checkpoint's mapping (0 = NEGATIVE, 1 = POSITIVE --
# confirm against the `model.config.id2label` printout) that is [1, 0].
# The previous value [0, 1] labelled both sentences with the wrong class.
inputs['labels'] = torch.tensor([1, 0], dtype=torch.int64)

In [None]:
# Run the model again now that `inputs` also carries labels, and list the
# entries present in this output.
output = model(**inputs)
output.keys()

In [None]:
# Switch to evaluation mode and compute class probabilities inside
# torch.inference_mode().
model.eval()
with torch.inference_mode():
    logits = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    print(probabilities)

### Datasets

In [None]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset.
# Alternative left by the author for loading local text files:
#   mine_datasets = load_dataset("text", data_files={'train': "data/mrpc.txt", 'test': '...'})
raw_datasets = load_dataset(path="glue", name="mrpc")

In [None]:
# Display the loaded dataset object.
raw_datasets

In [None]:
# Keep a handle on the 'train' split for the inspection cells below.
raw_train_dataset = raw_datasets['train']

In [None]:
# First row of the train split together with the split's length.
raw_train_dataset[0], len(raw_train_dataset)

In [None]:
# Display the `features` attribute of the train split.
raw_train_dataset.features

#### Preprocessing

In [None]:
from transformers import AutoTokenizer

# From here on the notebook uses the tokenizer of this checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` entries of ``example`` together.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries
            (annotated by the author as
            ``dict[str, list | np.array | torch.tensor]``).

    Returns:
        The tokenizer's result for the two inputs, with truncation enabled
        (annotated by the author as
        ``dict[str, list | np.array | torch.tensor]``).
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Quick check on a single hand-written pair.
tokenize_function({'sentence1': 'abc', 'sentence2': 'cde'})

In [None]:
# Map without `batched=True`, using two worker processes; the two raw
# sentence columns are removed from the result.
tokenized_datasets_1 = raw_datasets.map(
    tokenize_function,
    num_proc=2,
    remove_columns=['sentence1', 'sentence2'],
)
tokenized_datasets_1

In [None]:
# Same mapping as the previous cell, but with `batched=True`.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=['sentence1', 'sentence2'],
)
tokenized_datasets

In [None]:
# Indexing the raw train split with a single integer.
raw_train_dataset[0]

In [None]:
# Indexing the raw train split with a slice, for comparison with the
# single-integer lookup in the previous cell.
raw_train_dataset[0:2]

In [None]:
# NOTE(review): this repeats the batched `map` call from a few cells earlier
# with identical arguments and rebinds `tokenized_datasets` to the new result.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, num_proc=2, remove_columns=['sentence1', 'sentence2'])
tokenized_datasets

In [None]:
from datasets import load_from_disk

# Write the tokenized datasets to disk, then read them back from the same
# location and rebind the name to the reloaded object.
tokenized_dir = 'data/glue/mrpc/tokenized'
tokenized_datasets.save_to_disk(tokenized_dir)
tokenized_datasets = load_from_disk(tokenized_dir)

In [None]:
# Display the datasets object that was just reloaded from disk.
tokenized_datasets

In [None]:
from datasets import Dataset
from pandas import DataFrame

# Build a one-column DataFrame and wrap it in a Dataset.
frame = DataFrame({'a' : [1, 2, 3]})
dataset = Dataset.from_pandas(frame)
dataset

#### Dynamic batching

In [None]:
# Take the first eight training rows, leave out the columns named below,
# and show how long each `input_ids` entry is.
dropped_columns = {"idx", "sentence1", "sentence2"}
samples = {
    name: column
    for name, column in tokenized_datasets["train"][:8].items()
    if name not in dropped_columns
}
list(map(len, samples["input_ids"]))


In [None]:
from transformers import DataCollatorWithPadding

# Wrap the tokenizer in a padding collator and apply it to the sample rows.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)

# Shape of every entry in the collated batch.
dict((key, value.shape) for key, value in batch.items())

### Trainer

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Self-contained setup for the Trainer section: dataset, tokenizer, collator.
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(example):
    """Tokenize ``sentence1`` and ``sentence2`` of ``example`` with truncation."""
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load a sequence-classification model from `checkpoint`, requesting two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
from transformers import TrainingArguments, Trainer

# Only the output directory is specified; every other training argument is
# left at the library's default.
training_args = TrainingArguments(output_dir="test-trainer")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Start training with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Run prediction over the validation split and print the shapes of the
# returned predictions and label ids.
validation_split = tokenized_datasets["validation"]
predictions = trainer.predict(validation_split)
print(predictions.predictions.shape, predictions.label_ids.shape)

#### Trainer with metrics

In [None]:
import numpy as np
from datasets import load_metric

import functools


@functools.lru_cache(maxsize=None)
def _load_mrpc_metric():
    """Load the GLUE/MRPC metric once; later calls return the same object."""
    return load_metric("glue", "mrpc")


def compute_metrics(eval_preds):
    """Score model outputs with the GLUE/MRPC metric.

    The metric is obtained through a cached loader, so it is loaded on the
    first evaluation only instead of on every call.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` for the arg-max class ids
        (taken over the last axis of ``logits``) against ``labels``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _load_mrpc_metric().compute(predictions=predictions, references=labels)


# Evaluate at the end of every epoch, for five epochs, starting from a
# freshly loaded model.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    num_train_epochs=5,
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train with the metrics-enabled trainer and keep the value `train()` returns.
trainer_output = trainer.train()

#### Complete example for SST-2

In [None]:
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding, 
    TrainingArguments,
    Trainer,
)

In [None]:
# Dataset, tokenizer and collator for the SST-2 example.
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(example):
    """Tokenize the ``sentence`` entry of ``example`` with truncation enabled."""
    text = example["sentence"]
    return tokenizer(text, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
import numpy as np

import functools


@functools.lru_cache(maxsize=None)
def _load_sst2_metric():
    """Load the GLUE/SST-2 metric once; later calls return the same object."""
    return load_metric("glue", "sst2")


def compute_metrics(eval_preds):
    """Score model outputs with the GLUE/SST-2 metric.

    The metric is obtained through a cached loader, so it is loaded on the
    first evaluation only instead of on every call.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` for the arg-max class ids
        (taken over the last axis of ``logits``) against ``labels``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _load_sst2_metric().compute(predictions=predictions, references=labels)

In [None]:
# Load a sequence-classification model from `checkpoint`, requesting two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy="epoch",  # related options: save_strategy, logging_strategy
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # Use the string "none" to turn off logging integrations. The Python value
    # None is treated as "not set" and falls back to the library default, so
    # it does not disable reporting.
    report_to="none",
    # a lot of other arguments
)
# Start from a freshly loaded model so this trainer does not reuse weights
# from any earlier run in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)