In [6]:
from utils import init_experiment
# Checkpoint identifier; reused further down when the tokenizer and the
# classification model are loaded.
model_name = "bert-base-uncased"
# Hand the run configuration to the project's experiment bootstrapper.
# NOTE(review): presumably this starts the wandb run (the cell output shows a
# wandb login message) — confirm in utils.init_experiment.
init_experiment(dict(model=model_name))

[34m[1mwandb[0m: Currently logged in as: [33mkydlicek-hynek[0m ([33mhynky[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
import pandas as pd
# Read the pre-built train/test feature-store splits from their parquet folders.
train, test = (
    pd.read_parquet("../artifacts/feature_store/yelp2020/train/"),
    pd.read_parquet("../artifacts/feature_store/yelp2020/test/"),
)


In [12]:
# Work on a 10,000-row random subset of the full training frame (the original
# note puts the full data at ~800k rows) so that fine-tuning stays fast.
# The seed is fixed so the same subset is drawn on every fresh run.
# NOTE(review): this cell rebinds `train`, so re-running it without reloading
# the parquet data would try to draw 10,000 rows from the 9,000-row split.
train_subset = train.sample(10000, random_state=42)
# Split the subset 90/10 into train and validation.
# The validation rows are dropped from the *subset*, not from the new `train`:
# calling `train.drop(train.index)` on the already-split frame removes every
# row and always yields an empty validation frame.
# NOTE(review): dropping by index label assumes the index labels are unique — confirm.
train = train_subset.sample(frac=0.9, random_state=42)
val = train_subset.drop(train.index)

In [18]:
from transformers import AutoTokenizer, PreTrainedTokenizer
from datasets import Dataset, ClassLabel
# Label vocabulary: the distinct `usefulness` values, in order of first
# appearance in the test frame.
# NOTE(review): derived from the test split only — a value that occurs only in
# train/val would be missing from this list; confirm all splits share labels.
labels = pd.unique(test["usefulness"]).tolist()


def add_tokenization(dst: Dataset, tokenizer: PreTrainedTokenizer):
    """Run `tokenizer` over the "text" column of `dst` and return the mapped dataset.

    The tokenizer is applied through `dst.map` in batches of 1000 examples,
    with truncation enabled and padding set to "max_length".

    Args:
        dst: Dataset exposing a "text" column and a `map` method.
        tokenizer: Callable tokenizer applied to each batch of texts.

    Returns:
        Whatever `dst.map` returns for the tokenizing function.
    """
    def _encode_texts(examples):
        # `examples` is one batch; only its "text" entry is tokenized.
        texts = examples["text"]
        return tokenizer(texts, truncation=True, padding="max_length")

    tokenized = dst.map(_encode_texts, batched=True, batch_size=1000)
    return tokenized

def convert_to_hf_trainable_dataset(df: pd.DataFrame, tokenizer: PreTrainedTokenizer):
    """Turn a pandas frame into a tokenized `Dataset` with a `labels` column.

    Steps, in order:
      1. build a `Dataset` from `df`;
      2. cast the "usefulness" column to a `ClassLabel` whose names are the
         module-level `labels` list;
      3. rename "usefulness" to "labels";
      4. tokenize via `add_tokenization`.

    Args:
        df: Frame holding at least the "usefulness" and "text" columns.
        tokenizer: Tokenizer forwarded to `add_tokenization`.

    Returns:
        The dataset produced by `add_tokenization`.
    """
    hf_dataset = (
        Dataset.from_pandas(df)
        .cast_column("usefulness", ClassLabel(names=labels))
        .rename_column("usefulness", "labels")
    )
    return add_tokenization(hf_dataset, tokenizer)

In [22]:
from transformers import AutoTokenizer
# Tokenizer matching the checkpoint named by `model_name` (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Convert each split, in train -> val -> test order, into a tokenized dataset.
train_dataset, val_dataset, test_dataset = (
    convert_to_hf_trainable_dataset(split_frame, tokenizer)
    for split_frame in (train, val, test)
)

Casting the dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/350647 [00:00<?, ? examples/s]

Map:   0%|          | 0/350647 [00:00<?, ? examples/s]

In [35]:
from transformers import AutoModelForSequenceClassification

def freeze_all_but_classifier_and_last_layers(model, num_layers):
    """Freeze the base model except its last `num_layers` encoder layers.

    Every parameter of `model.base_model` is frozen first; the parameters of
    the last `num_layers` entries of `model.base_model.encoder.layer` and all
    parameters of `model.classifier` are then marked trainable.

    Args:
        model: Model exposing `base_model.encoder.layer` (a sliceable sequence
            of modules) and a `classifier` module.
        num_layers: How many trailing encoder layers stay trainable. `0` keeps
            the whole encoder frozen; values larger than the number of layers
            unfreeze all of them.

    Returns:
        The same `model` object, modified in place.

    Raises:
        ValueError: If `num_layers` is negative.
    """
    if num_layers < 0:
        raise ValueError(f"num_layers must be non-negative, got {num_layers}")

    for param in model.base_model.parameters():
        param.requires_grad = False

    encoder_layers = model.base_model.encoder.layer
    # `encoder_layers[-0:]` is the *entire* sequence, so zero must be handled
    # explicitly or "unfreeze no layers" would unfreeze every layer.
    trainable_layers = encoder_layers[-num_layers:] if num_layers > 0 else []
    for layer in trainable_layers:
        for param in layer.parameters():
            param.requires_grad = True

    # Assigning `requires_grad` on the module object only creates a plain
    # attribute and leaves its parameters untouched; flip each parameter instead.
    for param in model.classifier.parameters():
        param.requires_grad = True
    return model

def get_model_params(model):
    """Return the total element count of the parameters with `requires_grad` set.

    Args:
        model: Object whose `parameters()` yields tensors exposing
            `requires_grad` and `numel()`.

    Returns:
        Sum of `numel()` over the parameters that require gradients.
    """
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

# Load the checkpoint with a sequence-classification head sized to the label list.
transformer = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
)
# Freeze the base model, keeping only its final encoder layer trainable.
transformer = freeze_all_but_classifier_and_last_layers(model=transformer, num_layers=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [34]:
get_model_params(transformer)

2307

In [179]:
from transformers import TrainingArguments
from transformers import Trainer
from utils import get_result, get_model_folder

# Hyper-parameters for a single fine-tuning epoch, logged to wandb every 10 steps.
training_args = TrainingArguments(
    output_dir=get_model_folder(),
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Warm up over the first 10% of optimizer steps rather than a fixed 500:
    # one epoch over the 9,000-example training split at batch size 64 is only
    # about 141 steps on a single device, so a 500-step warmup would end
    # training before the learning rate ever reached its configured value.
    warmup_ratio=0.1,
    logging_steps=10,
    report_to="wandb",
)



def compute_metrics(pred):
    """Build the metrics dict the Trainer reports for an evaluation pass.

    Args:
        pred: Prediction object supplied by the Trainer; its `label_ids` and
            `predictions` attributes are read. Predicted class ids are the
            argmax of `predictions` over the last axis.

    Returns:
        A dict with the single key "f1_macro", holding the `metric_value` of
        the project helper `get_result` (presumably macro-F1, per the key
        name — confirm in utils.get_result).
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {"f1_macro": get_result(pred.label_ids, predicted_ids).metric_value}


# Use the tokenized splits under the names they are given where they are
# built earlier in this notebook (`train_dataset` / `val_dataset`). The former
# `train_yelp_dataset` / `val_yelp_dataset` names are not defined anywhere in
# the visible notebook and raise NameError on a fresh kernel.
trainer = Trainer(
    model=transformer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Launch fine-tuning with the Trainer configured in the previous cell. As the
# cell's last expression, the value returned by `train()` is shown as output.
trainer.train()