In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from datasets import Features, Sequence, Value

In [2]:
# Declared column schema for train.csv.
# NOTE(review): this object is not passed to load_dataset in any visible cell, and
# it does not list the 'id' / 'standard_error' columns that are dropped after
# loading — confirm and extend it before wiring it in via `features=`.
features = Features({
    'url_legal': Value('string'),
    'license': Value('string'),
    # One plain string per row: the tokenization cell hands e['excerpt'] directly
    # to the tokenizer, so this is a scalar string column, not a Sequence.
    'excerpt': Value('string'),
    'target': Value('float32')
})

In [3]:
def _copy_target_to_labels(batch):
    """Return a new 'labels' column holding the batch's 'target' values."""
    return {'labels': batch['target']}


# Read the training CSV, expose the regression target under the 'labels' name,
# then drop the columns that are not needed further down.
d = load_dataset('csv', data_files={'train': 'train.csv'})
d = d.map(_copy_target_to_labels, batched=True)
data = d['train'].remove_columns(['target', 'standard_error', 'id'])

Using custom data configuration default-4d9f3027cc262fd9
Reusing dataset csv (/Users/eleme/.cache/huggingface/datasets/csv/default-4d9f3027cc262fd9/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)
Loading cached processed dataset at /Users/eleme/.cache/huggingface/datasets/csv/default-4d9f3027cc262fd9/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-6ba05e9cad204f5d.arrow


In [4]:
# Checkpoint shared by the tokenizer and the model.
MODEL_NAME = "bert-base-cased"

# Sequence-classification head with a single output (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [5]:
# Every excerpt is truncated / padded to exactly this many tokens.
MAX_LENGTH = 512


def _tokenize_batch(batch):
    """Tokenize a batch of excerpts to fixed-length (MAX_LENGTH) encodings."""
    return tokenizer(
        batch['excerpt'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )


data = data.map(_tokenize_batch, batched=True)

Loading cached processed dataset at /Users/eleme/.cache/huggingface/datasets/csv/default-4d9f3027cc262fd9/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-1112d41b6b15e79d.arrow


In [6]:
# Format the model inputs and the labels as torch tensors (modifies `data` in place).
model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
data.set_format(type='torch', columns=model_columns)

In [7]:
# Display the dataset summary (feature names and row count) as a sanity check.
data

Dataset({
    features: ['attention_mask', 'excerpt', 'input_ids', 'labels', 'license', 'token_type_ids', 'url_legal'],
    num_rows: 2834
})

In [8]:
# compute_metrics is handed to the Trainer through its `compute_metrics` argument.
def compute_metrics(pred_results):
    """Compute the root-mean-squared error for an evaluation pass.

    Args:
        pred_results: object exposing `predictions` and `label_ids` array-likes
            (what the Trainer passes to `compute_metrics`). Both are flattened
            to 1-D before comparison, so an (N, 1) prediction array lines up
            with an (N,) label array.

    Returns:
        dict with a single key, "rmse", mapped to a Python float.
    """
    # reshape(-1) rather than squeeze(): squeeze turns a single-row prediction
    # array into a 0-d scalar, while reshape(-1) always yields a 1-D vector.
    y_pred = np.asarray(pred_results.predictions, dtype=np.float64).reshape(-1)
    y_true = np.asarray(pred_results.label_ids, dtype=np.float64).reshape(-1)
    # RMSE is computed inline: no `rmse` helper is defined or imported in this notebook.
    return {"rmse": float(np.sqrt(np.mean((y_true - y_pred) ** 2)))}

In [16]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the shuffle, and
# therefore the split and the reported metrics, identical on every run.
SPLIT_SEED = 42
dataset = data.train_test_split(test_size=0.3, seed=SPLIT_SEED)

In [17]:
# Display the train/test split summary (feature names and row counts per split).
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'excerpt', 'input_ids', 'labels', 'license', 'token_type_ids', 'url_legal'],
        num_rows: 1983
    })
    test: Dataset({
        features: ['attention_mask', 'excerpt', 'input_ids', 'labels', 'license', 'token_type_ids', 'url_legal'],
        num_rows: 851
    })
})

In [None]:
# Training configuration.
# NOTE(review): learning_rate=3e-4 is well above the 2e-5..5e-5 range usually used
# for fine-tuning BERT — left unchanged here, but confirm it is intentional.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    learning_rate=3e-4,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size per device during evaluation
    logging_dir='./logs',            # directory for storing logs
    # The train split has 1983 rows, i.e. roughly 31 steps per epoch at batch
    # size 64 on one device (~93 steps in total), so a logging interval of 100
    # would never report the training loss.
    logging_steps=10,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    # load_best_model_at_end needs a checkpoint for every evaluation: save on
    # the same schedule as evaluation. The default step-based saving would not
    # write any checkpoint in a run this short.
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=dataset['train'],  # training split
    eval_dataset=dataset['test'],    # evaluation split
    compute_metrics=compute_metrics  # reports RMSE at each evaluation
)

train_out = trainer.train()
