- Load a dataset

- Make it as a iterator

- Load a tokeniser

- Write a processing function 

- Map it to the dataset 

- Create a new tokeniser

- Train it with the dataset

- Write the post processing function

- Run the evaluation 

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig
)
from datasets import load_dataset

In [18]:
# work with imdb data again
imdb_ds = load_dataset('imdb')
glue_cola = load_dataset('glue', 'cola')

In [4]:
imdb_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [19]:
glue_cola

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [5]:
model_name = "distilbert-base-uncased"
model_config = AutoConfig.from_pretrained(model_name)

In [6]:
model_config.architectures

['DistilBertForMaskedLM']

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
max_len = tokenizer.model_max_length

In [9]:
imdb_ds['train'].features['label']

ClassLabel(names=['neg', 'pos'], id=None)

In [34]:
glue_cola['train'][0]['label']

1

In [10]:
max_sent_len = max([len(row['text'].split(' ')) for row in imdb_ds['train']])
max_sent_len

2470

In [12]:
max_sent_len = max([len(row['text'].split(' ')) for row in imdb_ds['test']])
max_sent_len

2278

In [30]:
def token_cola(row):
    return tokenizer(row['text'],padding=True,
                    truncation=True)

In [15]:
token_cola(imdb_ds['train'][0])

{'input_ids': [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107, 2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007, 2014, 3689, 383

In [31]:
tokened_ds = imdb_ds.map(token_cola,)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [32]:
tokened_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [63]:
# testing if there is labels apart from 0 and 1

set([row['label'] for row in glue_cola['test']])

{-1}

In [33]:
id2label = {0:"neg", 1:"pos"}
label2id = {"neg":0, "pos":1}

In [34]:
# no need to push to cuda
model_wts = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=2,
                                                              id2label=id2label,
                                                              label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# no need to push to cuda, no additional parameters
model_wts = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                              num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
CUDA_LAUNCH_BLOCKING="1"

In [35]:
from evaluate import load
from datasets import load_metric
accuracy = load('accuracy')
metric = load_metric('glue', 'cola')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [46]:
targs = TrainingArguments(
    output_dir="/home/kamal/training_files/prac",
    report_to="none",
    push_to_hub=False,
    evaluation_strategy="epoch",
    # eval_steps=500,
    save_strategy="epoch",
    # save_steps=500,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model=metric,
)

In [37]:
import numpy as np

def compute_accuracy(eval_pred):
    # print(eval_pred)
    preds, labels = eval_pred,
    preds = np.argmax(preds, axis=1)
    return accuracy.compute(predictions=preds, references=labels)

In [15]:
def compute_corr(eval_pred):
    # print(eval_pred)
    preds, labels = eval_pred,
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=labels)

In [40]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:

trainer = Trainer(
    train_dataset=tokened_ds['train'],
    eval_dataset=tokened_ds['test'],
    compute_metrics=compute_accuracy,
    args=targs,
    model=model_wts,
    data_collator=data_collator
)

In [48]:
trainer.train()

Epoch,Training Loss,Validation Loss


ValueError: not enough values to unpack (expected 2, got 1)