- Load a dataset

- Turn it into an iterator

- Load a tokeniser

- Write a processing function 

- Map it to the dataset 

- Create a new tokeniser

- Train it with the dataset

- Write the post processing function

- Run the evaluation 

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [3]:
# Load the IMDB dataset by its hub name.
imdb_ds = load_dataset("imdb")

In [4]:
# Display the loaded splits with their features and row counts.
imdb_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
# Hub name of the base checkpoint, and the path of a local snapshot directory.
# NOTE(review): model_path is an absolute path into one user's Hugging Face
# cache, so this cell only runs on that machine — consider loading by
# model_name instead.
model_name = "distilbert-base-uncased"
model_path = "/home/kamal/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/"
# Tokenizer loaded from the local snapshot directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [6]:
# Inspect the tokenizer's table of maximum input sizes per checkpoint name.
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [6]:
# Load the tokenizer by hub name; this is the instance that the tokenising
# and prediction helpers in this notebook call.
tokenizer_from_name = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Maximum sequence length reported by this tokenizer.
tokenizer_from_name.model_max_length

512

In [7]:
def tokenise_ds(row):
    """Tokenise a batch of dataset rows for sequence classification.

    Written for ``Dataset.map(..., batched=True)``, where ``row['text']`` is
    a list of review strings.

    Every sequence is truncated and padded to exactly
    ``tokenizer_from_name.model_max_length`` tokens. Padding to a fixed
    length (rather than ``padding=True``, which pads only to the longest
    text in the current ``map`` batch) guarantees that rows produced by
    different ``map`` batches have identical lengths, so they can always be
    stacked into one training batch.

    ``return_tensors`` is not requested: the mapping result is stored in the
    dataset's own columns, so plain lists are sufficient here.

    Args:
        row: Mapping with a ``'text'`` entry holding the text(s) to encode.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) for the
        given text(s).
    """
    return tokenizer_from_name(
        row['text'],
        padding='max_length',
        truncation=True,
        max_length=tokenizer_from_name.model_max_length,
    )

In [8]:
# Tokenise every split in batches, using 3 worker processes.
# NOTE(review): this also tokenises the 'unsupervised' split, which the
# Trainer in this notebook is not given.
tokenise_imdb = imdb_ds.map(tokenise_ds, batched=True, num_proc=3)

In [17]:
# NOTE(review): this import rebinds the name `print` to rich's printer for
# every cell that runs afterwards.
from rich import print

# Inspect one tokenised training example.
print(tokenise_imdb['train'][0])

In [16]:
import evaluate

# Accuracy metric object; compute_accuracy calls its .compute() method.
accuracy = evaluate.load('accuracy')

In [23]:
import numpy as np
def compute_accuracy(eval_pred):
    """Turn raw model scores into class ids and score them for accuracy.

    Args:
        eval_pred: A pair ``(scores, labels)``; the class id for each row is
            taken as the position of the largest score along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted ids and the
        reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_ids, references=references)
    return result

In [11]:
# Label-name <-> class-index lookups handed to the model configuration.
# The name -> index table is written out once and the reverse table is
# derived from it, so the two cannot drift apart.
label2id = {"negative": 0, "positive": 1}
id2label = {index: name for name, index in label2id.items()}

In [19]:
import gc

import torch

# Release GPU memory held by a model left over from an earlier run.
# `model_wt` only exists once the model-loading cell has been executed, so it
# is removed conditionally: a bare `del model_wt` raises NameError on a fresh
# kernel and breaks Restart & Run All.
globals().pop('model_wt', None)
gc.collect()  # drop lingering reference cycles before emptying the cache
torch.cuda.empty_cache()

In [12]:
# Load the base checkpoint with a two-label sequence-classification head and
# the label-name mappings stored in its config. The classification layers are
# newly initialised (see the warning in this cell's output), so the model
# must be fine-tuned before its predictions mean anything.
model_wt = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_path,
    num_labels=2,
    id2label=id2label,
    label2id= label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /home/kamal/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Move the model to the GPU.
# NOTE(review): `moved` is not referenced again in the visible notebook —
# presumably the Trainer handles device placement itself; confirm and drop
# this cell if so.
moved = model_wt.to('cuda')

In [24]:
# Fine-tuning configuration: 2 epochs, batch size 16 per device, evaluation
# every 500 steps and a checkpoint every 1500 steps; nothing is pushed to the
# hub or reported to an external tracker.
targs = TrainingArguments(
    output_dir='/home/kamal/training_files/distil_imbd',  # NOTE(review): absolute, user-specific path
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy='steps',
    save_strategy='steps',
    save_steps=1500,  # a multiple of eval_steps — presumably needed by load_best_model_at_end; confirm in the docs
    eval_steps=500,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

In [25]:
# Wire the model, the training arguments, the train/test splits and the
# accuracy metric into a Trainer.
# No data collator or tokenizer is passed here, so batching presumably relies
# on the examples having been padded during tokenisation — TODO confirm
# against the Trainer defaults.
trainer = Trainer(
    model=model_wt,
    args=targs,
    train_dataset=tokenise_imdb['train'],
    eval_dataset=tokenise_imdb['test'],
    compute_metrics=compute_accuracy
)

In [26]:
# Run fine-tuning; the cell output logs training loss, validation loss and
# accuracy at each evaluation step.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.0641,0.393557,0.91616
1000,0.1437,0.412702,0.9044
1500,0.116,0.311612,0.9252
2000,0.0983,0.319082,0.92812
2500,0.0964,0.2913,0.92888
3000,0.0847,0.314138,0.9302


TrainOutput(global_step=3126, training_loss=0.10026636386024441, metrics={'train_runtime': 1485.889, 'train_samples_per_second': 33.65, 'train_steps_per_second': 2.104, 'total_flos': 6623369932800000.0, 'train_loss': 0.10026636386024441, 'epoch': 2.0})

In [27]:
# Reload the fine-tuned model from the checkpoint-3000 directory.
# NOTE(review): the checkpoint path is hard-coded and user-specific; it only
# exists after a training run that used this output_dir.
trained_model_path = "/home/kamal/training_files/distil_imbd/checkpoint-3000/"
trained_model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)

In [48]:
# Move the reloaded model to the GPU (the cell output is the module's repr).
trained_model.to('cuda')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [49]:
# Smoke-test input: tokenise one sentence as PyTorch tensors and move them to
# the GPU, then display the encoded ids and attention mask.
test = "The training process was a good experience, and took a lot of time"
tokened_test = tokenizer_from_name(test, return_tensors='pt').to('cuda')
tokened_test

{'input_ids': tensor([[ 101, 1996, 2731, 2832, 2001, 1037, 2204, 3325, 1010, 1998, 2165, 1037,
         2843, 1997, 2051,  102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [50]:
# Forward pass on the smoke-test input; keep only the logits.
model_out = trained_model(**tokened_test).logits

In [51]:
# Position of the largest logit, mapped to its label name via the model config.
prediction = model_out.argmax().item()
trained_model.config.id2label[prediction]

'positive'

In [42]:
# Peek at one row of the unsupervised split (this row's label is -1).
imdb_ds['unsupervised'][10]

{'text': "This isn't the worst comedy of all-time, but that is about the best thing that I can say about this pathetic film. I didn't laugh once, or even smile once during this bomb. There was usually something going on on-screen, so I didn't get TOO bored, but most of the jokes here were simply awful. The final sequence is nothing more than a long series of people falling through doors and stumbling all over the place. Needless to say, it was a fitting way to end a movie that was impossible for me to like.",
 'label': -1}

In [52]:
def get_sentiment(row):
    """Predict the sentiment label for a single dataset row.

    Written for non-batched ``Dataset.map``: ``row['text']`` is one review
    string. The text is truncated to the tokenizer's maximum length, run
    through ``trained_model`` and the highest-scoring class is translated to
    its label name.

    Args:
        row: Mapping with a ``'text'`` entry holding one review string.

    Returns:
        ``{'prediction': <label name>}`` using the model config's
        ``id2label`` mapping.
    """
    encoded = tokenizer_from_name(
        row['text'],
        truncation=True,
        padding=True,
        max_length=tokenizer_from_name.model_max_length,
        return_tensors='pt',
    ).to(trained_model.device)  # follow the model instead of hard-coding 'cuda'

    # Inference only: skip autograd bookkeeping so no graph is built and kept
    # alive for each of the many rows this function is mapped over.
    with torch.no_grad():
        logits = trained_model(**encoded).logits

    # One row in, so the last axis holds the per-class scores for that row.
    predicted_id = logits.argmax(dim=-1).item()
    return {'prediction': trained_model.config.id2label[predicted_id]}

In [45]:
# Try the helper on a single row before mapping it over the whole split.
get_sentiment(imdb_ds['unsupervised'][0])

{'prediction': 'positive'}

In [53]:
# Add a 'prediction' value to every row of the unsupervised split; map is not
# batched here, so get_sentiment is called once per row.
predicting_imdb = imdb_ds['unsupervised'].map(get_sentiment)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [58]:
# Persist the predicted dataset to disk.
# NOTE(review): the path starts with '~' — confirm save_to_disk expands it to
# the home directory rather than creating a literal '~' directory.
predicting_imdb.save_to_disk(dataset_path='~/.cache/huggingface/datasets/imdb_unsupervised_predicted')

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]