In [1]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import random
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from transformers.optimization import AdamW

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [3]:
#To regenrate the same output
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

#Setting torch to gpu if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
print(device)

cuda


In [5]:
imdb_dataset = load_dataset('imdb')
train_dataset = imdb_dataset['train']
print(train_dataset[0])
test_dataset = imdb_dataset['test']

print("No of Train dataset = ",len(train_dataset), " No of Test Dataset = ", len(test_dataset))

Found cached dataset imdb (C:/Users/rjkhe/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [6]:
#Downloading the tokenizer for the base model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# use the tokenizer to preprocess the dataset by encoding and padding the datasets.
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)


In [7]:
#Apply the tokenizing function on the datasets
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))

test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

Loading cached processed dataset at C:\Users\rjkhe\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-23472a8ecfa5b9c9.arrow


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
print(train_dataset[0])


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [9]:
def train_model(model,save_name):

    training_args = TrainingArguments(
        output_dir='./results',          # output directory to save the model
        evaluation_strategy='steps',
        eval_steps = 1116000,            # giving random number greater than batch values to skip evaluation
        save_total_limit=5,
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=5, 
        weight_decay=0.01,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,         
        eval_dataset=None,
        compute_metrics=lambda pred: {'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))},
    )
    
    trainer.save_model("./results/saved_model"+save_name)
    trainer.train()
    eval_results = trainer.evaluate(test_dataset)
    print(eval_results)


In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)
print("Fine Tuning")
train_model(model,"model1")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

Fine Tuning


Saving model checkpoint to ./results/saved_modelmodel1
Configuration saved in ./results/saved_modelmodel1\config.json
Model weights saved in ./results/saved_modelmodel1\pytorch_model.bin
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7815
  Number of trainable parameters = 66955010


Step,Training Loss,Validation Loss


In [None]:

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

# TO freeze the layers other than the classifier which is the last
for layer_name, param in model.named_parameters():
    if 'classifier' not in layer_name:  
        param.requires_grad = False
        
for layer_name, param in model.named_parameters():
    if param.requires_grad:
        print(layer_name," layer is not frozen")
    else:
        print(layer_name," layer is frozen")

print("Linear Probing")
train_model(model,"model2")