[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/NLP/blob/main/Sentence-Level-Classification/In_Context_Learning_Movie_Review_Classification.ipynb)

In [13]:
# !pip install transformers # to get the finetuned models from hugging-face hub
# !pip install datasets

In [2]:
from collections import defaultdict, Counter
import json
from matplotlib import pyplot as plt
import numpy as np
import torch

## Input

In [3]:
inputs = "I'm excited to learn about Hugging Face Transformers!"

## Tokenization : Text2Numeric

In [5]:
# Initialize the tokenizer
from transformers import DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer

# Method 1 :
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")      # written in Python
print(tokenizer)

# Method 2 :
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")  # written in Rust
print(tokenizer)

# Method 3 : (most used)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") # Defaults to Fast
print(tokenizer)

DistilBertTokenizer(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padd

In [15]:
print(f'''Start :             {inputs}''')

input_tokens = tokenizer.tokenize(inputs)
print(f'''Tokenize :          {input_tokens}''')

input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
start_token = tokenizer.cls_token_id
end_token = tokenizer.sep_token_id
print(f'''Input Ids :         [{start_token}]{input_ids}[{end_token}]''')

tokenized_inputs = tokenizer(inputs, return_tensors="pt")
print(f'''Tokens :            {tokenized_inputs}''')

decoded_str = tokenizer.decode(input_ids)
print(f'''Decoded :           {decoded_str}''')

Start :             I'm excited to learn about Hugging Face Transformers!
Tokenize :          ['I', "'", 'm', 'excited', 'to', 'learn', 'about', 'Hu', '##gging', 'Face', 'Transformers', '!']
Input Ids :         [101][146, 112, 182, 7215, 1106, 3858, 1164, 20164, 10932, 10289, 25267, 106][102]
Tokens :            {'input_ids': tensor([[  101,   146,   112,   182,  7215,  1106,  3858,  1164, 20164, 10932,
         10289, 25267,   106,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded :           I'm excited to learn about Hugging Face Transformers!


In [16]:
input_t = tokenizer._tokenizer.encode(inputs)
print(f"Number of tokens:       {len(input_t)}")
print(f"Ids:                    {input_t.ids}")
print(f"Tokens:                 {input_t.tokens}")
print(f"Special tokens mask:    {input_t.special_tokens_mask}")

Number of tokens:       14
Ids:                    [101, 146, 112, 182, 7215, 1106, 3858, 1164, 20164, 10932, 10289, 25267, 106, 102]
Tokens:                 ['[CLS]', 'I', "'", 'm', 'excited', 'to', 'learn', 'about', 'Hu', '##gging', 'Face', 'Transformers', '!', '[SEP]']
Special tokens mask:    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [18]:
# if you are adding padding to your question then it will be represented as 0 in the numeric form
print (f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")

Pad token: [PAD] | Pad token id: 0


In [20]:
# BATCH ENCODING : pass multiple strings into the tokenizer and pad them as you need
model_inputs = tokenizer(
                          [ "Hugging Face Transformers is great!",
                            "The quick brown fox jumps over the lazy dog.",
                            "Then the dog got up and ran away because she didn't like foxes."],
                          return_tensors="pt",
                          padding=True,
                          truncation=True)

print(model_inputs)

{'input_ids': tensor([[  101, 20164, 10932, 10289, 25267,  1110,  1632,   106,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1109,  3613,  3058, 17594, 15457,  1166,  1103, 16688,  3676,
           119,   102,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1599,  1103,  3676,  1400,  1146,  1105,  1868,  1283,  1272,
          1131,  1238,   112,   189,  1176, 17594,  1279,   119,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [21]:
# BATCH DECODING : Similary you can also do batch decoding
print(tokenizer.batch_decode(model_inputs.input_ids))
print()
print( "Batch Decode: (no special characters)")
print(tokenizer.batch_decode(model_inputs.input_ids, skip_spetial_tokens=True))

['[CLS] Hugging Face Transformers is great! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] The quick brown fox jumps over the lazy dog. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', "[CLS] Then the dog got up and ran away because she didn't like foxes. [SEP]"]

Batch Decode: (no special characters)
['[CLS] Hugging Face Transformers is great! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] The quick brown fox jumps over the lazy dog. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', "[CLS] Then the dog got up and ran away because she didn't like foxes. [SEP]"]


## Passing Input to Model

In [23]:
# initialize the model
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification

# Method 1 :
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)

# Method 2 :
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Initialize the model which will take above numerics as input
outputs = model(**tokenized_inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1370, -0.0432]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Sentiment

In [25]:
labels = ['NEGATIVE', 'POSITIVE']
prediction = torch.argmax(outputs.logits)
print (f'''Sentiment : {labels[prediction]}''')

Sentiment : POSITIVE
