[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/NLP/blob/main/Sentence-Level-Classification/In_Context_Learning_Movie_Review_Classification.ipynb)

## Installing Libraries

In [1]:
!pip install transformers # to get the finetuned models from hugging-face hub
!pip install datasets

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Col

## Importing Libraries

In [2]:
from collections import defaultdict, Counter
import json
from matplotlib import pyplot as plt
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Input

In [10]:
# Sample sentence used by the cells below: it is tokenized step by step and then
# passed to the sentiment classifier.
inputs = "I'm excited to learn about Hugging Face Transformers!"

## Tokenization : Text2Numeric

In [33]:
# Step-by-step tour of tokenization for the sample sentence `inputs`:
# raw text -> sub-word tokens -> vocabulary ids -> model-ready tensors -> text again.
print(f'''Start :             {inputs}''')

# Load the tokenizer that belongs to the checkpoint used for the model later on.
tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

# 1) Split the text into sub-word tokens (no special tokens are added here).
input_tokens = tokenizer.tokenize(inputs)
print(f'''Tokenize :          {input_tokens}''')

# 2) Map every token to its vocabulary id. `convert_tokens_to_ids` does not add the
#    sequence start/end markers, so they are looked up separately and shown around
#    the list to mirror what the full tokenizer call produces in step 3.
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
start_token = tokenizer.convert_tokens_to_ids('<s>')
end_token = tokenizer.convert_tokens_to_ids('</s>')
print(f'''Input Ids :         [{start_token}]{input_ids}[{end_token}]''')

# 3) The one-call version: tokenizes, adds the special tokens and returns PyTorch
#    tensors (`input_ids` + `attention_mask`) that can be passed straight to the model.
tokenized_inputs = tokenizer(inputs, return_tensors="pt")
print(f'''Tokens :            {tokenized_inputs}''')

# 4) Ids -> text. `input_ids` holds no special tokens, so the sentence comes back clean.
decoded_str = tokenizer.decode(input_ids)
print(f'''Decoded :           {decoded_str}''')

# Three blank lines to separate the low-level view from the walkthrough above.
print("\n\n")

# Low-level view of the same encoding. Integer-indexing the BatchEncoding returns the
# backend `tokenizers.Encoding` of that batch item (fast tokenizers only), so there is
# no need to reach into the private `tokenizer._tokenizer` attribute or to tokenize
# the sentence a second time.
input_t = tokenized_inputs[0]
print(f"Number of tokens:       {len(input_t)}")
print(f"Ids:                    {input_t.ids}")
print(f"Tokens:                 {input_t.tokens}")
# 1 marks a special token (sequence start/end), 0 a token that came from the text.
print(f"Special tokens mask:    {input_t.special_tokens_mask}")

Start :             I'm excited to learn about Hugging Face Transformers!
Tokenize :          ['I', "'m", 'Ġexcited', 'Ġto', 'Ġlearn', 'Ġabout', 'ĠHug', 'ging', 'ĠFace', 'ĠTransformers', '!']
Input Ids :         [0][100, 437, 2283, 7, 1532, 59, 30581, 3923, 12346, 34379, 328][2]
Tokens :            {'input_ids': tensor([[    0,   100,   437,  2283,     7,  1532,    59, 30581,  3923, 12346,
         34379,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded :           I'm excited to learn about Hugging Face Transformers!



Number of tokens:       13
Ids:                    [0, 100, 437, 2283, 7, 1532, 59, 30581, 3923, 12346, 34379, 328, 2]
Tokens:                 ['<s>', 'I', "'m", 'Ġexcited', 'Ġto', 'Ġlearn', 'Ġabout', 'ĠHug', 'ging', 'ĠFace', 'ĠTransformers', '!', '</s>']
Special tokens mask:    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [46]:
# Padding tokens inserted by the tokenizer are encoded with the pad token id
# (printed below), and the matching attention-mask entries are 0.
print (f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")
print()
print()

# Several strings can be passed to the tokenizer at once; `padding=True` pads every
# sequence to the longest one in the batch so they fit in a single tensor, and
# `truncation=True` cuts sequences that exceed the model's maximum length.
print('Batch Encoding')
sentences = [
    "Hugging Face Transformers is great!",
    "The quick brown fox jumps over the lazy dog.",
    "Then the dog got up and ran away because she didn't like foxes.",
]
model_inputs = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

print(model_inputs)
print()
print()

# Similarly, a whole batch of id sequences can be decoded back to text in one call.
print('Batch Decoding')
print(tokenizer.batch_decode(model_inputs.input_ids))
print()
print( "Batch Decode: (no special characters)")
# The keyword must be spelled `skip_special_tokens`; a misspelled name is not
# recognised, so <s>, </s> and <pad> would still appear in the decoded text.
print(tokenizer.batch_decode(model_inputs.input_ids, skip_special_tokens=True))

Pad token: <pad> | Pad token id: 1


Batch Encoding
{'input_ids': tensor([[    0, 40710,  3923, 12346, 34379,    16,   372,   328,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1],
        [    0,   133,  2119,  6219, 23602, 13855,    81,     5, 22414,  2335,
             4,     2,     1,     1,     1,     1,     1,     1],
        [    0, 12948,     5,  2335,   300,    62,     8,  2075,   409,   142,
            79,   399,    75,   101, 23602,   293,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


Batch Decoding
['<s>Hugging Face Transformers is great!</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '<s>The quick brown fox jumps over the lazy dog.</s><pad><pad><pad><pad><pad><pad>', "<s>Then the dog got up and ran away because she didn't like foxes.</s>"]

Batch Decode: (n

## Passing Input to Model

In [47]:
# Load the sequence-classification checkpoint that matches the tokenizer above and
# run the tokenized sample sentence through it.
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")
model.eval()  # make inference mode explicit (dropout disabled)

# This is pure inference: no_grad() skips building the autograd graph, which saves
# memory and removes the `grad_fn` from the returned logits.
with torch.no_grad():
    outputs = model(**tokenized_inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.7605,  2.9262]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Sentiment

In [None]:
# Human-readable class names indexed by class id.
# NOTE(review): assumes logit column 0 is NEGATIVE and column 1 is POSITIVE for this
# checkpoint — confirm against `model.config.id2label`.
labels = ['NEGATIVE', 'POSITIVE']

# Take the argmax over the class dimension (not over the flattened tensor, which is
# only correct for a batch of one), pick the single sentence in the batch and
# convert it to a plain int so it can index the Python list.
prediction = outputs.logits.argmax(dim=-1)[0].item()
print (f'''Sentiment : {labels[prediction]}''')