In [1]:
# BEHIND THE PIPELINE: SENTIMENT ANALYSIS
#Sentiment analysis is a natural language processing (NLP) technique used to determine 
# the emotional tone behind a piece of text, classifying it as positive, negative, 
# or neutral.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so the vocabulary and the weights match.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.eval()  # put the model in evaluation mode before running inference

text = "I expected more from this course, it felt rushed and incomplete."

# A single string is encoded as a batch of one; return_tensors="pt" yields
# PyTorch tensors that can be unpacked straight into the model call.
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():  # no gradients are needed for a forward-only pass
    logits = model(**inputs).logits

# Normalize along the last (class) axis so each row sums to 1.
probabilities = torch.softmax(logits, dim=-1)

# Reduce along the class axis instead of over the flattened tensor.
# A bare .argmax()/.max() returns a flat index / global maximum, which is only
# a valid class id while the batch holds exactly one row.
top_probabilities, top_class_ids = probabilities.max(dim=-1)
predicted_class_id = top_class_ids[0].item()  # row 0 = the only input text
confidence = top_probabilities[0].item()

# Map the numeric class id to the label name stored in the model config.
label = model.config.id2label[predicted_class_id]

print(f"Prediction: {label}")
print(f"Confidence: {confidence:.4f}")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Prediction: NEGATIVE
Confidence: 0.9997


In [1]:
#Inspect and Compare Model Internals
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint whose outputs are inspected in this cell.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# output_hidden_states=True makes the forward pass expose `hidden_states`
# on the returned object in addition to the logits.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    output_hidden_states=True,
)
model.eval()

text = "The movie was fantastic! I really enjoyed it."
inputs = tokenizer(text, return_tensors="pt")

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

logits, hidden_states = outputs.logits, outputs.hidden_states

# Collect the shape of every returned hidden state up front so the report
# below is a plain sequence of prints.
layer_shapes = [layer.shape for layer in hidden_states]

print(f"Logits: {logits}")
print(f"Number of hidden states: {len(hidden_states)}")
print(f"Shape of each hidden state: {layer_shapes}")



Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Logits: tensor([[-4.3398,  4.6898]])
Number of hidden states: 7
Shape of each hidden state: [torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768])]


In [2]:
# Compare Tokenization Strategies + Full Encode–Decode Cycle
#I will:
# Load a real tokenizer
# Tokenize tricky text
# Inspect tokens
# Convert tokens -> IDs
# Decode back to text
# Observe subword behavior

import torch
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

text = "hello bhaiya kaise ho aap log"

# --- Compute every stage of the round trip first ---------------------------
# 1) Subword split only: returns the token strings, no IDs.
tokens = tokenizer.tokenize(text)

# 2) Full encoding: calling the tokenizer returns a mapping whose "input_ids"
#    entry holds the integer IDs (the recorded run shows them wrapped in
#    101 ... 102, which decode to [CLS] ... [SEP]).
encoding = tokenizer(text)
input_ids = encoding["input_ids"]

# 3) Turn the IDs back into a string.
decoded_text = tokenizer.decode(input_ids)

# --- Report, in the same order the stages were computed --------------------
print(f"Tokens: {tokens}")
print(f"Input IDs: {input_ids}")
print(decoded_text)

Tokens: ['hello', 'b', '##hai', '##ya', 'kai', '##se', 'ho', 'aa', '##p', 'log']
Input IDs: [101, 7592, 1038, 10932, 3148, 11928, 3366, 7570, 9779, 2361, 8833, 102]
[CLS] hello bhaiya kaise ho aap log [SEP]
