# Chapter 4: Text Classification - Solutions

Concise solutions for all tasks (Easy, Medium, Hard).

## Setup

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import numpy as np

# Resource identifiers are named once so they are easy to swap.
DATASET_NAME = "rotten_tomatoes"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# `data` is the loaded dataset; `model` is the sentence encoder whose
# `.encode()` is used by the embedding-based tasks below.
data = load_dataset(DATASET_NAME)
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

## Easy Task 1: Zero-Shot Classification

In [None]:
test_reviews = [
    "This movie was absolutely fantastic! A masterpiece!",
    "Terrible waste of time. Very disappointing.",
    "An okay film, nothing special but watchable.",
]

# Candidate classes expressed as natural-language descriptions
# (column 0 = negative, column 1 = positive).
labels = ["A negative movie review", "A positive movie review"]

# Embed labels and reviews with the same encoder, then score every review
# (rows) against every label (columns).
label_embeddings = model.encode(labels)
review_embeddings = model.encode(test_reviews)
sim_matrix = cosine_similarity(review_embeddings, label_embeddings)

print("Classification Results:")
for i, (review, scores) in enumerate(zip(test_reviews, sim_matrix)):
    # The predicted class is the label with the highest cosine similarity.
    prediction = np.argmax(scores)
    # "Confidence" is the raw similarity score, not a calibrated probability.
    confidence = scores[prediction]
    # Gap between the two label scores; a small gap means an ambiguous review.
    margin = abs(scores[0] - scores[1])
    print(f"\nReview {i+1}: '{review}'")
    print(f"Predicted: {labels[prediction]}")
    print(f"Confidence: {confidence:.3f}")
    print(f"Margin: {margin:.3f}")

## Easy Task 2: Classifier Strategy

In [None]:
# Ground truth: five negatives (0) followed by five positives (1).
y_true = np.repeat([0, 1], 5)
# Conservative: flags only the last three samples as positive.
classifier_conservative = np.array([0] * 7 + [1] * 3)
# Aggressive: flags everything except the first sample as positive.
classifier_aggressive = np.array([0] + [1] * 9)
# Balanced: one false positive (index 3) and one false negative (index 5).
classifier_balanced = np.array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1])

def analyze_classifier(name, y_true, y_pred):
    """Print and return precision, recall and F1 for one set of predictions.

    Args:
        name: Label printed on its own line above the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, scored against ``y_true``.

    Returns:
        A ``(precision, recall, f1)`` tuple as returned by the scikit-learn
        scoring functions.
    """
    # zero_division=0 makes an undefined metric (zero denominator) come back
    # as 0 instead of triggering a warning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"\n{name}")
    print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
    return precision, recall, f1

# Map each strategy's display name to its prediction vector.
strategies = {
    "Conservative": classifier_conservative,
    "Aggressive": classifier_aggressive,
    "Balanced": classifier_balanced,
}

# results[name] -> (precision, recall, f1) for that strategy.
results = {}
for name, clf in strategies.items():
    p, r, f = analyze_classifier(name, y_true, clf)
    results[name] = (p, r, f)

## Easy Task 3: Temperature Effects

In [None]:
# Toy next-token distribution, listed as (token, probability) pairs so each
# token stays next to its probability.
token_probs = [
    ("positive", 0.50),
    ("negative", 0.30),
    ("neutral", 0.12),
    ("good", 0.05),
    ("bad", 0.03),
]
original_probs = np.array([prob for _, prob in token_probs])
tokens = [token for token, _ in token_probs]

def apply_temperature(probs, temperature):
    """Rescale a probability distribution with a sampling temperature.

    The probabilities are converted to logits, divided by ``temperature`` and
    renormalised with a softmax. Values below 1 sharpen the distribution,
    values above 1 flatten it, and 1 leaves it (numerically almost) unchanged.

    Args:
        probs: Array-like of probabilities (a NumPy array or a plain sequence).
        temperature: Scaling factor. ``0`` is treated as the greedy limit.

    Returns:
        A float NumPy array with the same shape as ``probs``. For
        ``temperature == 0`` it is one-hot at the most likely entry. An empty
        input yields an empty array.
    """
    probs = np.asarray(probs, dtype=float)
    if probs.size == 0:
        # Nothing to rescale; avoids argmax/max errors on empty input.
        return probs.copy()

    if temperature == 0:
        # Dividing by zero is undefined, so use the T -> 0 limit: all mass on
        # the arg-max entry.
        result = np.zeros_like(probs)
        result[np.argmax(probs)] = 1.0
        return result

    # The small epsilon keeps log() finite for zero-probability entries.
    scaled_logits = np.log(probs + 1e-10) / temperature
    # Shifting by the maximum does not change the softmax but keeps exp()
    # from underflowing to all zeros (0/0 -> NaN) at very small temperatures.
    scaled_logits -= np.max(scaled_logits)
    exp_logits = np.exp(scaled_logits)
    return exp_logits / np.sum(exp_logits)

# 0 = greedy, below 1 = sharper, 1 = unchanged, above 1 = flatter.
temperatures = [0, 0.5, 1.0, 2.0]

for temp in temperatures:
    new_probs = apply_temperature(original_probs, temp)
    # Format one aligned "token: probability" row per token, then print the
    # whole table under its temperature header.
    rows = [f"  {token:10s}: {new_probs[i]:.3f}" for i, token in enumerate(tokens)]
    print(f"\nTemperature = {temp}")
    print("\n".join(rows))

## Easy Task 4: Embedding Similarity

In [None]:
texts = [
    # Positive reviews
    "Amazing movie! Absolutely loved it!",
    "Fantastic film, highly recommend!",
    "Great cinematography and acting",
    # Negative reviews
    "Terrible waste of time",
    "Very disappointing and boring",
    "Poor acting and weak plot",
    # Mixed / neutral reviews
    "It was okay, nothing special",
    "Some good parts, some bad parts",
    # Unrelated sentences
    "The weather is nice today",
    "I like eating pizza"
]

# One embedding row per text; with a single argument cosine_similarity
# compares every row against every other row.
embeddings = model.encode(texts)
similarity_matrix = cosine_similarity(embeddings)

embedding_dim = embeddings.shape[1]
print(f"Each text: {embedding_dim}-dimensional vector")
# Text 1 vs. another positive review, then Text 1 vs. a negative review.
print(f"\nSimilarity between Text 1 and Text 2: {similarity_matrix[0, 1]:.3f}")
print(f"Similarity between Text 1 and Text 4: {similarity_matrix[0, 3]:.3f}")

# Collect every other text whose similarity to the anchor text exceeds 0.5.
positive_idx = 0
anchor_row = similarity_matrix[positive_idx]
similar_to_positive = [
    (i, score)
    for i, score in enumerate(anchor_row)
    if i != positive_idx and score > 0.5
]

# Report the matches from most to least similar.
ranked = sorted(similar_to_positive, key=lambda x: x[1], reverse=True)
print(f"\nTexts similar to '{texts[positive_idx]}':")
for idx, sim in ranked:
    print(f"  Text {idx+1} (sim={sim:.3f}): '{texts[idx]}'")