In [2]:
import warnings
warnings.filterwarnings('ignore', message='Token indices sequence length is longer than the specified maximum sequence length for this model')

# Install required libraries
!pip install transformers datasets torch

# Import libraries
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# Load pre-trained BERT sentiment model
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
classifier = pipeline("sentiment-analysis", model=model_name)

# Test some example sentences
samples = [
    "This internship opportunity at KAIST is exciting and inspiring!",
    "The implementation was difficult and confusing."
]

print("\n--- Sample Predictions ---")
for s in samples:
    # Truncate input to max_length 512 if it's longer
    result = classifier(s, max_length=512, truncation=True)[0]
    print(f"Text: {s}\nSentiment: {result['label']} | Score: {result['score']:.4f}\n")

# Evaluate on IMDB dataset (first 100 samples to keep it fast)
dataset = load_dataset("imdb", split="test[:100]")

correct = 0
for text, label in zip(dataset["text"], dataset["label"]):
    # Truncate input to max_length 512 if it's longer
    pred = classifier(text, max_length=512, truncation=True)[0]["label"]

    # Model returns 1-5 stars, convert to positive/negative
    pred_label = 1 if int(pred[0]) >= 3 else 0  # 3-5 stars positive, 1-2 negative

    if pred_label == label:
        correct += 1

accuracy = correct / len(dataset)
print(f"\n✅ Accuracy on 100 IMDB test samples: {accuracy:.2%}")



Device set to use cpu



--- Sample Predictions ---
Text: This internship opportunity at KAIST is exciting and inspiring!
Sentiment: 5 stars | Score: 0.7196

Text: The implementation was difficult and confusing.
Sentiment: 2 stars | Score: 0.5274


✅ Accuracy on 100 IMDB test samples: 60.00%
