In [1]:
# 09c: Sentiment Scoring with DistilBERT (PyTorch-only)
# Step 1: Disable TensorFlow to avoid loading TF backends.
# These variables have to be set before `transformers` is imported for the
# first time, which is why this cell comes before the import cell.
import os

# Flag originally used by this notebook; kept so existing behaviour is unchanged.
os.environ["TRANSFORMERS_NO_TF"] = "1"
# NOTE(review): Transformers' backend detection appears to key off USE_TF /
# USE_TORCH rather than TRANSFORMERS_NO_TF; setting USE_TF=0 as well makes sure
# the TensorFlow backend is skipped -- confirm against the installed version.
os.environ["USE_TF"] = "0"

In [2]:
# Step 2: Imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

In [3]:
# Step 3: Load filtered news dataset
# Register tqdm's pandas hooks so that `progress_apply` is available
# when the headlines are scored later in the notebook.
tqdm.pandas()

# Read the pre-filtered news CSV into the working frame.
df = pd.read_csv('../data/processed/filtered_energy_news.csv')

In [4]:
# Step 4: Load PyTorch model and tokenizer
# One checkpoint identifier feeds both loaders, so the tokenizer and the
# classification head are guaranteed to come from the same checkpoint.
MODEL_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [5]:
# Step 5: Define sentiment classification function (PyTorch-only)
def classify_sentiment(text):
    """Classify one text with the notebook-level `tokenizer` and `model`.

    Args:
        text: Input handed to the tokenizer (the notebook passes one
            headline string per call). Over-long input is truncated by the
            tokenizer because `truncation=True` is set.

    Returns:
        pd.Series with two positional entries: the predicted label name
        looked up in `model.config.id2label`, and the softmax probability
        of that label as a Python float.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Pure inference: no gradients are needed for the forward pass.
    with torch.no_grad():
        class_probs = F.softmax(model(**encoded).logits, dim=1)

    # Highest-probability class along the label axis, plus its probability.
    top_prob, top_idx = class_probs.max(dim=1)
    return pd.Series([model.config.id2label[top_idx.item()], top_prob.item()])

In [6]:
# Step 6: Apply to headlines
# Score only rows that really have a headline. Casting a missing value with
# `astype(str)` can yield the literal text "nan", which the model would then
# score as if it were a real headline.
has_headline = df['headline'].notna()

# Start from "no result" so rows without a headline stay empty in the output.
df['sentiment'] = None
df['sentiment_score'] = float('nan')

if has_headline.any():
    scored = (
        df.loc[has_headline, 'headline']
        .astype(str)
        .progress_apply(classify_sentiment)
    )
    # classify_sentiment returns an unnamed 2-item Series, so the expanded
    # frame has positional columns 0 (label) and 1 (confidence).
    # `.to_numpy()` assigns by position, so index labels never need to align.
    df.loc[has_headline, 'sentiment'] = scored[0].to_numpy()
    df.loc[has_headline, 'sentiment_score'] = scored[1].to_numpy()

100%|██████████| 9388/9388 [06:52<00:00, 22.78it/s]


In [7]:
# Step 7: Save results
# Write the scored frame to the processed-data folder; the row index is
# left out of the CSV.
df.to_csv(
    path_or_buf='../data/processed/filtered_energy_news_with_sentiment.csv',
    index=False,
)
print("✅ Sentiment scoring complete and saved to filtered_energy_news_with_sentiment.csv")

✅ Sentiment scoring complete and saved to filtered_energy_news_with_sentiment.csv
