In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

from transformers import pipeline
from tqdm import tqdm

In [2]:
# Load the merged news export and build the text field that gets sentiment-scored.
news_raw_df = pd.read_csv('../data/news_merged.csv')

# Missing titles/snippets become empty strings so they can be concatenated.
title_text = news_raw_df['title'].fillna("").str.strip()
snippet_text = news_raw_df['snippet'].fillna("").str.strip()

# Only insert the ". " separator when BOTH parts are present. Adding it
# unconditionally turns a row with no text into "." (so an "is it empty?"
# filter can never match) and leaves a stray leading/trailing separator on
# rows that have only a title or only a snippet.
has_both_parts = (title_text != "") & (snippet_text != "")
separator = has_both_parts.map({True: ". ", False: ""})
combined_text = title_text + separator + snippet_text

# Drop rows that have no text at all. The original row index is kept (not
# reset). .copy() makes the result an independent frame so that adding columns
# to it later does not trigger SettingWithCopyWarning.
news_df = news_raw_df.assign(combined_text=combined_text)
news_df = news_df[news_df["combined_text"] != ""].copy()

In [4]:
# Build the sentiment classifier from the
# siebert/sentiment-roberta-large-english checkpoint (PyTorch framework).
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    framework="pt",
)

# Score every combined title/snippet string, 16 texts per batch, with
# truncation enabled for the tokenizer.
texts = news_df["combined_text"].tolist()
results = sentiment_pipeline(texts, batch_size=16, truncation=True)

# Each result exposes a "label" and a "score"; collect them in row order and
# attach them positionally as two new columns.
sentiment_labels = []
sentiment_scores = []
for prediction in results:
    sentiment_labels.append(prediction["label"])
    sentiment_scores.append(prediction["score"])

news_df["sentiment_label"] = sentiment_labels
news_df["sentiment_score"] = sentiment_scores

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Persist the scored articles (without the row index) so later steps can
# reload them from disk.
sentiment_csv_path = "../data/news_with_sentiment.csv"
news_df.to_csv(sentiment_csv_path, index=False)

# Preview the first rows: title next to its predicted label and score.
preview_columns = ["title", "sentiment_label", "sentiment_score"]
news_df[preview_columns].head()

Unnamed: 0,title,sentiment_label,sentiment_score
0,What Happens if There’s a Covid Outbreak at th...,POSITIVE,0.988315
1,This Conversation Will Change How You Think Ab...,POSITIVE,0.998378
2,‘The New Bauhaus’ Review: Rethinking an Approa...,POSITIVE,0.998725
3,Apple delays its return to office as the Delta...,NEGATIVE,0.999465
4,Can Apple’s AirTags Find Lost Pets?,POSITIVE,0.99287


In [17]:
# Display the scored frame (all columns, including the new sentiment_label /
# sentiment_score) as this cell's output.
news_df

Unnamed: 0,title,snippet,pub_date,web_url,section,source,year,month,title_word_count,snippet_word_count,combined_text,sentiment_label,sentiment_score
0,What Happens if There’s a Covid Outbreak at th...,Kara Swisher finds out why the 2020 Tokyo Game...,2021-07-19 09:00:10,https://www.nytimes.com/2021/07/19/opinion/swa...,Opinion,The New York Times,2021,7,10,26,What Happens if There’s a Covid Outbreak at th...,POSITIVE,0.988315
1,This Conversation Will Change How You Think Ab...,Modern work culture is built on a broken model...,2021-07-20 09:00:07,https://www.nytimes.com/2021/07/20/opinion/ezr...,Opinion,The New York Times,2021,7,9,12,This Conversation Will Change How You Think Ab...,POSITIVE,0.998378
2,‘The New Bauhaus’ Review: Rethinking an Approa...,This documentary on the interdisciplinary arti...,2021-07-20 11:00:05,https://www.nytimes.com/2021/07/20/movies/the-...,Movies,The New York Times,2021,7,9,18,‘The New Bauhaus’ Review: Rethinking an Approa...,POSITIVE,0.998725
3,Apple delays its return to office as the Delta...,Employees are now expected to come back to the...,2021-07-20 15:42:40,https://www.nytimes.com/2021/07/20/technology/...,Technology,The New York Times,2021,7,11,19,Apple delays its return to office as the Delta...,NEGATIVE,0.999465
4,Can Apple’s AirTags Find Lost Pets?,We look at the pros and cons of using Apple’s ...,2021-07-20 16:06:15,https://www.nytimes.com/2021/07/20/technology/...,Technology,The New York Times,2021,7,6,20,Can Apple’s AirTags Find Lost Pets?. We look a...,POSITIVE,0.992870
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4317,Jimmy Fallon Fans the Flames of Burning MAGA Hats,"People torched the hats in videos, apparently ...",2025-07-16 06:42:23,https://www.nytimes.com/2025/07/16/arts/televi...,Arts,The New York Times,2025,7,9,29,Jimmy Fallon Fans the Flames of Burning MAGA H...,NEGATIVE,0.997346
4318,"How to Keep Love Alive, With Rob Delaney of ‘D...",The Emmy-nominated actor talks about his role ...,2025-07-16 09:00:29,https://www.nytimes.com/2025/07/16/podcasts/ro...,Podcasts,The New York Times,2025,7,12,22,"How to Keep Love Alive, With Rob Delaney of ‘D...",POSITIVE,0.998707
4319,Project 2025’s Other Project,Inside the plan from the Heritage Foundation t...,2025-07-16 10:00:10,https://www.nytimes.com/2025/07/16/podcasts/th...,Podcasts,The New York Times,2025,7,4,16,Project 2025’s Other Project. Inside the plan ...,NEGATIVE,0.991663
4320,"Tariffs Push Prices Up, and the Supreme Court’...","Plus, why that e-book cost your library $50.",2025-07-16 10:00:12,https://www.nytimes.com/2025/07/16/podcasts/th...,Podcasts,The New York Times,2025,7,10,8,"Tariffs Push Prices Up, and the Supreme Court’...",NEGATIVE,0.997372


In [18]:
# List the distinct values of the 'section' column, in order of first
# appearance, to decide which sections to keep.
section_column = news_df['section']
section_column.unique()

array(['Opinion', 'Movies', 'Technology', 'Business', 'U.S.', 'Magazine',
       'World', 'Arts', 'Today’s Paper', 'Sports', 'Corrections',
       'Gameplay', 'Podcasts', 'Well', 'Theater', 'Food',
       'The Learning Network', 'Books', 'Briefing', 'New York', 'At Home',
       'Fashion', 'T Magazine', 'Style', 'Travel', 'Smarter Living',
       'Your Money', 'Science', 'Climate', 'Real Estate', 'Health',
       'Times Insider', 'Obituaries', 'Special Series', 'The Upshot',
       'Multimedia/Photos', 'Weather'], dtype=object)

In [19]:
# Sections excluded from the filtered dataset.
sections_to_drop = [
    'Arts',
    'Opinion',
    'Sports',
    'Corrections',
    'Gameplay',
    'Well',
    'Theater',
    'Food',
    'The Learning Network',
    'Fashion',
    'Style',
    'Travel',
    'Real Estate',
    'Health',
    'Obituaries',
    'Special Series',
    'The Upshot',
    'Multimedia/Photos',
    'Weather',
]

# Keep only rows whose section is NOT in the exclusion list. .copy() gives an
# independent frame; the original row index is preserved.
is_kept_section = ~news_df['section'].isin(sections_to_drop)
filtered_news_df = news_df[is_kept_section].copy()

In [21]:
# Display the section-filtered frame as this cell's output; its index still
# carries the row labels of the unfiltered frame, so gaps are expected.
filtered_news_df

Unnamed: 0,title,snippet,pub_date,web_url,section,source,year,month,title_word_count,snippet_word_count,combined_text,sentiment_label,sentiment_score
2,‘The New Bauhaus’ Review: Rethinking an Approa...,This documentary on the interdisciplinary arti...,2021-07-20 11:00:05,https://www.nytimes.com/2021/07/20/movies/the-...,Movies,The New York Times,2021,7,9,18,‘The New Bauhaus’ Review: Rethinking an Approa...,POSITIVE,0.998725
3,Apple delays its return to office as the Delta...,Employees are now expected to come back to the...,2021-07-20 15:42:40,https://www.nytimes.com/2021/07/20/technology/...,Technology,The New York Times,2021,7,11,19,Apple delays its return to office as the Delta...,NEGATIVE,0.999465
4,Can Apple’s AirTags Find Lost Pets?,We look at the pros and cons of using Apple’s ...,2021-07-20 16:06:15,https://www.nytimes.com/2021/07/20/technology/...,Technology,The New York Times,2021,7,6,20,Can Apple’s AirTags Find Lost Pets?. We look a...,POSITIVE,0.992870
5,Biden to Name a Critic of Big Tech as the Top ...,"Jonathan Kanter, a longtime antitrust lawyer, ...",2021-07-20 18:04:21,https://www.nytimes.com/2021/07/20/business/ka...,Business,The New York Times,2021,7,13,22,Biden to Name a Critic of Big Tech as the Top ...,POSITIVE,0.985043
6,The F.T.C. votes to use its leverage to make i...,An advocacy group for technology companies cri...,2021-07-22 02:02:59,https://www.nytimes.com/2021/07/21/us/politics...,U.S.,The New York Times,2021,7,17,20,The F.T.C. votes to use its leverage to make i...,NEGATIVE,0.997571
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4316,Tariffs Begin to Leave an Imprint on the Economy,"Also, Republicans in Congress followed Trump’s...",2025-07-15 21:41:05,https://www.nytimes.com/2025/07/15/briefing/ta...,Briefing,The New York Times,2025,7,9,17,Tariffs Begin to Leave an Imprint on the Econo...,NEGATIVE,0.990673
4318,"How to Keep Love Alive, With Rob Delaney of ‘D...",The Emmy-nominated actor talks about his role ...,2025-07-16 09:00:29,https://www.nytimes.com/2025/07/16/podcasts/ro...,Podcasts,The New York Times,2025,7,12,22,"How to Keep Love Alive, With Rob Delaney of ‘D...",POSITIVE,0.998707
4319,Project 2025’s Other Project,Inside the plan from the Heritage Foundation t...,2025-07-16 10:00:10,https://www.nytimes.com/2025/07/16/podcasts/th...,Podcasts,The New York Times,2025,7,4,16,Project 2025’s Other Project. Inside the plan ...,NEGATIVE,0.991663
4320,"Tariffs Push Prices Up, and the Supreme Court’...","Plus, why that e-book cost your library $50.",2025-07-16 10:00:12,https://www.nytimes.com/2025/07/16/podcasts/th...,Podcasts,The New York Times,2025,7,10,8,"Tariffs Push Prices Up, and the Supreme Court’...",NEGATIVE,0.997372


In [23]:
# Export the section-filtered, sentiment-scored articles.
# index=False: the frame's index is just the leftover row labels from the
# unfiltered frame (with gaps). Writing it would add an unnamed leading column
# that comes back as "Unnamed: 0" when the file is read again.
filtered_news_df.to_csv('../data/filtered_news_with_sentiment.csv', index=False)