In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec

Scraping today's news headlines

In [None]:
# Function to scrape headlines from a given URL
def scrape_headlines(url, tag_name, class_name):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        headlines = soup.find_all(tag_name, class_=class_name)
        return [headline.text.strip() for headline in headlines]
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return []

# MSNBC
msnbc_url = 'https://www.msnbc.com/'
msnbc_headlines = scrape_headlines(msnbc_url, 'h3', 'styles_headline__vGca_') #Working

# BBC
bbc_url = 'https://www.bbc.com/'
bbc_headlines = scrape_headlines(bbc_url, 'h3', 'media__title') #Working

# NPR
npr_url = 'https://www.npr.org/'
npr_headlines = scrape_headlines(npr_url, 'h3', 'title') #Working

# Print headlines
print("MSNBC Headlines:")
for i, headline in enumerate(msnbc_headlines, start=1):
    print(f"{i}. {headline}")

print("\nBBC Headlines:")
for i, headline in enumerate(bbc_headlines, start=1):
    print(f"{i}. {headline}")

print("\nNPR Headlines:")
for i, headline in enumerate(npr_headlines, start=1):
    print(f"{i}. {headline}")


MSNBC Headlines:
1. Why a city surrounded by water is suddenly in a panic over its drinking supply
2. The SCOTUS myth that's being disproved by the Sen. Menendez indictment
3. Biden knew he had to get to striking autoworkers before Trump
4. Moms are driving our economic recovery. Why is Congress pushing them over a cliff?
5. Deion Sanders is winning at Colorado. That's not why people are mad.
6. This was indicted Senator Bob Menendez's biggest mistake
7. The 3 most destructive world events of my lifetime all have one man in common
8. Free Covid tests are back to remind us what good governance looks like
9. John Fetterman shaved and he’s speaking better. Cue the conspiracy theories.
10. Before you read Cassidy Hutchinson's book 'Enough,' remember her bravery

BBC Headlines:
1. Travis King in US custody after North Korea expulsion
2. Kevin McCarthy's job on the line as shutdown looms
3. Scientists closer to solving mystery of antimatter
4. A 3D map of an eerie asteroid
5. The US defector

**Sentiment Analysis**

Load data

In [None]:

csv_url = "http://ianvetter.cyou/sentiment_data.csv"
local_filename = "data.csv"
response = requests.get(csv_url)

if response.status_code == 200:
    with open(local_filename, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded {local_filename}")
else:
    print(f"Failed to download {csv_url}")

data = pd.read_csv('data.csv')
data.head()


Downloaded data.csv


(44626, 11)

Model  (LSTM regression into sentiment score)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec

headlines = data["Title"]
sentiments = data["SentimentTitle"]

# Tokenize and create a sequence of word indices for each headline
def tokenize_and_index(headlines, word_to_index, max_length):
    indexed_headlines = []
    for headline in headlines:
        tokens = headline.split()
        indexed_sequence = [word_to_index[token] if token in word_to_index else word_to_index['<UNK>'] for token in tokens]
        padding = [word_to_index['<PAD>']] * (max_length - len(indexed_sequence))
        indexed_headline = indexed_sequence + padding
        indexed_headlines.append(indexed_headline)
    return np.array(indexed_headlines)

# Create a vocabulary and word-to-index mapping
word_to_index = {'<PAD>': 0, '<UNK>': 1}
max_tokens = 0
for headline in headlines:
    tokens = headline.split()
    if (len(tokens) > max_tokens):
      max_tokens = len(tokens)
    for token in tokens:
        if token not in word_to_index:
            word_to_index[token] = len(word_to_index)


# Pad sequences to a fixed length
max_sequence_length = max_tokens + 2
X = tokenize_and_index(headlines, word_to_index, max_sequence_length)
y = np.array(sentiments, dtype=np.float32)  # Convert sentiments to float32

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a PyTorch dataset
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.int64)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, lin_dim, output_dim):
        super(SentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, lin_dim)
        self.fc2 = nn.Linear(lin_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        hidden, _ = self.rnn(embedded)
        output = self.fc1(hidden)
        output = self.fc2(output[:, -1, :])  # Get the last output of the sequence
        return output

# Define hyperparameters
vocab_size = len(word_to_index)
embedding_dim = 200
hidden_dim = 64
lin_dim = 32
output_dim = 1  # Output dimension for regression

learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Create the model
model = SentimentModel(vocab_size, embedding_dim, hidden_dim, lin_dim, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

losses = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        predictions = model(batch_X)
        loss = criterion(predictions.squeeze(), batch_y)  # Squeeze to match dimensions
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')
        losses.append(loss.item())


# Evaluation
model.eval()
with torch.no_grad():
    test_loss = 0
    for batch_X, batch_y in test_loader:
        predictions = model(batch_X)
        test_loss += criterion(predictions.squeeze(), batch_y).item()

    test_loss /= len(test_loader)

print(f'Mean Squared Error on the test set: {test_loss:.4f}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [7/10], Loss: 0.004609841387718916
Epoch [7/10], Loss: 0.0007749184733256698
Epoch [7/10], Loss: 0.0008217815193347633
Epoch [7/10], Loss: 0.0008388591813854873
Epoch [7/10], Loss: 0.0009889688808470964
Epoch [7/10], Loss: 0.0010835143039003015
Epoch [7/10], Loss: 0.0008212517714127898
Epoch [7/10], Loss: 0.0012982534244656563
Epoch [7/10], Loss: 0.0015791531186550856
Epoch [7/10], Loss: 0.0006726961582899094
Epoch [7/10], Loss: 0.0005622739554382861
Epoch [7/10], Loss: 0.0012378953397274017
Epoch [7/10], Loss: 0.0007135291234590113
Epoch [7/10], Loss: 0.0011850351002067327
Epoch [7/10], Loss: 0.0015186809469014406
Epoch [7/10], Loss: 0.0007442575297318399
Epoch [7/10], Loss: 0.00047873533912934363
Epoch [7/10], Loss: 0.001511965412646532
Epoch [7/10], Loss: 0.0006120912730693817
Epoch [7/10], Loss: 0.0005008046864531934
Epoch [7/10], Loss: 0.0017974975053220987
Epoch [7/10], Loss: 0.0015753802144899964
Epoch [7/10]

Test out on today's headlines!

In [1]:
input = torch.tensor(tokenize_and_index(bbc_headlines, word_to_index, max_sequence_length))
sentiment_scores = model(input)


print("\nNPR Headlines:")
for i, headline in enumerate(bbc_headlines, start=1):
    print(f"{i}. {headline}", "\n", " Sentiment Score: ", round(float(sentiment_scores[i-1]), 2), "\n")


NameError: ignored