In [1]:
# Import libraries
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, DistilBertTokenizer, DistilBertModel

# Sentiment classifier: transformers pipeline backed by the SST-2 fine-tuned DistilBERT checkpoint
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Base DistilBERT tokenizer and encoder, used by get_embeddings to turn text into vectors
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)
model = DistilBertModel.from_pretrained(
    'distilbert-base-uncased'
)

# S&P 500 tickers and company names (partial list, extend as needed)
# Maps ticker symbol -> company name. map_to_ticker walks this dict in insertion
# order and returns the first entry whose ticker or name matches the headline,
# so the order of entries decides which ticker wins when several could match.
# Note: the McDonald's and Lowe's names use a typographic apostrophe (U+2019).
sp500_tickers = {
    'AAPL': 'Apple Inc.',
    'MSFT': 'Microsoft Corporation',
    'AMZN': 'Amazon.com Inc.',
    'NVDA': 'Nvidia Corporation',
    'GOOGL': 'Alphabet Inc. Class A',
    'GOOG': 'Alphabet Inc. Class C',
    'TSLA': 'Tesla Inc.',
    'META': 'Meta Platforms Inc.',
    'BRK-B': 'Berkshire Hathaway Inc. Class B',
    'JPM': 'JPMorgan Chase & Co.',
    'WMT': 'Walmart Inc.',
    'V': 'Visa Inc.',
    'MA': 'Mastercard Incorporated',
    'UNH': 'UnitedHealth Group Incorporated',
    'XOM': 'Exxon Mobil Corporation',
    'PG': 'Procter & Gamble Co.',
    'COST': 'Costco Wholesale Corp.',
    'JNJ': 'Johnson & Johnson',
    'HD': 'Home Depot Inc.',
    'MRK': 'Merck & Co. Inc.',
    'LLY': 'Eli Lilly and Company',
    'AVGO': 'Broadcom Inc.',
    'BAC': 'Bank of America Corporation',
    'CVX': 'Chevron Corporation',
    'NFLX': 'Netflix Inc.',
    'ADBE': 'Adobe Inc.',
    'KO': 'Coca-Cola Company',
    'PEP': 'PepsiCo Inc.',
    'CRM': 'Salesforce Inc.',
    'AMD': 'Advanced Micro Devices',
    'INTC': 'Intel Corporation',
    'CSCO': 'Cisco Systems Inc.',
    'ORCL': 'Oracle Corporation',
    'IBM': 'International Business Machines',
    'PFE': 'Pfizer Inc.',
    'DIS': 'Walt Disney Company',
    'CMCSA': 'Comcast Corporation',
    'VZ': 'Verizon Communications Inc.',
    'T': 'AT&T Inc.',
    'WFC': 'Wells Fargo & Company',
    'GS': 'Goldman Sachs Group Inc.',
    'C': 'Citigroup Inc.',
    'CAT': 'Caterpillar Inc.',
    'BA': 'Boeing Company',
    'GE': 'General Electric Company',
    'HON': 'Honeywell International Inc.',
    'SBUX': 'Starbucks Corporation',
    'NKE': 'NIKE Inc.',
    'MCD': 'McDonald’s Corporation',
    'LOW': 'Lowe’s Companies Inc.'
}

# Define categories
# Ordered list of the category labels; each label has a matching entry in
# category_descriptions. Not referenced by the functions visible in this notebook.
categories = [
    'News - Positive Sentiment',
    'News - Negative Sentiment',
    'News - New Products',
    'News - Layoffs',
    'News - Analyst Comments',
    'News - Stocks',
    'News - Dividends',
    'News - Corporate Earnings',
    'News - Mergers & Acquisitions',
    'News - Store Openings',
    'News - Product Recalls',
    'News - Adverse Events',
    'News - Personnel Changes',
    'News - Stock Rumors',
]

# Category descriptions for embedding-based tagging (simulating fine-tuned model)
# Maps category label -> short natural-language description. tag_categories
# embeds each description and compares it with the headline embedding; a label
# is assigned when the cosine similarity exceeds that function's threshold.
category_descriptions = {
    'News - Positive Sentiment': 'Stock price increases, optimistic outlook, strong performance',
    'News - Negative Sentiment': 'Stock price declines, negative outlook, poor performance',
    'News - New Products': 'Launch or announcement of new products or services',
    'News - Layoffs': 'Company announces job cuts or downsizing',
    'News - Analyst Comments': 'Analyst reports, forecasts, or concerns about the company',
    'News - Stocks': 'General news about stock price or equity movements',
    'News - Dividends': 'Announcements about dividend payouts or changes',
    'News - Corporate Earnings': 'Reports on company earnings, revenue, or EPS',
    'News - Mergers & Acquisitions': 'Mergers, acquisitions, or buyouts involving the company',
    'News - Store Openings': 'New store openings or business expansion',
    'News - Product Recalls': 'Product defects, recalls, or safety issues',
    'News - Adverse Events': 'Lawsuits, regulatory scrutiny, or negative events',
    'News - Personnel Changes': 'Changes in executives, CEO, or key personnel',
    'News - Stock Rumors': 'Speculation or rumors about stock or company actions'
}

# Function to get DistilBERT embeddings
def get_embeddings(text):
    """Embed ``text`` with the module-level DistilBERT ``tokenizer`` and ``model``.

    The input is tokenized as PyTorch tensors, truncated to at most 128 tokens,
    and run through the model with gradient tracking disabled.

    Returns:
        A NumPy array holding the last-layer hidden state at sequence
        position 0 (the CLS token), with the leading batch axis kept.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 along the sequence axis is the CLS token
    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors.numpy()

# Function to map headlines to tickers
def map_to_ticker(text, tickers=None):
    """Return the first ticker whose symbol or company name appears in ``text``.

    Matching rules:
      * Ticker symbols must appear as a standalone, upper-case token (not
        embedded in a longer alphanumeric run). A plain substring test would
        make short symbols such as 'V', 'T', 'C' or 'MA' match inside ordinary
        words, assigning a wrong ticker to almost every headline.
      * Company names are matched as case-insensitive substrings, with
        typographic apostrophes (U+2019) treated the same as ASCII ones.

    Args:
        text: Headline text. Non-string values (e.g. NaN from an empty CSV
            cell) and empty strings yield 'Unknown'.
        tickers: Optional mapping of ticker -> company name. Defaults to the
            module-level ``sp500_tickers``.

    Returns:
        The matching ticker symbol, or 'Unknown' when nothing matches. Entries
        are tried in the mapping's iteration order; the first match wins.
    """
    if tickers is None:
        tickers = sp500_tickers
    if not isinstance(text, str) or not text:
        return 'Unknown'

    # Normalize curly apostrophes so "McDonald's" and "McDonald\u2019s" compare equal
    text_lower = text.replace('\u2019', "'").lower()
    for ticker, name in tickers.items():
        # Case-sensitive on purpose: lower-casing would let 'LOW', 'COST' or the
        # "t" in "don't" match common English words.
        ticker_pattern = r'(?<![A-Za-z0-9])' + re.escape(ticker) + r'(?![A-Za-z0-9])'
        if re.search(ticker_pattern, text):
            return ticker
        if name.replace('\u2019', "'").lower() in text_lower:
            return ticker
    return 'Unknown'

# Function for multi-label tagging using embeddings
# Description text -> embedding. The descriptions do not depend on the headline,
# so each one is embedded once instead of once per headline.
_description_embedding_cache = {}


def _get_description_embedding(desc):
    """Return the embedding of a category description, computing it at most once."""
    if desc not in _description_embedding_cache:
        _description_embedding_cache[desc] = get_embeddings(desc)
    return _description_embedding_cache[desc]


def tag_categories(text, threshold=0.85):
    """Assign every category whose description embedding is close to ``text``.

    The headline is embedded with ``get_embeddings`` and compared, via cosine
    similarity, with the embedding of each entry in ``category_descriptions``.

    Args:
        text: Headline text to tag.
        threshold: A category is assigned when its similarity is strictly
            greater than this value. Defaults to 0.85.

    Returns:
        A list of matching category labels in ``category_descriptions`` order,
        or ``['None']`` when no category exceeds the threshold.

    NOTE(review): the threshold was marked "adjust based on testing" by the
    original author; confirm it separates categories on real headlines.
    """
    text_embedding = get_embeddings(text)
    tags = []
    for category, desc in category_descriptions.items():
        desc_embedding = _get_description_embedding(desc)
        similarity = cosine_similarity(text_embedding, desc_embedding)[0][0]
        if similarity > threshold:
            tags.append(category)
    return tags if tags else ['None']

# Apply sentiment analysis
def get_sentiment(text):
    """Classify ``text`` with the module-level ``sentiment_analyzer`` pipeline.

    Returns:
        A ``(label, score)`` tuple taken from the first result the pipeline
        returns for the input.
    """
    top_prediction = sentiment_analyzer(text)[0]
    return top_prediction['label'], top_prediction['score']

Device set to use cpu


In [17]:
# Load the raw headlines (pandas is already imported in the first cell).
# Later cells read the 'Headline' column of this frame.
df = pd.read_csv('data/BloombergNews100.csv')

In [18]:
# Resolve each headline to a ticker symbol ('Unknown' when nothing matches)
df['Ticker'] = df['Headline'].map(map_to_ticker)

In [20]:
# Multi-label tagging: each cell holds the list of labels returned by tag_categories
df['Categories'] = df['Headline'].map(tag_categories)

In [22]:
# get_sentiment returns a (label, score) tuple per headline. Build the two
# columns with one DataFrame construction instead of .apply(pd.Series), which
# creates a separate Series object for every row. Explicit column names keep
# the assignment valid even when the frame has no rows.
df[['Sentiment', 'Sentiment_Score']] = pd.DataFrame(
    df['Headline'].apply(get_sentiment).tolist(),
    columns=['Sentiment', 'Sentiment_Score'],
    index=df.index,
)

In [23]:
# Create the output folder if needed so this cell also works on a fresh checkout
Path('data/News Tagging').mkdir(parents=True, exist_ok=True)
df.to_csv('data/News Tagging/news_tags_distilbert.csv')