In [None]:
!pip install openai

In [None]:
import pandas as pd
import yfinance as yf
from datetime import date, timedelta
import openai
from openai import OpenAI
import json
import time
from tqdm.notebook import tqdm

In [None]:
# Read the m7 financial news dataset, downloaded as a local CSV from
# https://huggingface.co/datasets/itsalissonsilva/mag7-news-dataset
# Rows the Python parser cannot handle are skipped instead of raising.
df = pd.read_csv(
    'df_m7.csv',
    engine='python',
    quotechar='"',
    on_bad_lines='skip',
)

In [None]:
# Convert 'Date' to pandas datetimes so the .dt accessor can be used below.
df['Date'] = pd.to_datetime(df['Date'])

# Select the rows whose ticker is NVDA and whose date falls in 2023.
# .copy() gives an independent frame, so adding columns later does not touch df.
df_nvda = df.loc[
    df['Stock_symbol'].eq('NVDA') & df['Date'].dt.year.eq(2023)
].copy()

In [None]:
# Calendar date of each article (time of day dropped); used as the per-day key.
df_nvda['DateOnly'] = df_nvda['Date'].dt.date

# Sort by timestamp, then keep the first row seen for each calendar day.
df_nvda_one_per_day = (
    df_nvda
    .sort_values(by='Date')
    .drop_duplicates(subset=['DateOnly'], keep='first')
)

In [None]:
# Every calendar day of 2023 (365 days starting at Jan 1).
all_2023_dates = {date(2023, 1, 1) + timedelta(days=offset) for offset in range(365)}

# Calendar days that have an NVDA article.
nvda_dates = {day for day in df_nvda_one_per_day['Date'].dt.date}

# Days of 2023 with no article, in chronological order.
missing_nvda = sorted(all_2023_dates.difference(nvda_dates))

# Report the count followed by the full list of missing days.
print(f"Missing NVDA days in 2023 ({len(missing_nvda)}):", missing_nvda, sep="\n")

In [None]:
# Download NVDA price history for the 2023-01-01 .. 2024-01-01 window.
nvda_ticker = yf.Ticker("NVDA")
price_nvda = nvda_ticker.history(start="2023-01-01", end="2024-01-01")
if price_nvda.empty:
    # Fail loudly here rather than with an obscure error further down.
    raise RuntimeError("yfinance returned no NVDA price data; check network access and retry")

# Keep only the closing price and move the date index into a 'Date' column.
price_nvda = price_nvda[['Close']].reset_index()
price_nvda['DateOnly'] = price_nvda['Date'].dt.date

# .assign returns a new frame, which avoids a SettingWithCopyWarning on the
# de-duplicated (filtered) article frame.
df_nvda_one_per_day = df_nvda_one_per_day.assign(
    DateOnly=df_nvda_one_per_day['Date'].dt.date
)

# Merge only DateOnly + Close from the price frame. Merging the whole price
# frame would bring in its own 'Date' column and pandas would rename the two
# to Date_x / Date_y, leaving no plain 'Date' column on the result.
# how='left' keeps every article; days without a price row get NaN for Close.
df_nvda_merged = pd.merge(
    df_nvda_one_per_day,
    price_nvda[['DateOnly', 'Close']],
    on='DateOnly',
    how='left',
)
df_nvda_merged = df_nvda_merged.drop(columns=['DateOnly'])

In [None]:
# Dates for which the price data has a row.
valid_nvda_dates = set(price_nvda['DateOnly'])

# Keep only the articles published on one of those dates.
df_nvda_trading_days = df_nvda_one_per_day.loc[
    df_nvda_one_per_day['DateOnly'].isin(valid_nvda_dates)
].copy()

# Attach the closing price by calendar date, then drop the join key.
df_nvda_merged = (
    df_nvda_trading_days
    .merge(price_nvda[['DateOnly', 'Close']], on='DateOnly', how='left')
    .drop(columns=['DateOnly'])
)

In [None]:
# Do not paste the API key into the notebook. With no api_key argument the
# OpenAI client reads it from the OPENAI_API_KEY environment variable and
# raises if the variable is not set.
client = OpenAI()

In [None]:
_SENTIMENT_KEYS = ("Optimism", "Uncertainty", "Surprise", "Immediacy", "Relief")


def _empty_scores():
    """Return a fresh dict mapping every sentiment dimension to None."""
    return dict.fromkeys(_SENTIMENT_KEYS)


def _parse_scores(content):
    """Turn the model's raw reply text into a dict with exactly the five score keys."""
    # Chat models may wrap the JSON in ``` fences or add a sentence around it,
    # so parse only the outermost {...} span rather than the whole reply.
    start, end = content.find("{"), content.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"no JSON object found in model reply: {content!r}")
    payload = json.loads(content[start:end + 1])

    # Match keys case-insensitively so "optimism" and "Optimism" land in the same column.
    by_lower = {str(key).strip().lower(): value for key, value in payload.items()}
    scores = _empty_scores()
    for key in _SENTIMENT_KEYS:
        try:
            scores[key] = float(by_lower[key.lower()])
        except (KeyError, TypeError, ValueError):
            # Dimension missing or not numeric: leave it as None.
            pass
    return scores


def analyze_sentiment(summary, model="gpt-4"):
    """Score a news summary on five sentiment dimensions using a chat model.

    Args:
        summary: Text of the news summary. Blank or non-string input returns
            all-None scores without calling the API.
        model: Name of the chat-completions model to query.

    Returns:
        dict with exactly the keys "Optimism", "Uncertainty", "Surprise",
        "Immediacy" and "Relief". Each value is a float, or None when the
        model omitted it, returned a non-numeric value, or the call failed.

    Any exception from the API call or from parsing is printed and converted
    into an all-None result, so a batch run is not aborted by one bad reply.
    """
    if not isinstance(summary, str) or not summary.strip():
        return _empty_scores()

    prompt = f"""
You are a financial sentiment analyst.

Given the following news summary about a stock, assign a score from 0 to 1 (rounded to 2 decimal places) for each of the following sentiment dimensions:

- Optimism (positive forward-looking sentiment)
- Uncertainty (vagueness, ambiguity, or risk)
- Surprise (unexpected developments)
- Immediacy (urgency or time sensitivity)
- Relief (easing of prior concerns)

Only respond in valid JSON format.

Summary:
\"\"\"{summary}\"\"\"
"""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return _parse_scores(response.choices[0].message.content)
    except Exception as e:
        print(f"Error: {e}")
        return _empty_scores()


In [None]:
# Score every article summary with the LLM; tqdm shows progress.
EMPTY_SCORES = {
    "Optimism": None,
    "Uncertainty": None,
    "Surprise": None,
    "Immediacy": None,
    "Relief": None,
}

summaries = df_nvda_merged['Lsa_summary'].fillna("").tolist()
scores = []

for summary in tqdm(summaries, desc="Scoring sentiment"):
    if not summary.strip():
        # Nothing to score: record blanks and skip the API call and the pause.
        scores.append(dict(EMPTY_SCORES))
        continue
    try:
        result = analyze_sentiment(summary)
    except Exception as exc:
        # Catch Exception, not a bare except, so interrupting the kernel
        # (KeyboardInterrupt) still stops this long-running loop.
        print(f"Scoring failed: {exc}")
        result = dict(EMPTY_SCORES)
    scores.append(result)
    time.sleep(1)  # adjust if needed

# Attach one row of scores to each article. The validation and export cells
# read `df_nvda_final`, which was otherwise never defined in this notebook.
# Both indexes are reset so the frames line up row by row.
df_nvda_final = pd.concat(
    [df_nvda_merged.reset_index(drop=True), pd.DataFrame(scores)],
    axis=1,
)

In [None]:
# Sanity checks on the final frame: date coverage and price completeness.
df_nvda_final['Date'] = pd.to_datetime(df_nvda_final['Date'])
article_dates = df_nvda_final['Date']

print("Date range:", article_dates.min().date(), "to", article_dates.max().date())

print("Unique days with articles:", article_dates.dt.date.nunique())

# Count rows that have no closing price attached.
missing_prices = df_nvda_final['Close'].isnull().sum()
print("Missing 'Close' prices:", missing_prices)

In [None]:
# Export the final frame to a CSV whose name records the covered date range.
df_nvda_final['Date'] = pd.to_datetime(df_nvda_final['Date'])

date_format = '%Y-%m-%d'
start_date = df_nvda_final['Date'].min().strftime(date_format)
end_date = df_nvda_final['Date'].max().strftime(date_format)

filename = f"nvda_final_{start_date}_to_{end_date}.csv"

# index=False leaves the row index out of the file.
df_nvda_final.to_csv(path_or_buf=filename, index=False)