In [None]:
!pip install openai



In [None]:
import pandas as pd
import yfinance as yf
from datetime import date, timedelta
import openai
from openai import OpenAI
import json
import time
from tqdm.notebook import tqdm

In [None]:
# Source: m7 financial news dataset
# (https://huggingface.co/datasets/itsalissonsilva/mag7-news-dataset).
# Lines the python parsing engine cannot read are skipped (on_bad_lines='skip').
df = pd.read_csv(
    'df_m7.csv',
    engine='python',
    quotechar='"',
    on_bad_lines='skip',
)

In [None]:
# Parse the article timestamps once so the .dt accessor can be used below.
df['Date'] = pd.to_datetime(df['Date'])

# Keep only NVDA articles dated 2022 or 2023; .copy() gives an independent frame.
df_nvda = df.loc[
    df['Stock_symbol'].eq('NVDA') & df['Date'].dt.year.isin([2022, 2023])
].copy()

In [None]:
# `df_nvda` (NVDA articles from 2022-2023, with 'Date' already parsed) is built
# by the cell directly above; the filter is not repeated here, only the derived
# calendar columns are added.
df_nvda['Year'] = df_nvda['Date'].dt.year
df_nvda['DateOnly'] = df_nvda['Date'].dt.date

# Number of distinct calendar days with at least one article, per year.
counts_per_year = (
    df_nvda.groupby('Year')['DateOnly']
           .nunique()
           .reset_index(name='UniqueDays')
)

print(counts_per_year)


   Year  UniqueDays
0  2022         344
1  2023         343


In [None]:
# Calendar day (no time component) used as the de-duplication key.
df_nvda['DateOnly'] = df_nvda['Date'].dt.date

# Keep a single article per calendar day.
# kind='stable' preserves the existing row order among articles that share the
# same 'Date' value, so keep='first' picks a reproducible row instead of
# whichever one the default (unstable) quicksort happens to place first.
# .copy() yields an independent frame, matching how df_nvda itself is created.
df_nvda_one_per_day = (
    df_nvda
    .sort_values('Date', kind='stable')
    .drop_duplicates(subset='DateOnly', keep='first')
    .copy()
)

In [None]:
# Every calendar day of 2023: 365 consecutive days starting on January 1st.
all_2023_dates = {date(2023, 1, 1) + timedelta(days=offset) for offset in range(365)}

# Calendar days that have an article in the one-per-day frame.
nvda_dates = set(df_nvda_one_per_day['Date'].dt.date)

# 2023 days without any article, in chronological order.
missing_nvda = sorted(all_2023_dates.difference(nvda_dates))

print(f"Missing NVDA days in 2023 ({len(missing_nvda)}):")
print(missing_nvda)

Missing NVDA days in 2023 (22):
[datetime.date(2023, 1, 7), datetime.date(2023, 1, 21), datetime.date(2023, 1, 28), datetime.date(2023, 3, 11), datetime.date(2023, 3, 12), datetime.date(2023, 5, 6), datetime.date(2023, 5, 20), datetime.date(2023, 12, 17), datetime.date(2023, 12, 18), datetime.date(2023, 12, 19), datetime.date(2023, 12, 20), datetime.date(2023, 12, 21), datetime.date(2023, 12, 22), datetime.date(2023, 12, 23), datetime.date(2023, 12, 24), datetime.date(2023, 12, 25), datetime.date(2023, 12, 26), datetime.date(2023, 12, 27), datetime.date(2023, 12, 28), datetime.date(2023, 12, 29), datetime.date(2023, 12, 30), datetime.date(2023, 12, 31)]


In [None]:
# Download NVDA price history for the article window and keep the closing price.
nvda_ticker = yf.Ticker("NVDA")
price_nvda = nvda_ticker.history(start="2022-01-01", end="2024-01-01")
price_nvda = price_nvda[['Close']].reset_index()

# Calendar-day join key on both sides.
price_nvda['DateOnly'] = price_nvda['Date'].dt.date
df_nvda_one_per_day['DateOnly'] = df_nvda_one_per_day['Date'].dt.date

# Left-join so every article row is kept; days without a price get NaN 'Close'.
# Only 'DateOnly' and 'Close' are taken from the price frame: both frames have
# their own 'Date' column, and merging the full price frame would rename them
# to 'Date_x' / 'Date_y', leaving the result without a plain 'Date' column.
df_nvda_merged = pd.merge(
    df_nvda_one_per_day,
    price_nvda[['DateOnly', 'Close']],
    on='DateOnly',
    how='left',
)

In [None]:
# Distinct article days per year in the price-merged frame.
counts_per_year_merged = (
    df_nvda_merged
    .groupby('Year')['DateOnly']
    .nunique()
    .rename('UniqueDays')
    .reset_index()
)
print(counts_per_year_merged)

# First and last article day within each year.
date_ranges_merged = (
    df_nvda_merged
    .groupby('Year')['DateOnly']
    .agg(['min', 'max'])
    .reset_index()
)
print(date_ranges_merged)

   Year  UniqueDays
0  2022         250
1  2023         241
   Year         min         max
0  2022  2022-01-03  2022-12-30
1  2023  2023-01-03  2023-12-15


In [None]:
# Calendar days for which a price row exists.
valid_nvda_dates = set(price_nvda['DateOnly'])

# Restrict the one-article-per-day frame to those days.
df_nvda_trading_days = df_nvda_one_per_day.loc[
    df_nvda_one_per_day['DateOnly'].isin(valid_nvda_dates)
].copy()

# Attach the closing price of the matching day to each remaining article.
df_nvda_merged = df_nvda_trading_days.merge(
    price_nvda[['DateOnly', 'Close']],
    how='left',
    on='DateOnly',
)


In [None]:
# Never paste an API key into the notebook: it would be saved and shared with
# the file. With no `api_key` argument the client takes the key from the
# OPENAI_API_KEY environment variable and raises if it is not set.
client = OpenAI()

In [None]:
# The five sentiment dimensions, in the order they are listed in the prompt.
_SENTIMENT_DIMENSIONS = ("Optimism", "Uncertainty", "Surprise", "Immediacy", "Relief")


def _empty_sentiment():
    """Return a fresh dict with every sentiment dimension set to None."""
    return dict.fromkeys(_SENTIMENT_DIMENSIONS)


def _to_score(value):
    """Convert one reply value to float; anything non-numeric becomes None."""
    if isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _parse_sentiment_reply(content):
    """Extract the five sentiment scores from the model's raw reply text.

    Raises ValueError (or json.JSONDecodeError, a subclass) when the reply
    does not contain a JSON object.
    """
    text = (content or "").strip()
    # Replies are sometimes wrapped in a Markdown fence or surrounded by a
    # sentence of commentary; only the outermost {...} span is parsed.
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model reply")
    payload = json.loads(text[start:end + 1])
    if not isinstance(payload, dict):
        raise ValueError("model reply is not a JSON object")
    # Match keys case-insensitively so "optimism" and "Optimism" are both accepted.
    by_lower = {str(key).strip().lower(): value for key, value in payload.items()}
    return {name: _to_score(by_lower.get(name.lower())) for name in _SENTIMENT_DIMENSIONS}


def analyze_sentiment(summary, model="gpt-4", api_client=None):
    """Score a news summary on five sentiment dimensions with a chat model.

    Parameters
    ----------
    summary : str
        News summary text that is embedded in the prompt.
    model : str
        Name of the chat model passed to the completions endpoint.
    api_client : optional
        Object exposing ``chat.completions.create(...)``. When None, the
        notebook-level ``client`` is used.

    Returns
    -------
    dict
        Exactly the keys "Optimism", "Uncertainty", "Surprise", "Immediacy"
        and "Relief". Each value is a float, or None when the model did not
        provide a numeric value for it. On any failure (API error, reply that
        is not a JSON object, ...) the error is printed and every value is None.
    """
    prompt = f"""
You are a financial sentiment analyst.

Given the following news summary about a stock, assign a score from 0 to 1 (rounded to 2 decimal places) for each of the following sentiment dimensions:

- Optimism (positive forward-looking sentiment)
- Uncertainty (vagueness, ambiguity, or risk)
- Surprise (unexpected developments)
- Immediacy (urgency or time sensitivity)
- Relief (easing of prior concerns)

Respond with only a valid JSON object whose keys are exactly "Optimism", "Uncertainty", "Surprise", "Immediacy" and "Relief" (no Markdown, no commentary).

Summary:
\"\"\"{summary}\"\"\"
"""
    try:
        # Resolved inside the try so that a missing notebook-level `client`
        # is reported through the same best-effort path as an API failure.
        active_client = client if api_client is None else api_client
        response = active_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return _parse_sentiment_reply(response.choices[0].message.content)
    except Exception as e:
        # Deliberate best-effort: one bad reply must not abort a long batch.
        print(f"Error: {e}")
        return _empty_sentiment()


In [None]:
# Keys of one score record; a record with every value None marks "no score".
SENTIMENT_KEYS = ("Optimism", "Uncertainty", "Surprise", "Immediacy", "Relief")

# Missing summaries become empty strings so they can be detected and skipped.
summaries = df_nvda_merged['Lsa_summary'].fillna("").tolist()
scores = []

for summary in tqdm(summaries, desc="Scoring sentiment"):
    if not summary.strip():
        # Nothing to send to the model: record an all-None row, skip the API call.
        scores.append(dict.fromkeys(SENTIMENT_KEYS))
        continue
    try:
        result = analyze_sentiment(summary)
    except Exception as exc:
        # `except Exception` (not a bare `except:`) so that interrupting the
        # kernel still stops this long loop instead of being swallowed.
        print(f"Error: {exc}")
        result = dict.fromkeys(SENTIMENT_KEYS)
    scores.append(result)
    time.sleep(1)  # adjust if needed

Scoring sentiment:   0%|          | 0/491 [00:00<?, ?it/s]

In [None]:
# `scores` is a plain list of dicts (not a DataFrame), so preview it by slicing.
scores[:5]

AttributeError: 'list' object has no attribute 'head'

In [None]:
# Preview the first five rows of the article/price frame.
df_nvda_merged.head(n=5)

Unnamed: 0,Date,Article_title,Stock_symbol,Url,Article,Lsa_summary,Luhn_summary,Textrank_summary,Lexrank_summary,Year,DateOnly,Close
0,2022-01-03 00:00:00+00:00,3 Surefire Metaverse Stocks That Could Make Yo...,NVDA,https://www.nasdaq.com/articles/3-surefire-met...,The metaverse has created quite a buzz as comp...,Nvidia: Powering the metaverse No discussion r...,Nvidia: Powering the metaverse No discussion r...,Nvidia: Powering the metaverse No discussion r...,Nvidia: Powering the metaverse No discussion r...,2022,2022-01-03,30.066137
1,2022-01-04 00:00:00+00:00,Why This Semiconductor Stock's Price Jumped Mo...,NVDA,https://www.nasdaq.com/articles/why-this-semic...,Today's video focuses on Advanced Micro Device...,The update informs investors that AMD now expe...,Today's video focuses on Advanced Micro Device...,Today's video focuses on Advanced Micro Device...,"AMD released an update on Dec. 30, 2021, about...",2022,2022-01-04,29.236647
2,2022-01-05 00:00:00+00:00,"Why Intel Popped, but AMD and Nvidia Dropped T...",NVDA,https://www.nasdaq.com/articles/why-intel-popp...,"What happened\nIt's Jan. 5 and at long last, t...","In contrast, Intel rivals Nvidia (NASDAQ: NVDA...","In contrast, Intel rivals Nvidia (NASDAQ: NVDA...","In contrast, Intel rivals Nvidia (NASDAQ: NVDA...","In contrast, Intel rivals Nvidia (NASDAQ: NVDA...",2022,2022-01-05,27.553715
3,2022-01-06 00:00:00+00:00,Tesla Briefly Led the Nasdaq to New Heights. H...,NVDA,https://www.nasdaq.com/articles/tesla-briefly-...,"On Monday, a sea of green glossed over the U.S...","Wednesday brought a very different tune, as th...",A heavyweight in the U.S. stock market Daniel ...,A heavyweight in the U.S. stock market Daniel ...,A heavyweight in the U.S. stock market Daniel ...,2022,2022-01-06,28.126673
4,2022-01-07 00:00:00+00:00,What Is the Metaverse (And How Can I Invest In...,NVDA,https://www.nasdaq.com/articles/what-is-the-me...,Just as we were getting accustomed to the idea...,"For example, Nvidia (NVDA) created Omniverse, ...","For example, Nvidia (NVDA) created Omniverse, ...","For example, Nvidia (NVDA) created Omniverse, ...","For example, Nvidia (NVDA) created Omniverse, ...",2022,2022-01-07,27.197369


In [None]:
# The scores are attached by position, so there must be exactly one score
# record per article row. Fail loudly here: a column-wise concat of frames of
# different length would silently pad the shorter side with NaN.
assert len(scores) == len(df_nvda_merged), (
    f"expected {len(df_nvda_merged)} score records, got {len(scores)}"
)

scores_df = pd.DataFrame(scores)
# Reset the left index so both frames share the same 0..n-1 positional index.
df_nvda_scored = pd.concat([df_nvda_merged.reset_index(drop=True), scores_df], axis=1)

df_nvda_final = df_nvda_scored.copy()
# Unparseable dates become NaT instead of raising.
df_nvda_final['Date'] = pd.to_datetime(df_nvda_final['Date'], errors='coerce')

# Quick sanity report on the assembled frame.
print("Date range:", df_nvda_final['Date'].min().date(), "to", df_nvda_final['Date'].max().date())
print("Unique days with articles:", df_nvda_final['Date'].dt.date.nunique())
missing_prices = df_nvda_final['Close'].isna().sum() if 'Close' in df_nvda_final.columns else "column 'Close' not found"
print("Missing 'Close' prices:", missing_prices)

Date range: 2022-01-03 to 2023-12-15
Unique days with articles: 491
Missing 'Close' prices: 0


In [None]:
# Make sure 'Date' is datetime before deriving the span used in the file name.
df_nvda_final['Date'] = pd.to_datetime(df_nvda_final['Date'])

# First and last article day, formatted as YYYY-MM-DD.
start_date = df_nvda_final['Date'].min().strftime('%Y-%m-%d')
end_date = df_nvda_final['Date'].max().strftime('%Y-%m-%d')

# The output file name records the covered date range.
filename = "nvda_final_{}_to_{}.csv".format(start_date, end_date)

# Write the scored frame without the positional index column.
df_nvda_final.to_csv(filename, index=False)

In [None]:
# Rows dated 2022: sum of the boolean year mask.
count_2022 = df_nvda_final['Date'].dt.year.eq(2022).sum()
print("Number of rows in 2022:", count_2022)

# Distinct calendar days among the 2022 rows.
unique_days_2022 = (
    df_nvda_final['Date'][df_nvda_final['Date'].dt.year.eq(2022)]
    .dt.date
    .nunique()
)
print("Unique days with articles in 2022:", unique_days_2022)


Number of rows in 2022: 250
Unique days with articles in 2022: 250
