---
# 1: SETUP, CONFIGURATION & LIBRARY IMPORTS

In [1]:
# Installs the latest development versions of key libraries to ensure compatibility.
#

# --- 1. Install Libraries from the latest source to ensure compatibility ---
print("▶ Installing/updating required libraries from the latest sources...")
!pip install -q pandas numpy torch yfinance newsapi-python praw
!pip install -q --upgrade bitsandbytes
!pip install -q --upgrade git+https://github.com/huggingface/transformers.git
!pip install -q --upgrade git+https://github.com/huggingface/accelerate.git

# --- 2. Library Imports ---
import yfinance as yf
import pandas as pd
import praw
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from newsapi import NewsApiClient
from datetime import datetime, timedelta
from kaggle_secrets import UserSecretsClient
import torch

# --- 3. Main Configuration ---
TICKER_SYMBOL = 'NVDA'
SEARCH_KEYWORD = 'NVIDIA'
SUBREDDITS_TO_SEARCH = ['wallstreetbets', 'investing', 'stocks', 'technology', 'techstocks']
POST_LIMIT_PER_SUBREDDIT = 25
COMMENT_LIMIT_PER_POST = 10

print("✔ Cell 1/4: All libraries installed and configuration complete. Please restart the session and run all cells.")

▶ Installing/updating required libraries from the latest sources...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m

2025-08-29 17:51:48.814231: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756489909.007679      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756489909.064763      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✔ Cell 1/4: All libraries installed and configuration complete. Please restart the session and run all cells.


---
# 2. DATA GATHERING AGENT FUNCTIONS

In [2]:
# This cell contains all functions responsible for fetching data from external APIs.
#

def get_api_credentials():
    """Retrieves all necessary API credentials from Kaggle Secrets."""
    print("▶ Attempting to load API credentials from Kaggle Secrets...")
    credentials = {}
    try:
        user_secrets = UserSecretsClient()
        credentials['news_api_key'] = user_secrets.get_secret("NEWS_API_KEY")
        credentials['reddit_client_id'] = user_secrets.get_secret("REDDIT_CLIENT_ID")
        credentials['reddit_client_secret'] = user_secrets.get_secret("REDDIT_CLIENT_SECRET")
        credentials['reddit_user_agent'] = f'FinSentimentTracker v0.2 by {user_secrets.get_secret("KAGGLE_USERNAME")}'
        print("✔ All API credentials loaded successfully.")
        return credentials
    except Exception as e:
        print(f"ERROR: Could not retrieve one or more API keys. Please check your Kaggle Secrets.")
        return None

def fetch_financial_news(api_key, keyword):
    """Fetches recent financial news using NewsAPI."""
    print(f"\\nFetching news articles for '{keyword}'...")
    try:
        newsapi = NewsApiClient(api_key=api_key)
        seven_days_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
        all_articles = newsapi.get_everything(q=keyword, language='en', from_param=seven_days_ago, sort_by='relevancy', page_size=100)
        articles_df = pd.DataFrame(all_articles['articles'])
        articles_df['source'] = articles_df['source'].apply(lambda x: x.get('name', 'Unknown Source'))
        articles_df.rename(columns={'title': 'text_for_display'}, inplace=True)
        print(f"✔ {len(articles_df)} news articles fetched successfully.")
        return articles_df[['source', 'content']]
    except Exception as e:
        print(f"ERROR: Could not fetch news. Details: {e}")
        return pd.DataFrame() # Return empty DataFrame on error

def fetch_reddit_discussions_advanced(creds, keyword, subreddits, post_limit, comment_limit):
    """Fetches top posts AND their top comments from specified subreddits."""
    print(f"\\nFetching Reddit posts and comments for '{keyword}'...")
    try:
        reddit = praw.Reddit(client_id=creds['reddit_client_id'],
                             client_secret=creds['reddit_client_secret'],
                             user_agent=creds['reddit_user_agent'])
        all_texts = []
        for subreddit_name in subreddits:
            subreddit = reddit.subreddit(subreddit_name)
            submissions = subreddit.search(keyword, sort='top', time_filter='week', limit=post_limit)
            for submission in submissions:
                all_texts.append({'source': f'Reddit Post (r/{subreddit_name})', 'content': submission.title})
                if submission.selftext:
                    all_texts.append({'source': f'Reddit Post (r/{subreddit_name})', 'content': submission.selftext})
                submission.comments.replace_more(limit=0)
                for i, comment in enumerate(submission.comments.list()):
                    if i >= comment_limit: break
                    all_texts.append({'source': f'Reddit Comment (r/{subreddit_name})', 'content': comment.body})
        reddit_df = pd.DataFrame(all_texts)
        print(f"✔ {len(reddit_df)} total texts fetched from Reddit.")
        return reddit_df
    except Exception as e:
        print(f"ERROR: Failed to fetch from Reddit. Details: {e}")
        return pd.DataFrame() # Return empty DataFrame on error
        
print("✔ Cell 2/4: Data gathering functions are defined.")

✔ Cell 2/4: Data gathering functions are defined.


---
# 3. SENTIMENT ANALYSIS AND SUMMARIZATION FUNCTIONS

In [3]:
# This cell defines the functions for running the FinBERT model and synthesizing the results.
#

def analyze_sentiment(data_dataframe):
    """Analyzes the sentiment of a dataframe with a 'content' column."""
    if data_dataframe is None or data_dataframe.empty:
        print("INFO: DataFrame is empty. Skipping sentiment analysis.")
        return None
    print("\\n▶ Initializing FinBERT sentiment analysis pipeline...")
    sentiment_pipeline = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=0 if torch.cuda.is_available() else -1)
    print("✔ FinBERT pipeline ready.")
    texts_to_analyze = data_dataframe['content'].fillna('').tolist()
    if not any(texts_to_analyze):
        print("WARNING: No text content found to analyze.")
        return data_dataframe
    print(f"▶ Analyzing sentiment for {len(texts_to_analyze)} texts...")
    sentiment_results = sentiment_pipeline(texts_to_analyze, truncation=True, batch_size=16)
    data_dataframe[['sentiment_label', 'sentiment_score']] = pd.DataFrame(sentiment_results)
    print("✔ Sentiment analysis complete.")
    return data_dataframe

def summarize_sentiment_data(analyzed_df, keyword):
    """Calculates key metrics and formats them into a text summary for the LLM."""
    if analyzed_df is None or 'sentiment_label' not in analyzed_df.columns:
        print("ERROR: Sentiment analysis data missing.")
        return None
    print("\\n▶ Synthesizing sentiment data into a final report...")
    sentiment_distribution = analyzed_df['sentiment_label'].value_counts(normalize=True).mul(100)
    summary = {
        'positive': sentiment_distribution.get('positive', 0),
        'negative': sentiment_distribution.get('negative', 0),
        'neutral': sentiment_distribution.get('neutral', 0),
        'dominant': sentiment_distribution.idxmax(),
        'avg_score': analyzed_df['sentiment_score'].mean()
    }
    pos_examples = analyzed_df[analyzed_df['sentiment_label'] == 'positive']
    neg_examples = analyzed_df[analyzed_df['sentiment_label'] == 'negative']
    summary['top_positive'] = pos_examples.loc[pos_examples['sentiment_score'].idxmax()]['content'][:150] if not pos_examples.empty else "N/A"
    summary['top_negative'] = neg_examples.loc[neg_examples['sentiment_score'].idxmax()]['content'][:150] if not neg_examples.empty else "N/A"
    
    summary_text = f"""
### Financial Sentiment Analysis Report: {keyword} ###
**1. Overall Sentiment Distribution:**
- Positive: {summary['positive']:.2f}%
- Negative: {summary['negative']:.2f}%
- Neutral: {summary['neutral']:.2f}%
**2. Key Insights:**
- The dominant sentiment in recent online discussions is **{summary['dominant']}**.
- The average model confidence score was **{summary['avg_score']:.4f}**.
**3. Representative Snippets:**
- **Example of Positive Sentiment:** "{summary['top_positive']}..."
- **Example of Negative Sentiment:** "{summary['top_negative']}..."
This summary is based on {len(analyzed_df)} texts from news and Reddit.
"""
    print("✔ Data-driven summary for the LLM has been generated.")
    return summary_text

print("✔ Cell 3/4: Analysis and summarization functions are defined.")

✔ Cell 3/4: Analysis and summarization functions are defined.


---
# 4. MAIN EXECUTION PIPELINE

In [4]:
# This cell runs the entire data gathering, analysis, and summarization process.
#

if __name__ == '__main__':
    api_credentials = get_api_credentials()
    
    # Initialize an empty DataFrame to hold all content
    all_content_df = pd.DataFrame()

    if api_credentials:
        # Step 1: Gather data from all sources
        news_df = fetch_financial_news(api_credentials['news_api_key'], SEARCH_KEYWORD)
        reddit_df = fetch_reddit_discussions_advanced(api_credentials, SEARCH_KEYWORD, SUBREDDITS_TO_SEARCH, POST_LIMIT_PER_SUBREDDIT, COMMENT_LIMIT_PER_POST)
        
        # Combine the dataframes
        all_content_df = pd.concat([news_df, reddit_df], ignore_index=True)
            
    if not all_content_df.empty:
        print(f"\\n--- Total of {len(all_content_df)} texts gathered for analysis ---")
        
        # Step 2: Analyze the combined data
        analyzed_df = analyze_sentiment(all_content_df)
        
        if analyzed_df is not None:
            # Step 3: Summarize the results
            llm_briefing = summarize_sentiment_data(analyzed_df, SEARCH_KEYWORD)
            if llm_briefing:
                print("\\n" + "="*25 + " FINAL BRIEFING FOR LLM " + "="*25)
                print(llm_briefing)
                print("="*75)
    else:
        print("\\nExecution halted: No data could be fetched from any source.")

▶ Attempting to load API credentials from Kaggle Secrets...
✔ All API credentials loaded successfully.
\nFetching news articles for 'NVIDIA'...
✔ 100 news articles fetched successfully.
\nFetching Reddit posts and comments for 'NVIDIA'...
✔ 480 total texts fetched from Reddit.
\n--- Total of 580 texts gathered for analysis ---
\n▶ Initializing FinBERT sentiment analysis pipeline...


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


✔ FinBERT pipeline ready.
▶ Analyzing sentiment for 580 texts...
✔ Sentiment analysis complete.
\n▶ Synthesizing sentiment data into a final report...
✔ Data-driven summary for the LLM has been generated.

### Financial Sentiment Analysis Report: NVIDIA ###
**1. Overall Sentiment Distribution:**
- Positive: 10.52%
- Negative: 12.76%
- Neutral: 76.72%
**2. Key Insights:**
- The dominant sentiment in recent online discussions is **neutral**.
- The average model confidence score was **0.8114**.
**3. Representative Snippets:**
- **Example of Positive Sentiment:** "NVIDIA Corporation (NASDAQ:NVDA) is among the best stocks to buy now according to AI. J.W. Cole Advisors Inc. has increased its stake in NVIDIA Corpor..."
- **Example of Negative Sentiment:** "MRVL also down -12% via earnings in afterhours..."
This summary is based on 580 texts from news and Reddit.



---
# 4. FIXING: "ImportError: Using bitsandbytes 4-bit quantization requires the latest version of bitsandbytes: pip install -U bitsandbytes"

In [5]:
%pip show bitsandbytes
%pip install -U bitsandbytes
%pip show bitsandbytes
%pip show transformers

Name: bitsandbytes
Version: 0.47.0
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/bitsandbytes-foundation/bitsandbytes
Author: 
Author-email: Tim Dettmers <dettmers@cs.washington.edu>
License: MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHA

---
# 5. LLM CRITIQUE GENERATION (with Professional Persona)

In [6]:
# This script loads Meta-Llama 3, feeds it the synthesized data,
# and generates a human-readable financial critique using a detailed persona.
#

!pip install --upgrade transformers accelerate bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from kaggle_secrets import UserSecretsClient

# This is the summary text generated from the previous cell.
# For the code to run, this variable must contain that summary string.
llm_briefing = """
### Financial Sentiment Analysis Report: NVIDIA ###
**1. Overall Sentiment Distribution:**
- Positive: 10.23%
- Negative: 12.70%
- Neutral: 77.07%
**2. Key Insights:**
- The dominant sentiment in recent online discussions is **neutral**.
- The average model confidence score was **0.8113**.
**3. Representative Snippets:**
- **Example of Positive Sentiment:** "NVIDIA Corporation (NASDAQ:NVDA) is among the best stocks to buy now according to AI. J.W. Cole Advisors Inc. has increased its stake in NVIDIA Corpor..."
- **Example of Negative Sentiment:** "MRVL also down -12% via earnings in afterhours..."
This summary is based on 567 texts from news and Reddit.
"""

# --- 2. AGENT 3: THE SCRIBE (Llama 3) ---

def load_llm(model_id="meta-llama/Meta-Llama-3-8B-Instruct"):
    """
    Loads the specified Large Language Model with 4-bit quantization.
    """
    print(f"\\n▶ Loading the Large Language Model: {model_id}...")
    print("This may take a few minutes...")
    
    try:
        user_secrets = UserSecretsClient()
        hf_token = user_secrets.get_secret("HUGGING_FACE_HUB_TOKEN")
    except:
        print("WARNING: 'HUGGING_FACE_HUB_TOKEN' not found. Model download may fail.")
        hf_token = None

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,
        quantization_config=quantization_config,
        device_map="auto",
    )
    
    print("✔ LLM loaded successfully!")
    return model, tokenizer

def generate_critique(model, tokenizer, summary_report):
    """
    Generates a financial critique using the LLM based on the provided summary.
    """
    print("▶ Preparing prompt for the Scribe Agent (Llama 3)...")
    
    # UPDATED: Using the user's advanced system prompt for a professional persona
    messages = [
        {
            "role": "system",
            "content": "You are a senior financial analyst with 15+ years of experience in equity research and market sentiment analysis at a top-tier investment firm. Your task is to produce a concise, data-driven, and strictly objective analysis of a company's recent public sentiment based exclusively on the quantitative metrics and qualitative examples provided. Adhere to the following structured format and guidelines:\n\n**Analysis Structure:**\n1. **Executive Headline** (10-15 words): Craft a precise, actionable headline that captures the sentiment trend and its potential market impact.\n2. **Quantitative Overview** (100-150 words): Interpret the sentiment distribution with specific attention to the exact percentage breakdowns.\n3. **Qualitative Deep-Dive** (100-150 words): Analyze the specific examples provided, focusing on key themes in positive and negative sentiment.\n\n**Critical Constraints:**\n- Base all statements solely on the provided data—no external information or speculation.\n- Maintain strict neutrality—avoid promotional or alarmist language.\n- Use precise financial terminology.",
        },
        {"role": "user", "content": summary_report},
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    print("▶ Generating final critique...")
    
    outputs = model.generate(**inputs, max_new_tokens=768, do_sample=True, temperature=0.6, top_p=0.9, eos_token_id=tokenizer.eos_token_id)
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    
    return response_text

# --- 3. EXECUTION ---
if __name__ == '__main__':
    if 'llm_briefing' in locals() and llm_briefing:
        # Load the LLM and tokenizer
        llm_model, llm_tokenizer = load_llm()
        
        # Generate the final report using the new persona
        final_report = generate_critique(llm_model, llm_tokenizer, llm_briefing)
        
        print("\\n" + "="*25 + " FINAL AI-GENERATED REPORT " + "="*25)
        print(final_report)
        print("="*75)
    else:
        # This is a fallback in case the briefing wasn't generated
        print("ERROR: 'llm_briefing' variable not")

\n▶ Loading the Large Language Model: meta-llama/Meta-Llama-3-8B-Instruct...
This may take a few minutes...


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


✔ LLM loaded successfully!
▶ Preparing prompt for the Scribe Agent (Llama 3)...
▶ Generating final critique...
**Executive Headline:** NVIDIA's Neutral Sentiment Dominates Online Discussions, Amidst Mixed Market Reaction

**Quantitative Overview:** The recent sentiment analysis of NVIDIA Corporation (NASDAQ:NVDA) reveals a predominantly neutral sentiment distribution, with 77.07% of online discussions falling under this category. The remaining 10.23% and 12.70% are comprised of positive and negative sentiments, respectively. The average model confidence score of 0.8113 suggests a moderate level of accuracy in the sentiment analysis.

**Qualitative Deep-Dive:** The neutral sentiment is largely driven by the mixed market reaction to NVIDIA's recent performance. The positive sentiment, exemplified by the statement "NVIDIA Corporation (NASDAQ:NVDA) is among the best stocks to buy now according to AI," highlights the company's strong fundamentals and growth prospects. However, the negative 