# Financial News Sentiment Analysis Pipeline
# Using Fine-tuned Llama 3-8B (100% Agreement Dataset)

This notebook analyzes financial news sentiment using our custom fine-tuned Llama 3-8B model trained on the FinancialPhraseBank dataset with 100% annotator agreement for maximum data quality.

In [12]:
# Financial News Sentiment Analysis Pipeline
# Using Fine-tuned Llama 3-8B for Stock News Sentiment Prediction

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import time
import re
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc
import os

warnings.filterwarnings('ignore')

print("Financial News Sentiment Analysis Pipeline")
print("Using Fine-tuned Llama 3-8B Model (100% Agreement Dataset)")
print("Model: models/trained-llama3-sentences_allagree")
print("=" * 60)

Financial News Sentiment Analysis Pipeline
Using Fine-tuned Llama 3-8B Model (100% Agreement Dataset)
Model: models/trained-llama3-sentences_allagree


In [13]:
# 1. Load and Explore the Dataset
print("Loading historical news data...")

# Load the dataset
df = pd.read_csv('historical_news_monthly_final.csv')

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Unique tickers: {df['ticker'].nunique()}")
print(f"Total news articles: {len(df)}")

# Display basic info
print("\nDataset Info:")
print(df.info())

print("\nSample of the data:")
print(df.head())

Loading historical news data...
Dataset shape: (163777, 11)
Date range: 1/1/2022 to 9/9/2024
Unique tickers: 20
Total news articles: 163777

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163777 entries, 0 to 163776
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   ticker                  163777 non-null  object
 1   date                    163777 non-null  object
 2   title                   163777 non-null  object
 3   description             157954 non-null  object
 4   author                  162462 non-null  object
 5   published_utc           163777 non-null  object
 6   article_url             163777 non-null  object
 7   publisher_name          163777 non-null  object
 8   publisher_homepage_url  163777 non-null  object
 9   tickers                 163777 non-null  object
 10  keywords                92721 non-null   object
dtypes: object(11)
memory usage: 13.7+ MB
Non

In [14]:
# 2. Data Cleaning and Filtering
print("Starting data cleaning process...")

# Create a copy for processing
df_clean = df.copy()

# Convert date to datetime
df_clean['date'] = pd.to_datetime(df_clean['date'])

# Check for missing values in key columns
print("Missing values:")
print(df_clean[['ticker', 'date', 'title', 'description']].isnull().sum())

# Remove rows with missing descriptions (since we'll analyze descriptions)
initial_count = len(df_clean)
df_clean = df_clean.dropna(subset=['description'])
print(f"Removed {initial_count - len(df_clean)} rows with missing descriptions")

# Clean and prepare text data
def clean_text(text):
    """Clean text by removing extra whitespace and formatting"""
    if pd.isna(text):
        return ""
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', str(text))
    # Remove special characters that might cause issues
    text = re.sub(r'[^\w\s\.\,\!\?\:\;\-\(\)]', ' ', text)
    return text.strip()

# Apply text cleaning
df_clean['description_clean'] = df_clean['description'].apply(clean_text)
df_clean['title_clean'] = df_clean['title'].apply(clean_text)

print(f"Cleaned dataset shape: {df_clean.shape}")
print(f"Date range: {df_clean['date'].min().date()} to {df_clean['date'].max().date()}")

Starting data cleaning process...
Missing values:
ticker            0
date              0
title             0
description    5823
dtype: int64
Removed 5823 rows with missing descriptions
Cleaned dataset shape: (157954, 13)
Date range: 2020-06-10 to 2025-06-20


In [15]:
# 3. Filter Out Meaningless News (Questions, Style Boxes, etc.)
print("Filtering out meaningless news...")

def is_meaningful_news(title, description):
    """
    Determine if a news article is meaningful for sentiment analysis.
    Returns False for questions, style box reports, and other non-informative content.
    """
    text_to_check = f"{title} {description}".lower()
    
    # Patterns that indicate meaningless news
    meaningless_patterns = [
        # Questions
        r'\?',  # Contains question marks
        r'is it too late to',
        r'should you buy',
        r'what\'s next for',
        r'when will',
        r'how to',
        r'why',
        r'where to',
        
        # Style box and generic reports
        r'etf',
        r'weekly roundup',
        r'daily roundup',
        r'monthly update',
        r'quarterly review',
        
        # Generic/promotional content
        r'here are \d+ stocks',
        r'\d+ stocks to buy',
        r'\d+ dividend stocks',
        r'top \d+ stocks',
    ]
    
    # Check for meaningless patterns
    for pattern in meaningless_patterns:
        if re.search(pattern, text_to_check, re.IGNORECASE):
            return False
    
    # Check if description is too short (likely incomplete)
    if len(description.split()) < 10:
        return False
    
    # Check if it's mostly promotional/generic
    promotional_words = ['click for', 'see our', 'subscribe', 'sign up', 'join now']
    if any(word in text_to_check for word in promotional_words):
        return False
    
    return True

# Apply the filter
df_clean['is_meaningful'] = df_clean.apply(
    lambda row: is_meaningful_news(row['title_clean'], row['description_clean']), 
    axis=1
)

# Show filtering results
meaningful_count = df_clean['is_meaningful'].sum()
filtered_out = len(df_clean) - meaningful_count

print(f"Filtering Results:")
print(f"  • Total articles: {len(df_clean):,}")
print(f"  • Meaningful articles: {meaningful_count:,}")
print(f"  • Filtered out: {filtered_out:,}")
print(f"  • Retention rate: {meaningful_count/len(df_clean)*100:.1f}%")

# Show examples of filtered out articles
print(f"\nExamples of filtered out articles:")
filtered_examples = df_clean[~df_clean['is_meaningful']]['title_clean'].head(10)
for i, title in enumerate(filtered_examples, 1):
    print(f"  {i}. {title}")

# Keep only meaningful articles
df_meaningful = df_clean[df_clean['is_meaningful']].copy()
print(f"\nDataset after meaning filter: {len(df_meaningful):,} articles")

Filtering out meaningless news...
Filtering Results:
  • Total articles: 157,954
  • Meaningful articles: 89,772
  • Filtered out: 68,182
  • Retention rate: 56.8%

Examples of filtered out articles:
  1. Should you buy Apple stock as it approaches the  3T market cap?
  2. Why This Analyst Thinks Tesla Is a  2,500 Stock And When It May Reach That Price Target
  3. What will streaming services bring in 2022? Here are 5 trends to watch
  4. Big Tech heads for  a year of thousands of tiny tech papercuts,  but what antitrust efforts could make them bleed?
  5. Amazon has mostly avoided antitrust scrutiny, but that may change in 2022
  6. 2 Stocks That Could Thrive With Rising Inflation
  7. How Should Investors Analyze a Stock s All-Time High?
  8. Should Apple Buy Peloton?
  9. Investing in This ETF Right Now Could Make You a Millionaire Retiree
  10. Down 28  in 2022, Is Apple Stock a Buy for 2023?

Dataset after meaning filter: 89,772 articles


In [16]:
# 4. Filter Out Multi-Ticker News (More than 3 stock symbols)
print("\n" + "="*60)
print("FILTERING OUT MULTI-TICKER NEWS")
print("="*60)

def count_tickers_in_string(ticker_string):
    """
    Count the number of stock symbols in the tickers column.
    Handles various formats like 'AAPL,MSFT,GOOGL' or 'AAPL MSFT GOOGL'
    """
    if pd.isna(ticker_string) or ticker_string == '':
        return 0
    
    # Handle different separators (comma, space, semicolon)
    # Clean and split the string
    cleaned = str(ticker_string).replace(',', ' ').replace(';', ' ').replace('|', ' ')
    tickers = [t.strip() for t in cleaned.split() if t.strip()]
    
    # Filter out obviously invalid tickers (too long, contains numbers/symbols)
    valid_tickers = []
    for ticker in tickers:
        # Basic validation: 1-5 characters, mostly letters
        if 1 <= len(ticker) <= 5 and ticker.isalpha():
            valid_tickers.append(ticker)
    
    return len(valid_tickers)

# Apply ticker counting
df_meaningful['ticker_count'] = df_meaningful['tickers'].apply(count_tickers_in_string)

# Analyze ticker distribution
print("Distribution of ticker counts:")
ticker_distribution = df_meaningful['ticker_count'].value_counts().sort_index()
for count, articles in ticker_distribution.items():
    percentage = (articles / len(df_meaningful)) * 100
    print(f"  {count} tickers: {articles:,} articles ({percentage:.1f}%)")

# Filter out multi-ticker news (keep only articles with 1-3 tickers)
initial_count = len(df_meaningful)
df_filtered = df_meaningful[df_meaningful['ticker_count'] <= 3].copy()
filtered_out_count = initial_count - len(df_filtered)

print(f"\nMULTI-TICKER FILTERING RESULTS:")
print(f"  • Articles before filtering: {initial_count:,}")
print(f"  • Articles with >3 tickers (filtered out): {filtered_out_count:,}")
print(f"  • Articles remaining (1-3 tickers): {len(df_filtered):,}")
print(f"  • Retention rate: {len(df_filtered)/initial_count*100:.1f}%")

print(f"\nFinal dataset for sentiment analysis: {len(df_filtered):,} articles")


FILTERING OUT MULTI-TICKER NEWS
Distribution of ticker counts:
  0 tickers: 370 articles (0.4%)
  1 tickers: 12,969 articles (14.4%)
  2 tickers: 10,147 articles (11.3%)
  3 tickers: 12,073 articles (13.4%)
  4 tickers: 16,788 articles (18.7%)
  5 tickers: 9,239 articles (10.3%)
  6 tickers: 5,005 articles (5.6%)
  7 tickers: 3,755 articles (4.2%)
  8 tickers: 3,180 articles (3.5%)
  9 tickers: 2,177 articles (2.4%)
  10 tickers: 1,874 articles (2.1%)
  11 tickers: 1,334 articles (1.5%)
  12 tickers: 1,129 articles (1.3%)
  13 tickers: 909 articles (1.0%)
  14 tickers: 774 articles (0.9%)
  15 tickers: 837 articles (0.9%)
  16 tickers: 581 articles (0.6%)
  17 tickers: 663 articles (0.7%)
  18 tickers: 515 articles (0.6%)
  19 tickers: 566 articles (0.6%)
  20 tickers: 487 articles (0.5%)
  21 tickers: 395 articles (0.4%)
  22 tickers: 366 articles (0.4%)
  23 tickers: 348 articles (0.4%)
  24 tickers: 199 articles (0.2%)
  25 tickers: 226 articles (0.3%)
  26 tickers: 223 articles (0

In [17]:
# 5. Load Fine-tuned Llama 3-8B Model
print("Loading fine-tuned Llama 3-8B model...")

# Model configuration
base_model_name = "meta-llama/Meta-Llama-3-8B"
model_path = "../models/trained-llama3-sentences_allagree"

# Check if model exists
if not os.path.exists(model_path):
    print(f"Error: Model not found at {model_path}")
    print("Please ensure you have trained the model using fine-tune-llama3-8b-comparison.ipynb")
    raise FileNotFoundError(f"Model not found: {model_path}")

print(f"Loading model from: {model_path}")

# Configure quantization for memory efficiency
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load base model
print("Loading base Llama 3-8B model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=bnb_config,
    token=True
)

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load fine-tuned weights
print("Loading fine-tuned weights...")
model = PeftModel.from_pretrained(base_model, model_path)

print("Fine-tuned Llama 3-8B model loaded successfully!")
print(f"Base model: {base_model_name}")
print(f"Fine-tuned model: {model_path}")
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

# Test the model with a sample text
test_text = "The company reported strong quarterly earnings beating expectations significantly"
test_prompt = f"""Analyze the sentiment of the news headline enclosed in square brackets, 
determine if it is positive, neutral, or negative, and return the answer as 
the corresponding sentiment label "positive" or "neutral" or "negative"

[{test_text}] = """.strip()

print(f"\nTesting model with sample text...")
input_ids = tokenizer(test_prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model.generate(**input_ids, max_new_tokens=3, temperature=0.0, do_sample=False)
result = tokenizer.decode(outputs[0])
answer = result.split("=")[-1].strip()
print(f"Test prediction: {answer}")

print(f"\nLlama 3-8B Model Information:")
print(f"• Base Model: {base_model_name}")
print(f"• Fine-tuned on: FinancialPhraseBank (100% annotator agreement)")
print(f"• Training approach: LoRA (Low-Rank Adaptation)")
print(f"• Classes: positive, negative, neutral")
print(f"• Input: Financial text with prompt formatting")
print(f"• Output: Direct sentiment label generation")

Loading fine-tuned Llama 3-8B model...
Loading model from: ../models/trained-llama3-sentences_allagree
Loading base Llama 3-8B model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading tokenizer...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading fine-tuned weights...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Fine-tuned Llama 3-8B model loaded successfully!
Base model: meta-llama/Meta-Llama-3-8B
Fine-tuned model: ../models/trained-llama3-sentences_allagree
Device: cuda

Testing model with sample text...
Test prediction: positive

[

Llama 3-8B Model Information:
• Base Model: meta-llama/Meta-Llama-3-8B
• Fine-tuned on: FinancialPhraseBank (100% annotator agreement)
• Training approach: LoRA (Low-Rank Adaptation)
• Classes: positive, negative, neutral
• Input: Financial text with prompt formatting
• Output: Direct sentiment label generation


In [18]:
# 6. Define Sentiment Prediction Functions
print("Setting up sentiment prediction functions...")

def predict_sentiment_llama(text):
    """
    Predict sentiment of financial text using fine-tuned Llama 3-8B.
    Returns: sentiment label ('positive', 'negative', 'neutral')
    """
    try:
        # Create the prompt in the same format as training
        prompt = f"""Analyze the sentiment of the news headline enclosed in square brackets, 
        determine if it is positive, neutral, or negative, and return the answer as 
        the corresponding sentiment label "positive" or "neutral" or "negative"

        [{text[:500]}] = """.strip()  # Truncate to avoid token limits
        
        # Tokenize and generate
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to("cuda")
        
        with torch.no_grad():
            outputs = model.generate(
                **input_ids,
                max_new_tokens=1,
                temperature=0.0,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )
        
        # Decode the result
        result = tokenizer.decode(outputs[0])
        answer = result.split("=")[-1].lower().strip()
        
        # Extract sentiment from response
        if "positive" in answer:
            return "positive"
        elif "negative" in answer:
            return "negative"
        elif "neutral" in answer:
            return "neutral"
        else:
            # If unclear response, try to extract from the full response
            full_response = result.lower()
            if "positive" in full_response:
                return "positive"
            elif "negative" in full_response:
                return "negative"
            else:
                return "neutral"  # Default fallback
        
    except Exception as e:
        print(f"Error during prediction: {e}")
        return "error"

def sentiment_to_score(sentiment):
    """
    Convert sentiment label to numerical score for easier analysis.
    positive: 1, neutral: 0, negative: -1, unknown/error: 0
    """
    mapping = {
        'positive': 1,
        'neutral': 0,
        'negative': -1,
        'unknown': 0,
        'error': 0
    }
    return mapping.get(sentiment, 0)

def clear_gpu_memory():
    """Clear GPU memory to prevent OOM errors"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

print("Llama 3-8B sentiment prediction functions ready")

Setting up sentiment prediction functions...
Llama 3-8B sentiment prediction functions ready


In [19]:
# 7. Test Llama 3-8B Sentiment Prediction on Sample Data
print("Testing Llama 3-8B sentiment prediction on sample articles...")

# Test on a few examples first
test_samples = [
    "Company reports strong quarterly earnings beating expectations significantly",
    "Stock price falls amid market uncertainty and economic concerns", 
    "Company maintains steady performance this quarter with no major changes",
    "Regulatory concerns impact company operations and future growth prospects",
    "Tech startup secures major funding round from leading investors"
]

print("Sample predictions with fine-tuned Llama 3-8B:")
for i, text in enumerate(test_samples, 1):
    start_time = time.time()
    sentiment = predict_sentiment_llama(text)
    score = sentiment_to_score(sentiment)
    end_time = time.time()
    
    print(f"{i}. Text: {text[:80]}...")
    print(f"   Sentiment: {sentiment} (Score: {score:+d})")
    print(f"   Time: {end_time-start_time:.2f}s")
    print()
    
    # Clear memory after each prediction
    clear_gpu_memory()

# Test on actual news descriptions from our dataset
print("Testing Llama 3-8B on actual news from dataset:")
sample_news = df_meaningful.sample(5, random_state=42)

for idx, row in sample_news.iterrows():
    start_time = time.time()
    sentiment = predict_sentiment_llama(row['description_clean'])
    score = sentiment_to_score(sentiment)
    end_time = time.time()
    
    print(f"Ticker: {row['ticker']} | Date: {row['date'].date()}")
    print(f"Title: {row['title_clean'][:100]}...")
    print(f"Description: {row['description_clean'][:150]}...")
    print(f"Llama 3-8B Sentiment: {sentiment} (Score: {score:+d})")
    print(f"Time: {end_time-start_time:.2f}s")
    print("-" * 80)
    
    # Clear memory after each prediction
    clear_gpu_memory()

Testing Llama 3-8B sentiment prediction on sample articles...
Sample predictions with fine-tuned Llama 3-8B:
1. Text: Company reports strong quarterly earnings beating expectations significantly...
   Sentiment: positive (Score: +1)
   Time: 0.20s

2. Text: Stock price falls amid market uncertainty and economic concerns...
   Sentiment: negative (Score: -1)
   Time: 0.20s

3. Text: Company maintains steady performance this quarter with no major changes...
   Sentiment: neutral (Score: +0)
   Time: 0.21s

4. Text: Regulatory concerns impact company operations and future growth prospects...
   Sentiment: negative (Score: -1)
   Time: 0.20s

5. Text: Tech startup secures major funding round from leading investors...
   Sentiment: positive (Score: +1)
   Time: 0.21s

Testing Llama 3-8B on actual news from dataset:
Ticker: AMZN | Date: 2023-05-16
Title: United Launch Alliance eyes first Vulcan rocket launch later this year...
Description: The new Vulcan rocket will be the successor to ULA s

In [20]:
# 8. Batch Sentiment Prediction with Progress Tracking
print("Starting batch sentiment prediction...")

def batch_predict_sentiments_llama(df, batch_size=25, save_checkpoints=True):
    """
    Predict sentiments for all articles in batches with progress tracking.
    Using smaller batch size for Llama 3-8B due to memory requirements.
    """
    total_articles = len(df)
    results = []
    
    # Create a copy to work with
    df_batch = df.copy().reset_index(drop=True)
    
    print(f"Processing {total_articles:,} articles in batches of {batch_size}")
    print(f"Note: Using smaller batches for memory efficiency with Llama 3-8B")
    
    # Process in batches
    for batch_start in range(0, total_articles, batch_size):
        batch_end = min(batch_start + batch_size, total_articles)
        batch_df = df_batch.iloc[batch_start:batch_end]
        
        print(f"\nProcessing batch {batch_start//batch_size + 1}: articles {batch_start+1}-{batch_end}")
        
        batch_results = []
        batch_start_time = time.time()
        
        # Process each article in the batch
        for idx, row in batch_df.iterrows():
            try:
                # Predict sentiment
                sentiment = predict_sentiment_llama(row['description_clean'])
                score = sentiment_to_score(sentiment)
                
                batch_results.append({
                    'index': idx,
                    'ticker': row['ticker'],
                    'date': row['date'],
                    'sentiment': sentiment,
                    'sentiment_score': score
                })
                
                # Show progress every 5 articles
                if (len(batch_results) % 5 == 0) or (len(batch_results) == len(batch_df)):
                    progress = len(batch_results) / len(batch_df) * 100
                    print(f"  Progress: {len(batch_results)}/{len(batch_df)} ({progress:.1f}%)")
                
                # Clear memory after each prediction
                clear_gpu_memory()
                
            except Exception as e:
                print(f"  Error processing article {idx}: {e}")
                batch_results.append({
                    'index': idx,
                    'ticker': row['ticker'],
                    'date': row['date'],
                    'sentiment': 'error',
                    'sentiment_score': 0
                })
        
        # Add batch results to overall results
        results.extend(batch_results)
        
        batch_time = time.time() - batch_start_time
        avg_time_per_article = batch_time / len(batch_df)
        remaining_articles = total_articles - batch_end
        estimated_remaining_time = remaining_articles * avg_time_per_article
        
        print(f"  Batch completed in {batch_time:.1f}s ({avg_time_per_article:.2f}s per article)")
        print(f"  Estimated remaining time: {estimated_remaining_time/60:.1f} minutes")
        
        # Save checkpoint every few batches
        # if save_checkpoints and (batch_start // batch_size + 1) % 10 == 0:
        #     checkpoint_df = pd.DataFrame(results)
        #     checkpoint_df.to_csv(f'sentiment_checkpoint_llama3_batch_{batch_start//batch_size + 1}.csv', index=False)
        #     print(f"  Checkpoint saved")
        
        # Aggressive memory cleanup between batches
        clear_gpu_memory()
        time.sleep(1)  # Brief pause to help with memory management
    
    return pd.DataFrame(results)

# Start the batch prediction
print("Starting sentiment analysis for all meaningful articles...")
start_time = time.time()

# For testing purposes, you might want to start with a smaller subset
# Uncomment the next line to process only first 100 articles for testing
# df_to_process = df_filtered.head(100)
df_to_process = df_filtered

print(f"Processing {len(df_to_process):,} articles with fine-tuned Llama 3-8B...")
sentiment_results = batch_predict_sentiments_llama(df_to_process, batch_size=1000)

end_time = time.time()
total_time = end_time - start_time

print(f"\nSentiment analysis completed!")
print(f"Total time: {total_time/60:.1f} minutes")
print(f"Average time per article: {total_time/len(df_to_process):.2f} seconds")

Starting batch sentiment prediction...
Starting sentiment analysis for all meaningful articles...
Processing 35,559 articles with fine-tuned Llama 3-8B...
Processing 35,559 articles in batches of 1000
Note: Using smaller batches for memory efficiency with Llama 3-8B

Processing batch 1: articles 1-1000
  Progress: 5/1000 (0.5%)
  Progress: 10/1000 (1.0%)
  Progress: 15/1000 (1.5%)
  Progress: 20/1000 (2.0%)
  Progress: 25/1000 (2.5%)
  Progress: 30/1000 (3.0%)
  Progress: 35/1000 (3.5%)
  Progress: 40/1000 (4.0%)
  Progress: 45/1000 (4.5%)
  Progress: 50/1000 (5.0%)
  Progress: 55/1000 (5.5%)
  Progress: 60/1000 (6.0%)
  Progress: 65/1000 (6.5%)
  Progress: 70/1000 (7.0%)
  Progress: 75/1000 (7.5%)
  Progress: 80/1000 (8.0%)
  Progress: 85/1000 (8.5%)
  Progress: 90/1000 (9.0%)
  Progress: 95/1000 (9.5%)
  Progress: 100/1000 (10.0%)
  Progress: 105/1000 (10.5%)
  Progress: 110/1000 (11.0%)
  Progress: 115/1000 (11.5%)
  Progress: 120/1000 (12.0%)
  Progress: 125/1000 (12.5%)
  Progress

In [21]:
# 10. Save Sentiment Results
print("Saving sentiment results...")

# Reset index for proper merging
df_filtered_reset = df_filtered.reset_index(drop=True)

# Merge sentiment results with original data
df_with_sentiment = df_filtered_reset.copy()
df_with_sentiment['sentiment'] = sentiment_results['sentiment'].values
df_with_sentiment['sentiment_score'] = sentiment_results['sentiment_score'].values

# Display sentiment distribution
print("Sentiment Distribution:")
sentiment_counts = df_with_sentiment['sentiment'].value_counts()
print(sentiment_counts)
print(f"\nSentiment percentages:")
for sentiment, count in sentiment_counts.items():
    percentage = count / len(df_with_sentiment) * 100
    print(f"  {sentiment}: {percentage:.1f}%")

# Save the complete dataset with sentiment predictions
output_file = 'historical_news_with_sentiment_llama3allagree.csv'
df_with_sentiment.to_csv(output_file, index=False)

print(f"\n✅ SENTIMENT PREDICTION COMPLETE!")
print(f"📊 Results saved to: {output_file}")
print(f"📈 Dataset shape: {df_with_sentiment.shape}")
print(f"📅 Date range: {df_with_sentiment['date'].min().date()} to {df_with_sentiment['date'].max().date()}")
print(f"🏢 Unique tickers: {df_with_sentiment['ticker'].nunique()}")

print(f"\n" + "="*60)
print("NEXT STEP: Daily Aggregation & Feature Engineering")
print("="*60)
print(f"✅ Sentiment prediction is now complete!")
print(f"🔄 Next, run the create_stock_price_with_sentiment.ipynb notebook to:")
print(f"   • Aggregate daily sentiment scores")
print(f"   • Engineer features for stock prediction")
print(f"   • Combine sentiment data with stock prices")
print(f"   • Create the final dataset for ML models")
print(f"\n📁 Input file for next notebook: {output_file}")
print("="*60)

Saving sentiment results...
Sentiment Distribution:
sentiment
positive    16479
neutral     11704
negative     7376
Name: count, dtype: int64

Sentiment percentages:
  positive: 46.3%
  neutral: 32.9%
  negative: 20.7%

✅ SENTIMENT PREDICTION COMPLETE!
📊 Results saved to: historical_news_with_sentiment_llama3allagree.csv
📈 Dataset shape: (35559, 17)
📅 Date range: 2020-06-25 to 2025-06-20
🏢 Unique tickers: 20

NEXT STEP: Daily Aggregation & Feature Engineering
✅ Sentiment prediction is now complete!
🔄 Next, run the create_stock_price_with_sentiment.ipynb notebook to:
   • Aggregate daily sentiment scores
   • Engineer features for stock prediction
   • Combine sentiment data with stock prices
   • Create the final dataset for ML models

📁 Input file for next notebook: historical_news_with_sentiment_llama3allagree.csv
