# Financial News Extraction Pipeline

## Overview
This notebook contains an institutional-grade news fetcher that extracts high-signal financial news for a set of companies over a given period (30 days by default), categorises the articles, and outputs structured JSON.

## Input
- `tickers`: A list of company ticker symbols, e.g., `["TSLA", "NVDA", "AAPL", ...]` (the run below uses 15).
- `days`: Number of days to fetch news for (30 days by default).

## Logic
1. **Financial Intent Classifier**: Categorizes extracted news into targeted buckets (earnings, analyst, management, corporate, regulation, etc.)
2. **Source Credibility Weighting**: Boosts scores for premium sources like Reuters, Bloomberg, WSJ.
3. **Semantic Similarity Filter**: Filters out non-financial noise using local dense embeddings (`all-MiniLM-L6-v2`).
4. **Categorization & Pagination**: Limits news per category to avoid spam and focuses only on high-quality articles.

## Output
A hierarchical JSON document containing:
```json
{
   "TICKER": {
      "news_count": 10,
      "30_day_news": {
         "YYYY-MM-DD": {
             "category_name": [
                  { "summary": "...", "source": "..." }
             ]
         }
      }
   }
}
```
This JSON can be fed directly into an LLM context window to generate analytical summaries (probabilities, bull/bear case) per day or overall.

In [8]:
import feedparser
import requests
import re
import yfinance as yf
from datetime import datetime, timedelta
import time
import ssl
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

# SSL Fix
# SECURITY NOTE(review): this swaps the standard library's default HTTPS
# context factory for the *unverified* one, process-wide, so stdlib HTTPS
# clients created afterwards no longer validate server certificates.
# Presumably a workaround for a local certificate-store problem -- TODO confirm
# it is still required; fixing the CA bundle is the safer option.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

# ============================================================
# UPGRADE 3: SEMANTIC SIMILARITY FILTER
# ============================================================

# Optional semantic filter setup.
# The filter is an optional extra: when the library is missing OR the model
# cannot be loaded (no network, broken cache, ...), it is switched off instead
# of aborting the whole cell. The names below always exist so later code can
# inspect them safely.
FINANCE_REFERENCE = "earnings revenue guidance profit loss merger acquisition regulation lawsuit analyst rating upgrade downgrade CEO CFO quarterly results forecast dividend buyback IPO"
SEMANTIC_MODEL = None
REF_EMBEDDING = None

try:
    from sentence_transformers import SentenceTransformer, util
    SEMANTIC_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    # Embedding of the finance vocabulary every article is compared against.
    REF_EMBEDDING = SEMANTIC_MODEL.encode(FINANCE_REFERENCE)
    SEMANTIC_ENABLED = True
    print("Semantic filter: ENABLED")
except ImportError:
    SEMANTIC_ENABLED = False
    print("Semantic filter: DISABLED (sentence-transformers not installed)")
except Exception as exc:
    # Library present but the model failed to load or encode: degrade the
    # same way as a missing library rather than crashing.
    SEMANTIC_MODEL = None
    REF_EMBEDDING = None
    SEMANTIC_ENABLED = False
    print(f"Semantic filter: DISABLED (model could not be loaded: {exc})")

# ============================================================
# UPGRADE 1: FINANCIAL INTENT CLASSIFIER
# ============================================================

# Keyword lists per news category, consumed by classify_article().
# Keywords are lowercase because the article text is lowercased before
# matching. Category order matters: when two categories collect the same
# number of hits, classify_article() returns the one listed first here.
# Articles classified as "filing" receive a score penalty in score_article().
CATEGORY_RULES = {
    "earnings": ["earnings", "eps", "revenue", "quarter", "results", "guidance", "forecast", "profit", "loss", "beat", "miss"],
    "analyst": ["upgrade", "downgrade", "price target", "rating", "initiated", "reiterate", "analyst"],
    "management": ["ceo", "cfo", "board", "resigns", "appoints", "executive", "leadership"],
    "corporate": ["acquisition", "merger", "buyback", "dividend", "split", "deal", "partnership", "expansion"],
    "filing": ["13f", "stake", "holdings", "llc increases", "llc reduces", "management purchased", "increases position", 
               "reduces position", "institutional", "buys shares", "sells shares", "buys new shares", "new position in"],
    "regulation": ["sec", "lawsuit", "settlement", "investigation", "fine", "penalty", "compliance"],
}

# Categories to KEEP (high signal)
# NOTE(review): not referenced anywhere else in the visible code -- TODO
# confirm whether a filter on this list was intended.
KEEP_CATEGORIES = ["earnings", "analyst", "management", "corporate", "regulation"]

# Categories to DISCARD (low signal, high noise)
# NOTE(review): likewise unreferenced in the visible code.
DISCARD_CATEGORIES = ["filing"]

# Ownership spam patterns (HARD BLOCK)
# Lowercase phrases that is_ownership_spam() searches for as substrings of the
# lowercased article text; one hit is enough.
# The "sells N," entries presumably target headlines quoting a share count
# such as "sells 4,500 shares" -- TODO confirm.
OWNERSHIP_SPAM_PATTERNS = [
    "buys shares", "sells shares", "buys new shares", "increases holdings",
    "reduces holdings", "new position in", "llc buys", "llc sells",
    "advisors buys", "advisors sells", "management increases", "management reduces",
    "grows stock holdings", "raises stock position", "sells 4,", "sells 3,", "sells 2,", "sells 1,"
]

def is_ownership_spam(text):
    """Tell whether *text* reads like an institutional-ownership filing blurb.

    The text is lowercased and scanned for every phrase in
    ``OWNERSHIP_SPAM_PATTERNS``; the first phrase found settles the answer.
    """
    lowered = text.lower()
    for phrase in OWNERSHIP_SPAM_PATTERNS:
        if phrase in lowered:
            return True
    return False

def _category_keyword_regex(keyword):
    """Return the regex source that finds *keyword* as a word, not a fragment."""
    escaped = re.escape(keyword)
    if " " in keyword:
        # Multi-word phrases are specific enough on their own: anchor only the
        # start so "price target" still matches "price targets".
        return r"(?<!\w)" + escaped
    # Single words: whole-word match plus common inflections, so "beat" still
    # matches "beats" while "rating" no longer matches "operating" and "sec"
    # no longer matches "sector".
    return r"(?<!\w)" + escaped + r"(?:s|es|d|ed|ing|ly)?(?!\w)"


def classify_article(text, rules=None):
    """
    Classifies article into financial categories.

    Each category scores one point per keyword found in the lowercased text;
    keywords are matched as words (see ``_category_keyword_regex``) rather
    than raw substrings, which previously produced false hits such as
    "operating" -> "rating" or "steps" -> "eps".

    Args:
        text: Article text (title and/or summary).
        rules: Optional mapping ``category -> list of keywords``. Defaults to
            the module-level ``CATEGORY_RULES``.

    Returns:
        The category with the most keyword hits (ties go to the category
        listed first in the rules), or 'general' when nothing matches.
    """
    rules = CATEGORY_RULES if rules is None else rules
    text_lower = text.lower()
    scores = {}

    for category, keywords in rules.items():
        # The ``re`` module caches compiled patterns, so rebuilding the
        # pattern string per call stays cheap for these small tables.
        hits = sum(
            1
            for kw in keywords
            if re.search(_category_keyword_regex(kw.lower()), text_lower)
        )
        if hits > 0:
            scores[category] = hits

    if not scores:
        return "general"

    return max(scores, key=scores.get)

# ============================================================
# UPGRADE 2: SOURCE CREDIBILITY WEIGHTING
# ============================================================

# Credibility points per source, added to an article's score by
# score_article() through get_source_weight().
# Keys are lowercase fragments looked for in the article link and in the feed
# title; get_source_weight() walks this table in order and the first match
# wins. Sources that match no key get the default weight of 1.
SOURCE_WEIGHT = {
    "reuters": 5,
    "wsj": 5,
    "ft.com": 5,
    "bloomberg": 5,
    "cnbc": 4,
    "seekingalpha": 3,
    "yahoo": 3,
    "benzinga": 2,
    "marketwatch": 2,
    "nasdaq": 2,
    "stocktwits": 0,
    "google": 1,  # Google News aggregates, variable quality
}

def get_source_weight(link, source_name, weights=None):
    """Returns credibility weight based on source.

    Args:
        link: Article URL (may be empty or None).
        source_name: Human-readable source/feed title (may be empty or None).
        weights: Optional mapping ``source key -> weight``. Defaults to the
            module-level ``SOURCE_WEIGHT``. Iteration order decides which key
            wins when several match.

    Returns:
        The weight of the first matching key, or 1 when nothing matches.
    """
    table = SOURCE_WEIGHT if weights is None else weights
    link_lower = (link or "").lower()
    source_lower = (source_name or "").lower()

    for source, weight in table.items():
        if "." in source:
            # Domain-style key (e.g. "ft.com"): it must start a host label,
            # otherwise "ft.com" would also match "microsoft.com".
            pattern = r"(?<![a-z0-9-])" + re.escape(source)
            matched = bool(
                re.search(pattern, link_lower) or re.search(pattern, source_lower)
            )
        else:
            matched = source in link_lower or source in source_lower
        if matched:
            return weight

    return 1  # Default weight

# ============================================================
# UPGRADE 4: STRONGER NOISE BLACKLIST
# ============================================================

# Lowercase terms that mark an article as off-topic. has_noise() flags a text
# only when at least two of these terms are present, so a single incidental
# mention does not block an article.
NOISE_KEYWORDS = [
    # Weather
    "weather", "storm", "hurricane", "flood", "tornado", "freeze", "snow",
    # Violence/Crime
    "shooting", "murder", "crime", "arrest", "police",
    # War/Military
    "war", "military", "invasion", "troops", "missile", "ukraine", "russia",
    # Politics (unless directly business-related)
    "election", "vote", "congress", "senate", "democrat", "republican", "trump", "biden",
    # Sports/Entertainment
    "sports", "game", "celebrity", "movie", "concert", "nfl", "nba",
    # Travel disruptions
    "airline delays", "flight cancel", "airport",
    # Crypto-only noise (unless it's about the company's crypto strategy)
    "bitcoin price", "crypto crash", "meme coin",
]

def _noise_keyword_regex(keyword):
    """Return the regex source that finds *keyword* as a word, not a fragment."""
    escaped = re.escape(keyword)
    if " " in keyword:
        # Phrases such as "flight cancel" are deliberately prefixes
        # ("flight cancellations"), so only their start is anchored.
        return r"(?<!\w)" + escaped
    # Single words: whole word plus common inflections. This keeps "war"
    # from firing on "software"/"hardware"/"award" and "snow" on "Snowflake".
    return r"(?<!\w)" + escaped + r"(?:s|es|d|ed|ing|ly)?(?!\w)"


def has_noise(text, keywords=None):
    """Returns True if article contains noise keywords.

    Args:
        text: Article text (title and/or summary).
        keywords: Optional list of noise terms. Defaults to the module-level
            ``NOISE_KEYWORDS``.

    Returns:
        True when at least two distinct noise terms occur in the text
        (one mention is tolerated), False otherwise.
    """
    keywords = NOISE_KEYWORDS if keywords is None else keywords
    text_lower = text.lower()

    noise_count = 0
    for kw in keywords:
        if re.search(_noise_keyword_regex(kw.lower()), text_lower):
            noise_count += 1
            if noise_count >= 2:  # Allow 1 mention, block 2+
                return True
    return False

# ============================================================
# LAYER 1: SOURCE SEPARATION (Tier A = ticker-specific only)
# ============================================================

def get_rss_feeds(ticker):
    """
    Build the list of RSS feed URLs queried for one ticker.

    The first five URLs embed the ticker symbol directly; the last two are
    shared feeds whose entries are narrowed down to the ticker later in the
    pipeline.
    """
    # Tier A: Ticker-specific (HIGH RELEVANCE)
    ticker_feeds = [
        f"https://news.google.com/rss/search?q={ticker}+stock&hl=en-US&gl=US&ceid=US:en",
        f"https://feeds.finance.yahoo.com/rss/2.0/headline?s={ticker}",
        f"https://www.nasdaq.com/feed/rssoutbound?symbol={ticker}",
        f"https://stocktwits.com/symbol/{ticker}.rss",
        f"https://seekingalpha.com/api/sa/combined/{ticker}.xml",
    ]

    # Tier B: Quality sources (will be filtered by ticker presence)
    shared_feeds = [
        "https://feeds.benzinga.com/benzinga",
        "https://seekingalpha.com/market_currents.xml",
    ]

    return ticker_feeds + shared_feeds

# ============================================================
# COMPANY NAME LOOKUP
# ============================================================

# Per-process memo: ticker -> list of lowercase relevance keywords.
_company_cache = {}

# Corporate boilerplate that appears in many company names. Using these as
# relevance keywords would let almost any business article through.
_GENERIC_NAME_WORDS = frozenset({
    "incorporated", "corp", "corporation", "company", "limited",
    "holdings", "holding", "group", "trust", "class", "common",
    "stock", "shares",
})


def get_company_info(ticker):
    """Returns company name and related keywords for filtering.

    The result always starts with the lowercase ticker. When the company name
    can be looked up, the full lowercase name is added, followed by each
    distinctive word of the name: words are stripped of surrounding
    punctuation (so "Tesla," yields "tesla"), must be longer than three
    characters, and must not be generic corporate boilerplate.

    Args:
        ticker: Ticker symbol, e.g. "TSLA".

    Returns:
        List of lowercase keyword strings. Results are cached per ticker; a
        failed lookup is cached as the ticker alone.
    """
    if ticker in _company_cache:
        return _company_cache[ticker]

    keywords = [ticker.lower()]

    try:
        info = yf.Ticker(ticker).info
        company_name = info.get('shortName', '') or info.get('longName', '')
    except Exception:
        # Best effort: any lookup failure degrades to ticker-only matching.
        company_name = ''

    if company_name:
        keywords.append(company_name.lower())
        for raw_word in company_name.split():
            word = raw_word.strip(".,;:!?()[]\"'&").lower()
            if (
                len(word) > 3
                and word not in _GENERIC_NAME_WORDS
                and word not in keywords
            ):
                keywords.append(word)

    _company_cache[ticker] = keywords
    return keywords

# ============================================================
# HIGH IMPACT KEYWORDS
# ============================================================

# Lowercase terms that signal market-moving news. score_article() adds 2
# points for each term found in the lowercased title + summary, capped at 8.
HIGH_IMPACT_KEYWORDS = [
    "earnings", "revenue", "guidance", "quarter", "profit", "loss",
    "upgrade", "downgrade", "beat", "miss", "forecast", "outlook",
    "acquisition", "merger", "buyback", "dividend", "split",
    "sec", "filing", "lawsuit", "settlement", "investigation",
    "ceo", "cfo", "executive", "board", "analyst"
]

# ============================================================
# RELEVANCE FILTER (HARD FILTER)
# ============================================================

def is_relevant(article, ticker, company_keywords):
    """HARD FILTER: Returns True only if article is company-specific.

    A keyword counts only when it appears as a whole token in the lowercased
    title + summary. Plain substring matching let short tickers through on
    unrelated words ("ko" in "tokyo", "meta" in "metal").

    Args:
        article: Dict with a "title" and an optional "summary".
        ticker: Ticker symbol (unused here; kept for interface stability).
        company_keywords: Keywords identifying the company, e.g. the result
            of ``get_company_info``. Blank entries are ignored.

    Returns:
        True when at least one keyword is present, False otherwise.
    """
    title = article.get("title") or ""
    summary = article.get("summary") or ""
    text = (title + " " + summary).lower()

    for kw in company_keywords:
        needle = (kw or "").strip().lower()
        if not needle:
            # An empty keyword would match every article.
            continue
        # Lookarounds instead of \b so keywords that start or end with
        # punctuation (e.g. "amazon.com,") still anchor correctly.
        if re.search(r"(?<!\w)" + re.escape(needle) + r"(?!\w)", text):
            return True

    return False

# ============================================================
# UPGRADE 3: SEMANTIC SIMILARITY SCORE
# ============================================================

def semantic_score(text):
    """Cosine similarity between *text* and the finance reference embedding.

    Only the first 500 characters of the text are embedded. Returns 1.0 when
    the semantic filter is disabled (so callers are not penalised) and 0.5
    when encoding or comparison fails.
    """
    if SEMANTIC_ENABLED:
        try:
            embedding = SEMANTIC_MODEL.encode(text[:500])  # Limit text length
            return util.cos_sim(embedding, REF_EMBEDDING).item()
        except Exception:
            return 0.5

    return 1.0  # Bypass if not available

# ============================================================
# COMPLETE SCORING FUNCTION
# ============================================================

def score_article(article, ticker, company_keywords):
    """
    Compute the ranking score of one article for one ticker.

    Components, in the order they are applied:
    - ticker presence: +10 in the title, else +5 anywhere in the text
    - any company keyword other than the ticker in the text: +6
    - high-impact keywords: +2 each, capped at +8
    - source credibility weight from get_source_weight()
    - semantic similarity (only when enabled): +5 / +2 / -5 by threshold
    - category bonus or penalty from classify_article()
    - noise penalty: -10

    Side effect: stores the detected category in ``article['_category']`` so
    the quota step can reuse it.
    """
    title = article["title"].lower()
    text = (title + " " + article.get("summary", "")).lower()
    link = article.get("link", "").lower()
    source = article.get("source", "")
    ticker_lc = ticker.lower()

    # Ticker presence: the title is the strongest signal.
    if ticker_lc in title:
        total = 10
    elif ticker_lc in text:
        total = 5
    else:
        total = 0

    # Company name match (awarded at most once).
    if any(kw != ticker_lc and kw in text for kw in company_keywords):
        total += 6

    # High-impact vocabulary, capped.
    impact_hits = len([k for k in HIGH_IMPACT_KEYWORDS if k in text])
    total += min(impact_hits * 2, 8)

    # Source credibility.
    total += get_source_weight(link, source)

    # Semantic similarity to the finance reference text.
    if SEMANTIC_ENABLED:
        similarity = semantic_score(text)
        if similarity > 0.35:
            total += 5
        elif similarity > 0.25:
            total += 2
        elif similarity < 0.15:
            total -= 5  # Penalize non-financial content

    # Category bonus; the category is remembered for the quota system.
    category_adjustment = {
        "earnings": 4,
        "analyst": 4,
        "management": 3,
        "corporate": 3,
        "regulation": 2,
        "filing": -8,  # Heavy penalty for ownership spam
    }
    category = classify_article(text)
    article['_category'] = category
    total += category_adjustment.get(category, 0)

    # Noise penalty.
    if has_noise(text):
        total -= 10

    return total

# ============================================================
# DEDUPLICATION
# ============================================================

def deduplicate_articles(articles):
    """Drop articles whose normalized title repeats an earlier one.

    The dedup key is the lowercased title with every run of non-word
    characters removed, truncated to its first 50 characters. The first
    article seen for a key is kept; input order is preserved.
    """
    kept = []
    seen_keys = set()

    for item in articles:
        key = re.sub(r'\W+', '', item['title'].lower())[:50]
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(item)

    return kept

# ============================================================
# UPGRADE 5: CATEGORY QUOTAS
# ============================================================

def apply_category_quotas(articles, quota_per_category=5, total_limit=50):
    """
    Pick a category-balanced subset of *articles*.

    Articles are grouped by their ``_category`` ('general' when missing).
    Up to ``quota_per_category`` articles are taken from each priority
    category in a fixed order; if that leaves room under ``total_limit``,
    the remaining articles are appended in their input order. The result
    never exceeds ``total_limit`` items.
    """
    by_category = defaultdict(list)
    for item in articles:
        by_category[item.get('_category', 'general')].append(item)

    selected = []

    # Priority order
    for name in ("earnings", "analyst", "corporate", "management", "regulation", "general"):
        if name in by_category:
            selected += by_category[name][:quota_per_category]

    # Fill remaining with any leftover high-scoring articles
    shortfall = total_limit - len(selected)
    if shortfall > 0:
        leftovers = [item for item in articles if item not in selected]
        selected.extend(leftovers[:shortfall])

    return selected[:total_limit]

# ============================================================
# MAIN FETCH FUNCTION (with all 5 upgrades)
# ============================================================

def fetch_news_data(ticker, days=30):
    """
    INSTITUTIONAL-GRADE news fetcher with 5 upgrades:
    1. Financial intent classifier
    2. Source credibility weighting
    3. Semantic similarity filter
    4. Stronger noise blacklist
    5. Category quotas

    Pipeline: fetch all feeds in parallel -> deduplicate by title -> keep only
    company-specific articles -> drop noise and ownership-filing spam ->
    score and sort -> apply category quotas.

    Args:
        ticker: Ticker symbol, e.g. "TSLA".
        days: Look-back window in days; older entries are skipped.

    Returns:
        List of article dicts with keys 'title', 'link', 'published'
        ('%Y-%m-%d %H:%M:%S', local time), 'summary', 'source', plus the
        internal '_score' and '_category'. At most 50 articles.
    """
    import calendar  # function-scope import: only this function needs it

    feeds = get_rss_feeds(ticker)
    cutoff_date = datetime.now() - timedelta(days=days)

    company_keywords = get_company_info(ticker)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    def clean_text(text):
        """Strip HTML tags and two common entities, then collapse whitespace."""
        if not text:
            return ""
        text = re.sub(r'<[^>]+>', '', text)
        text = text.replace("&nbsp;", " ").replace("&amp;", "&")
        return " ".join(text.split())

    def entry_datetime(entry):
        """Return the entry's publish (or update) time as naive local time."""
        for attr in ('published_parsed', 'updated_parsed'):
            parsed = getattr(entry, attr, None)
            if not parsed:
                continue
            try:
                # feedparser normalizes parsed dates to UTC, so the struct
                # must be converted with timegm(); mktime() would read it as
                # local time and shift it by the UTC offset.
                return datetime.fromtimestamp(calendar.timegm(parsed))
            except (OverflowError, ValueError, OSError):
                # One unusable date must not abort the rest of the feed.
                return None
        return None

    def fetch_single_feed(rss_url):
        """Download and parse one feed; returns [] on any failure."""
        articles = []
        try:
            response = requests.get(rss_url, headers=headers, timeout=5)
            if response.status_code != 200:
                return []

            feed = feedparser.parse(response.content)
            source_title = getattr(feed.feed, 'title', 'Unknown')

            for entry in feed.entries:
                published_dt = entry_datetime(entry)
                if not published_dt or published_dt < cutoff_date:
                    continue

                title = clean_text(entry.title if hasattr(entry, 'title') else '')
                link = entry.link if hasattr(entry, 'link') else ''
                summary = clean_text(entry.summary if hasattr(entry, 'summary') else '')

                # Very short summaries carry no information; reuse the title.
                if len(summary) < 20:
                    summary = title

                articles.append({
                    'title': title,
                    'link': link,
                    'published': published_dt.strftime('%Y-%m-%d %H:%M:%S'),
                    'summary': summary,
                    'source': source_title
                })
        except Exception as exc:
            # Best effort: a broken feed is reported and skipped, and whatever
            # was parsed before the failure is still returned.
            print(f"  [warn] {ticker}: skipped feed {rss_url} ({type(exc).__name__}: {exc})")

        return articles

    # Parallel fetch. map() yields results in feed order, which keeps the
    # title-based dedup below deterministic (the first feed wins a duplicate).
    raw_articles = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        for feed_articles in executor.map(fetch_single_feed, feeds):
            raw_articles.extend(feed_articles)

    # Deduplicate
    articles = deduplicate_articles(raw_articles)

    # Hard ticker filter
    articles = [a for a in articles if is_relevant(a, ticker, company_keywords)]

    # Remove noise and apply the ownership-spam hard block
    filtered = []
    for a in articles:
        blob = a["title"] + " " + a.get("summary", "")
        if has_noise(blob) or is_ownership_spam(blob):
            continue
        filtered.append(a)
    articles = filtered

    # Score and rank
    for article in articles:
        article['_score'] = score_article(article, ticker, company_keywords)

    articles.sort(key=lambda x: x['_score'], reverse=True)

    # Apply category quotas
    top_articles = apply_category_quotas(articles, quota_per_category=5, total_limit=50)

    return top_articles

Semantic filter: ENABLED


## Generating the Extracted Output
The target output is a nested dictionary mapping each company to its 30-day news breakdown. We iterate over the requested symbols (the 15 tickers in `COMPANIES`) and collect the data into the requested shape.

In [9]:
# Ticker symbols processed by the extraction run below (15 entries).
COMPANIES = [
    "TSLA", "NVDA", "AAPL", "AMD",  "AMZN", "MSFT", "GOOGL", "META", 
    "BAC", "INTC", "CSCO", "KO", "XOM", 
     "NFLX", "NKE",
]

def extract_daily_categorical_news(tickers, days=30):
    """Fetch news for each ticker and group it by day and category.

    Args:
        tickers: Iterable of ticker symbols.
        days: Look-back window in days, forwarded to ``fetch_news_data``.

    Returns:
        Dict ``ticker -> {"news_count": int, "<days>_day_news": {...}}`` where
        the news mapping is ``date (YYYY-MM-DD) -> category -> list of
        {"summary", "source"}`` dicts. With the default ``days=30`` the key is
        "30_day_news".
    """
    # Name the key after the actual window so a 14-day run is not stored
    # under "30_day_news".
    news_key = f"{days}_day_news"
    all_companies_data = {}

    for ticker in tickers:
        print(f"Fetching data for {ticker}...")
        articles = fetch_news_data(ticker, days=days)

        # Structure: date -> category -> list of articles
        daily_news = defaultdict(lambda: defaultdict(list))

        for a in articles:
            # publish format: YYYY-MM-DD HH:MM:SS
            pub_date = a['published'].split(' ')[0]

            # The category was stored on the article during scoring; fall
            # back to 'general' if it is missing.
            category = a.get('_category', 'general')

            # Store simplified representation for JSON output
            daily_news[pub_date][category].append({
                "summary": a['summary'],
                "source": a['source'],
            })

        all_companies_data[ticker] = {
            "news_count": len(articles),
            # Convert the nested defaultdicts to plain dicts for json.dump.
            news_key: {date: dict(categories) for date, categories in daily_news.items()},
        }

    return all_companies_data

# Runs the extraction for the full `COMPANIES` list. For a quick test run,
# pass a shorter list instead (e.g. the first two tickers).
output_data = extract_daily_categorical_news(COMPANIES, days=30)

# To avoid massive console logs, the result JSON is written to a file rather
# than printed; only a one-line confirmation is printed afterwards.
with open("companies_30_day_categorical_news.json", "w") as f:
    json.dump(output_data, f, indent=4)

print(f"Processed extraction for {len(COMPANIES)} companies. Saved to companies_30_day_categorical_news.json")

Fetching data for TSLA...
Fetching data for NVDA...
Fetching data for AAPL...
Fetching data for AMD...
Fetching data for AMZN...
Fetching data for MSFT...
Fetching data for GOOGL...
Fetching data for META...
Fetching data for BAC...
Fetching data for INTC...
Fetching data for CSCO...
Fetching data for KO...
Fetching data for XOM...
Fetching data for NFLX...
Fetching data for NKE...
Processed extraction for 15 companies. Saved to companies_30_day_categorical_news.json


## Step 2: Text to Numerical Scoring & Ranking
We will now take the extracted JSON output from above and transform each article's summary text into a numerical representation using `all-MiniLM-L6-v2` (SentenceTransformer).
We'll compute a relevance/impact score against predefined high-impact financial phrases, rank the daily news by this numerical score, and store the updated data.

In [11]:
from sentence_transformers import SentenceTransformer, util
import torch
import json

# 1. Load the model
# all-MiniLM-L6-v2 is an excellent balance of speed and performance for semantic similarity.
# NOTE(review): the extraction cell above loads the same model name into
# SEMANTIC_MODEL when sentence-transformers is available; this creates a
# second, independent instance.
print("Loading SentenceTransformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Define Reference Embeddings
# We define what a 'High Impact' article looks like for stock movement.
# Each phrase is embedded separately; compute_semantic_score() later takes the
# best similarity across all of them.
HIGH_IMPACT_PHRASES = [
    "Better than expected earnings revenue beat growth surge",
    "Analyst upgrade price target increase buy rating",
    "Major acquisition merger buyout new partnership",
    "Unexpected CEO resignation leadership change",
    "SEC investigation lawsuit regulatory penalty"
]
reference_embeddings = model.encode(HIGH_IMPACT_PHRASES, convert_to_tensor=True)

def compute_semantic_score(summary):
    """
    Score one summary against the high-impact reference phrases.

    The summary is rendered to text, embedded with the module-level model,
    and compared to every reference embedding by cosine similarity. The
    result is the largest of those similarities, rounded to 4 decimals.
    """
    embedded = model.encode(f"{summary}", convert_to_tensor=True)

    # One similarity per reference phrase; keep only the best match.
    similarities = util.cos_sim(embedded, reference_embeddings)
    best = torch.max(similarities).item()

    return round(best, 4)

# 3. Load the data (or use the variable from the previous cell)
# The JSON file written by the extraction cell is preferred; the in-memory
# `output_data` is used only when that file does not exist.
try:
    with open("companies_30_day_categorical_news.json", "r") as f:
        news_data = json.load(f)
    print("Loaded JSON data for scoring.")
except FileNotFoundError:
    print("Using in-memory output_data for scoring.")
    news_data = output_data  # defined in previous cell

def rank_and_score_news(data):
    """
    Attach a semantic score to every article and rank each day's categories.

    For each ticker, the first key containing "_news" (e.g. "14_day_news" or
    "30_day_news") is taken as the news mapping; tickers without such a key
    are skipped. Every article gets a 'semantic_score' field, and each
    per-day, per-category list is sorted by that score, highest first.

    The input structure is modified in place and also returned.
    """
    print("Computing semantic scores and ranking...")

    for ticker_data in data.values():
        # Locate the news mapping without assuming the look-back window.
        news_key = None
        for key in ticker_data.keys():
            if "_news" in key:
                news_key = key
                break
        if not news_key:
            continue

        for categories in ticker_data[news_key].values():
            for articles in categories.values():
                for article in articles:
                    article['semantic_score'] = compute_semantic_score(article['summary'])

                # Highest-scoring articles first within the category.
                articles.sort(key=lambda item: item['semantic_score'], reverse=True)

    return data

# 4. Apply the scoring and ranking
# rank_and_score_news() updates `news_data` in place and returns it, so
# `ranked_news_data` and `news_data` refer to the same structure.
ranked_news_data = rank_and_score_news(news_data)

# 5. Save the final ranked JSON
with open("companies_ranked_news.json", "w") as f:
    json.dump(ranked_news_data, f, indent=4)

print("Scoring and ranking complete. Saved to companies_ranked_news.json")


Loading SentenceTransformer model...
Loaded JSON data for scoring.
Computing semantic scores and ranking...
Scoring and ranking complete. Saved to companies_ranked_news.json
