In [None]:
import requests
import json
from datetime import datetime, timedelta
import pandas as pd
import time
import re
import os

class EnhancedFinnhubDataFetcher:
    """Fetch Finnhub news and quotes for a fixed universe of companies.

    News articles are tagged with the tracked companies whose keywords they
    mention, quotes are fetched for every tracked symbol, and both are
    combined into one dataset that can be previewed and written to disk.
    """

    def __init__(self, api_key, request_timeout=10):
        """
        Enhanced Finnhub data fetcher with smart company matching

        Args:
            api_key (str): Your Finnhub API key
            request_timeout (float): Seconds to wait for each HTTP request
                before giving up, so a stalled connection cannot hang the run
        """
        self.api_key = api_key
        self.base_url = "https://finnhub.io/api/v1"
        # The key travels in a header rather than the query string so that it
        # does not end up inside URLs printed in error messages.
        self.headers = {"X-Finnhub-Token": self.api_key}
        self.request_timeout = request_timeout

        # Define your company universe for focused analysis
        self.company_universe = {
            'AAPL': {'name': 'Apple', 'keywords': ['apple', 'iphone', 'ipad', 'mac', 'ios']},
            'TSLA': {'name': 'Tesla', 'keywords': ['tesla', 'elon musk', 'electric vehicle', 'ev', 'model s', 'model 3', 'model y']},
            'MSFT': {'name': 'Microsoft', 'keywords': ['microsoft', 'windows', 'azure', 'office', 'teams', 'xbox']},
            'GOOGL': {'name': 'Google', 'keywords': ['google', 'alphabet', 'youtube', 'android', 'chrome', 'search']},
            'AMZN': {'name': 'Amazon', 'keywords': ['amazon', 'aws', 'prime', 'alexa', 'bezos']},
            'META': {'name': 'Meta', 'keywords': ['meta', 'facebook', 'instagram', 'whatsapp', 'metaverse']},
            'NVDA': {'name': 'NVIDIA', 'keywords': ['nvidia', 'gpu', 'ai chip', 'graphics card']},
            'NFLX': {'name': 'Netflix', 'keywords': ['netflix', 'streaming', 'subscriber']},
            'CRM': {'name': 'Salesforce', 'keywords': ['salesforce', 'crm', 'cloud software']},
            'PYPL': {'name': 'PayPal', 'keywords': ['paypal', 'digital payment', 'fintech']}
        }

    def _get_json(self, endpoint, params=None):
        """
        Perform an authenticated GET against the API and decode the JSON body

        Args:
            endpoint (str): Path below ``base_url``, e.g. ``"news"``
            params (dict | None): Query-string parameters

        Returns:
            The decoded JSON payload

        Raises:
            requests.exceptions.RequestException: On network errors, timeouts
                or non-2xx responses
            ValueError: If the response body is not valid JSON
        """
        response = requests.get(
            f"{self.base_url}/{endpoint}",
            params=params,
            headers=self.headers,
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        return response.json()

    def fetch_market_news(self, category="general", limit=100):
        """
        Fetch market news headlines with enhanced metadata

        Args:
            category (str): News category
            limit (int): Maximum number of raw articles to consider; the
                cut-off is applied before filtering by company relevance

        Returns:
            list: Enhanced news articles that mention at least one tracked
            company; an empty list if the request fails
        """
        try:
            raw_news = self._get_json("news", {"category": category})
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"❌ Error fetching news: {e}")
            return []

        # Anything other than a list of articles cannot be processed below.
        if not isinstance(raw_news, list):
            print(f"❌ Error fetching news: unexpected response type {type(raw_news).__name__}")
            return []

        raw_news = raw_news[:limit]

        # Enhance news with company matching and metadata
        enhanced_news = []
        for article in raw_news:
            enhanced_article = self._enhance_news_article(article)
            if enhanced_article:  # Only keep articles related to our universe
                enhanced_news.append(enhanced_article)

        print(f"✅ Fetched {len(raw_news)} total articles, {len(enhanced_news)} relevant to tracked companies")
        return enhanced_news

    def _enhance_news_article(self, article):
        """
        Enhance news article with company matching and sentiment preparation

        Args:
            article (dict): Raw article as returned by the news endpoint

        Returns:
            dict | None: The enriched article, or None when no tracked
            company is mentioned in its headline or summary
        """
        # ``or ''`` also covers keys that are present but null.
        headline = article.get('headline') or ''
        summary = article.get('summary') or ''
        content = f"{headline} {summary}".lower()

        # Find matching companies
        matched_companies = [
            symbol
            for symbol, company_data in self.company_universe.items()
            if self._matches_company(content, company_data['keywords'])
        ]

        # Only return articles that match our company universe
        if not matched_companies:
            return None

        # Resolve the publication time once so that 'datetime' and
        # 'formatted_date' always describe the same instant.
        published = article.get('datetime')
        if published is None:
            published = int(time.time())

        return {
            'id': article.get('id', f"news_{datetime.now().timestamp()}"),
            'headline': headline,
            'summary': summary,
            'source': article.get('source', 'Unknown'),
            'url': article.get('url', ''),
            'datetime': published,
            'formatted_date': datetime.fromtimestamp(published).strftime('%Y-%m-%d %H:%M:%S'),
            'matched_companies': matched_companies,
            'company_count': len(matched_companies),
            'category': article.get('category', 'general'),
            'sentiment_score': None,  # To be filled by sentiment analysis
            'sentiment_label': None,  # To be filled by sentiment analysis
            'content_length': len(headline + summary),
            'relevance_score': len(matched_companies) / len(self.company_universe)  # How many companies this affects
        }

    def _matches_company(self, content, keywords):
        """
        Check if content mentions company keywords as whole words

        Plain substring tests produce false positives ('ev' inside
        "chevron", 'meta' inside "metal"), so each keyword must be delimited
        by non-word characters. A trailing plural "s" is still accepted
        ("evs", "iphones").

        Args:
            content (str): Text to search
            keywords (list[str]): Keywords or phrases identifying a company

        Returns:
            bool: True if at least one keyword occurs in the content
        """
        return any(
            re.search(rf"(?<!\w){re.escape(keyword.lower())}s?(?!\w)", content, re.IGNORECASE)
            for keyword in keywords
            if keyword  # an empty keyword would match everything
        )

    def fetch_stock_prices_for_universe(self):
        """
        Fetch stock prices for all companies in our universe

        Returns:
            list: Enhanced stock price data; symbols whose request failed
            are omitted
        """
        stock_data = []

        print(f"📊 Fetching prices for {len(self.company_universe)} companies...")

        for symbol in self.company_universe:
            price_data = self._fetch_enhanced_stock_price(symbol)
            if price_data:
                stock_data.append(price_data)

            # Rate limiting
            time.sleep(0.2)

        return stock_data

    def _fetch_enhanced_stock_price(self, symbol):
        """
        Fetch enhanced stock price data for a single symbol

        Args:
            symbol (str): Ticker symbol to look up

        Returns:
            dict: Quote fields plus derived values, or an empty dict when the
            request fails or the response is not a JSON object
        """
        try:
            raw_data = self._get_json("quote", {"symbol": symbol})
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"❌ Error fetching {symbol}: {e}")
            return {}

        if not isinstance(raw_data, dict):
            print(f"❌ Error fetching {symbol}: unexpected response type {type(raw_data).__name__}")
            return {}

        def field(key):
            # Missing and null fields both collapse to 0 so that the
            # arithmetic and number formatting below cannot raise.
            return raw_data.get(key) or 0

        company_name = self.company_universe.get(symbol, {}).get('name', symbol)
        current_price = field('c')
        change_percent = field('dp')
        now = datetime.now()  # one clock read keeps both date fields consistent

        enhanced_stock = {
            'symbol': symbol,
            'company_name': company_name,
            'current_price': current_price,
            'previous_close': field('pc'),
            'change': field('d'),
            'change_percent': change_percent,
            'high': field('h'),
            'low': field('l'),
            'open': field('o'),
            # NOTE(review): the quote payload may not carry 'v' at all, in
            # which case this is always 0 — confirm against the API docs.
            'volume': field('v'),
            'timestamp': now.isoformat(),
            'fetch_date': now.strftime('%Y-%m-%d'),
            # Placeholder only: price times a flat 1e9, not real market cap.
            'market_cap_estimate': current_price * 1000000000,
            'volatility': abs(change_percent),  # Using absolute percentage change as volatility proxy
            'trading_status': 'active' if current_price > 0 else 'inactive'
        }

        print(f"✅ {symbol} ({company_name}): ${current_price:.2f} ({enhanced_stock['change']:+.2f}, {change_percent:+.2f}%)")
        return enhanced_stock

    def create_matched_dataset(self):
        """
        Create a matched dataset of news and stock prices

        Returns:
            dict: News articles, stock prices, the company universe, counts
            and summary statistics gathered in a single fetch
        """
        print("🔄 Creating matched dataset...")

        # Fetch both news and stock data
        news_data = self.fetch_market_news(limit=50)
        stock_data = self.fetch_stock_prices_for_universe()

        companies_with_news = {
            company
            for article in news_data
            for company in article['matched_companies']
        }
        now = datetime.now()

        # Create matched records
        matched_dataset = {
            'fetch_timestamp': now.isoformat(),
            'fetch_date': now.strftime('%Y-%m-%d'),
            'total_news_articles': len(news_data),
            'total_companies_tracked': len(self.company_universe),
            'companies_with_news': len(companies_with_news),
            'news_articles': news_data,
            'stock_prices': stock_data,
            'company_universe': self.company_universe
        }

        # Generate summary statistics
        matched_dataset['summary'] = self._generate_summary_stats(news_data, stock_data)

        return matched_dataset

    def _generate_summary_stats(self, news_data, stock_data):
        """
        Generate summary statistics for the dashboard

        Args:
            news_data (list): Enhanced articles, each with 'matched_companies'
            stock_data (list): Enhanced quotes, each with 'change_percent'

        Returns:
            dict: Mention counts, mover counts, biggest gainer/loser and the
            average percentage change; neutral values are used for empty input
        """
        # News statistics
        company_mentions = {}
        for article in news_data:
            for company in article['matched_companies']:
                company_mentions[company] = company_mentions.get(company, 0) + 1

        # Stock statistics
        changes = [s.get('change_percent', 0) for s in stock_data]

        def by_change(stock):
            return stock.get('change_percent', 0)

        return {
            'most_mentioned_company': max(company_mentions.items(), key=lambda x: x[1]) if company_mentions else ('None', 0),
            'total_company_mentions': sum(company_mentions.values()),
            'companies_with_news_today': len(company_mentions),
            'positive_movers': sum(1 for c in changes if c > 0),
            'negative_movers': sum(1 for c in changes if c < 0),
            'biggest_gainer': max(stock_data, key=by_change) if stock_data else None,
            'biggest_loser': min(stock_data, key=by_change) if stock_data else None,
            'average_change_percent': sum(changes) / len(changes) if changes else 0
        }

    def save_enhanced_data(self, dataset, base_filename="market_data"):
        """
        Save enhanced dataset in multiple formats for different use cases

        Args:
            dataset (dict): Dataset as produced by create_matched_dataset
            base_filename (str): Prefix of the complete-dataset JSON file

        Returns:
            dict: Written file names keyed by 'complete_dataset', 'news_csv',
            'stocks_csv' and 'summary_report'; a CSV entry is None when there
            was no data to write for it
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        news_csv = None
        stocks_csv = None

        # Complete dataset (JSON) - for API integrations and cloud processing
        json_filename = f"{base_filename}_{timestamp}.json"
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=2, ensure_ascii=False)
        print(f"✅ Complete dataset saved: {json_filename}")

        # News articles (CSV) - for sentiment analysis input
        if dataset['news_articles']:
            news_csv = f"news_articles_{timestamp}.csv"
            pd.DataFrame(dataset['news_articles']).to_csv(news_csv, index=False)
            print(f"✅ News articles saved: {news_csv}")

        # Stock prices (CSV) - for correlation analysis
        if dataset['stock_prices']:
            stocks_csv = f"stock_prices_{timestamp}.csv"
            pd.DataFrame(dataset['stock_prices']).to_csv(stocks_csv, index=False)
            print(f"✅ Stock prices saved: {stocks_csv}")

        # Summary report (JSON) - for dashboard overview
        summary_filename = f"daily_summary_{timestamp}.json"
        summary_data = {
            'date': dataset['fetch_date'],
            'summary': dataset['summary'],
            'company_universe': list(dataset['company_universe'].keys())
        }
        with open(summary_filename, 'w', encoding='utf-8') as f:
            json.dump(summary_data, f, indent=2)
        print(f"✅ Summary report saved: {summary_filename}")

        return {
            'complete_dataset': json_filename,
            'news_csv': news_csv,
            'stocks_csv': stocks_csv,
            'summary_report': summary_filename
        }

    def display_preview(self, dataset):
        """
        Display a preview of the matched dataset

        Prints the market overview, the first three news articles and the
        first five stock quotes to standard output.

        Args:
            dataset (dict): Dataset as produced by create_matched_dataset
        """
        print("\n" + "="*60)
        print("📊 ENHANCED MARKET DATA PREVIEW")
        print("="*60)

        summary = dataset['summary']

        print(f"\n📈 MARKET OVERVIEW ({dataset['fetch_date']})")
        print(f"Total Companies Tracked: {dataset['total_companies_tracked']}")
        print(f"Companies with News Today: {summary['companies_with_news_today']}")
        print(f"Total News Articles: {dataset['total_news_articles']}")
        print(f"Average Market Change: {summary['average_change_percent']:+.2f}%")

        # The sign comes from the format spec: on a day when every stock
        # falls the "gainer" is negative and must not be prefixed with "+".
        gainer = summary['biggest_gainer']
        if gainer:
            print(f"Biggest Gainer: {gainer['symbol']} ({gainer['company_name']}) {gainer['change_percent']:+.2f}%")

        loser = summary['biggest_loser']
        if loser:
            print(f"Biggest Loser: {loser['symbol']} ({loser['company_name']}) {loser['change_percent']:+.2f}%")

        print("\n📰 NEWS HIGHLIGHTS")
        for i, article in enumerate(dataset['news_articles'][:3], 1):
            companies = ', '.join(article['matched_companies'])
            print(f"{i}. {article['headline']}")
            print(f"   Companies: {companies}")
            print(f"   Source: {article['source']} | {article['formatted_date']}")
            print()

        print("\n💹 STOCK PERFORMANCE")
        for stock in dataset['stock_prices'][:5]:
            change = stock['change_percent']
            if change > 0:
                status = "📈"
            elif change < 0:
                status = "📉"
            else:
                status = "➡️"
            print(f"{status} {stock['symbol']}: ${stock['current_price']:.2f} ({change:+.2f}%)")


def main():
    """Run one end-to-end collection: fetch, preview and save the dataset.

    The Finnhub API key is read from the ``FINNHUB_API_KEY`` environment
    variable. Never hard-code the key here: anything committed to source
    control must be treated as leaked.
    """
    placeholder = "YOUR_ACTUAL_API_KEY_HERE"
    API_KEY = os.environ.get("FINNHUB_API_KEY", placeholder).strip()

    # Bail out early with setup instructions when no real key is configured
    if not API_KEY or API_KEY == placeholder:
        print("❌ Please set the FINNHUB_API_KEY environment variable to your actual Finnhub API key!")
        print("1. Go to https://finnhub.io")
        print("2. Sign up for free account")
        print("3. Get your API key from dashboard")
        print("4. Export it, e.g. FINNHUB_API_KEY=your_key, before running this script")
        return

    # Initialize enhanced fetcher
    fetcher = EnhancedFinnhubDataFetcher(API_KEY)

    print("🚀 Starting Enhanced Finnhub Data Collection...")
    print("="*60)

    # Create matched dataset
    dataset = fetcher.create_matched_dataset()

    # Display preview
    fetcher.display_preview(dataset)

    # Save enhanced data
    print("\n💾 Saving enhanced dataset...")
    saved_files = fetcher.save_enhanced_data(dataset)

    print("\n✅ Enhanced data collection completed!")
    print("\n📁 Files created:")
    for file_type, filename in saved_files.items():
        if filename:  # CSV entries are None when there was nothing to write
            print(f"  • {file_type}: {filename}")

    print("\n🎯 Ready for next steps:")
    print("  1. Use news CSV for sentiment analysis with Vertex AI")
    print("  2. Load stock CSV into BigQuery for correlation analysis")
    print("  3. Connect complete dataset to Looker Studio for dashboards")
    print("  4. Set up Apache Airflow to run this daily")


# Run the collection only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

🚀 Starting Enhanced Finnhub Data Collection...
🔄 Creating matched dataset...
✅ Fetched 50 total articles, 23 relevant to tracked companies
📊 Fetching prices for 10 companies...
✅ AAPL (Apple): $207.57 (-1.48, -0.71%)
✅ TSLA (Tesla): $308.27 (-10.77, -3.38%)
✅ MSFT (Microsoft): $533.50 (+20.26, +3.95%)
✅ GOOGL (Google): $191.90 (-4.63, -2.36%)
✅ AMZN (Amazon): $234.11 (+3.92, +1.70%)
✅ META (Meta): $773.44 (+78.23, +11.25%)
✅ NVDA (NVIDIA): $177.87 (-1.40, -0.78%)
✅ NFLX (Netflix): $1159.40 (-24.80, -2.09%)
✅ CRM (Salesforce): $258.33 (-6.48, -2.45%)
✅ PYPL (PayPal): $68.76 (-0.95, -1.36%)

📊 ENHANCED MARKET DATA PREVIEW

📈 MARKET OVERVIEW (2025-08-01)
Total Companies Tracked: 10
Companies with News Today: 5
Total News Articles: 23
Average Market Change: +0.38%
Biggest Gainer: META (Meta) +11.25%
Biggest Loser: TSLA (Tesla) -3.38%

📰 NEWS HIGHLIGHTS
1. Exxon Mobil, Chevron say oil production is booming and they’re rolling in cash
   Companies: TSLA
   Source: MarketWatch | 2025-08-01 11