# Data Collection

This notebook collects:
1. **Stock price data** for Microsoft (MSFT), Tesla (TSLA), and Visa (V) using the Yahoo Finance API (via `yfinance`)
2. **Company-specific financial news** using News API
3. **Macro-economic news** affecting the broader market

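The news data covers a 30-day period and is saved to CSV files for sentiment analysis. Stock prices are fetched directly from Yahoo Finance (the last month for the per-ticker summary, the last 3 months for the MSFT chart).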

## Setup and Imports

In [None]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from newsapi import NewsApiClient
from dotenv import load_dotenv
import os

# Read variables from a local .env file into the process environment,
# then build the News API client from the NEWS_API_KEY entry.
load_dotenv()
news_api = NewsApiClient(
    api_key=os.environ.get('NEWS_API_KEY'),
)

## 1. Collect Stock Price Data

Fetching historical stock prices for Microsoft (MSFT), Tesla (TSLA), and Visa (V) using yfinance.

In [None]:
# Define tickers
tickers = ['MSFT', 'TSLA', 'V']  # V is Visa's ticker symbol

print("Fetching stock data for the last 30 days...\n")

# Print a short summary (row count, date span, last close) for each ticker.
for ticker in tickers:
    stock = yf.Ticker(ticker)
    df = stock.history(period='1mo')

    print(f"{ticker}:")
    if df.empty:
        # With no rows, df['Close'].iloc[-1] below would raise IndexError;
        # report the gap and move on to the next ticker instead.
        print("  No price data returned")
        print()
        continue

    print(f"  Trading days: {len(df)}")
    print(f"  Date range: {df.index.min().date()} to {df.index.max().date()}")
    print(f"  Latest close: ${df['Close'].iloc[-1]:.2f}")
    print()

# Visualize MSFT price trend (a longer 3-month window than the summary above)
msft = yf.Ticker('MSFT')
msft_df = msft.history(period='3mo')

plt.figure(figsize=(12, 6))
plt.plot(msft_df.index, msft_df['Close'], linewidth=2, color='#0078D4')
plt.title('Microsoft Stock Price - Last 3 Months', fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Price ($)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("✓ Stock data collection complete")

## 2. Collect Company-Specific News

Collecting financially-relevant news articles for each stock. We filter for business/financial keywords to focus on market-moving news rather than generic mentions.

In [None]:
def get_company_news(company_name, ticker, days=30):
    """
    Fetch financially-relevant news for a specific company

    Queries the module-level ``news_api`` client for English articles from a
    fixed list of business news domains that mention the company (by name or
    ticker) together with at least one financial keyword.

    Parameters:
    - company_name: Full company name
    - ticker: Stock ticker symbol
    - days: Number of days to look back

    Returns: DataFrame with one row per article and the columns
    date, title, description, source, url, company, ticker. The columns are
    present even when the search returns no articles.
    """
    columns = ['date', 'title', 'description', 'source', 'url', 'company', 'ticker']

    # Take a single timestamp so both ends of the window come from the same
    # instant (two separate now() calls could straddle midnight).
    now = datetime.now()

    news = news_api.get_everything(
        # "product launch" is quoted so it is searched as a phrase rather
        # than as two independent words.
        q=f'({company_name} OR {ticker}) AND (earnings OR revenue OR stock OR shares OR CEO OR acquisition OR layoffs OR "product launch" OR quarterly OR profit OR loss OR investor)',
        language='en',
        sort_by='publishedAt',
        from_param=(now - timedelta(days=days)).strftime('%Y-%m-%d'),
        to=now.strftime('%Y-%m-%d'),
        domains='bloomberg.com,reuters.com,cnbc.com,wsj.com,ft.com,forbes.com,marketwatch.com,businessinsider.com'
    )

    # Flatten each article into one row; publishedAt is truncated to its
    # first 10 characters (the YYYY-MM-DD part of the timestamp string).
    articles = [
        {
            'date': article['publishedAt'][:10],
            'title': article['title'],
            'description': article['description'],
            'source': article['source']['name'],
            'url': article['url'],
            'company': company_name,
            'ticker': ticker,
        }
        for article in news['articles']
    ]

    # Passing columns explicitly keeps the schema stable for empty results,
    # so callers can still access df['date'] without a KeyError.
    return pd.DataFrame(articles, columns=columns)

In [None]:
# Collect news for all 3 stocks
print("Collecting company-specific news (30 days)...\n")

# (company name, ticker) pairs passed to get_company_news
companies = [
    ('Microsoft', 'MSFT'),
    ('Tesla', 'TSLA'),
    ('Visa', 'V')
]

all_company_news = []

for company, ticker in companies:
    print(f"Fetching {company} news...")
    df = get_company_news(company, ticker, days=30)
    all_company_news.append(df)
    print(f"  Found {len(df)} articles")

# Combine all company news
company_news_df = pd.concat(all_company_news, ignore_index=True)

print(f"\n✓ Total company-specific articles: {len(company_news_df)}")
if company_news_df.empty:
    # With zero articles the frame may have no 'date' column at all, so the
    # min/max lookup would raise KeyError; report the gap instead.
    print("✓ Date range: n/a (no articles returned)")
else:
    print(f"✓ Date range: {company_news_df['date'].min()} to {company_news_df['date'].max()}")

# Save to CSV (create the output folder first so to_csv cannot fail on a
# missing directory)
os.makedirs('../data/raw', exist_ok=True)
company_news_df.to_csv('../data/raw/company_news.csv', index=False)
print("✓ Saved to data/raw/company_news.csv")

company_news_df.head(10)

## 3. Collect Macro-Economic News

Collecting broader market news (interest rates, tariffs, economic data) that affects all stocks.

In [None]:
print("Collecting macro-economic news (30 days)...\n")

# Take a single timestamp so both ends of the 30-day window come from the
# same instant.
macro_now = datetime.now()

macro_news = news_api.get_everything(
    # Multi-word topics are quoted so each is searched as a phrase rather
    # than as separate words scattered across the OR chain.
    q='("stock market" OR "interest rates" OR "Federal Reserve" OR unemployment OR inflation OR recession OR "economic data" OR "jobs report" OR GDP OR "trade war" OR tariffs)',
    language='en',
    sort_by='publishedAt',
    from_param=(macro_now - timedelta(days=30)).strftime('%Y-%m-%d'),
    to=macro_now.strftime('%Y-%m-%d'),
    domains='bloomberg.com,reuters.com,cnbc.com,wsj.com,ft.com,marketwatch.com,economist.com'
)

# Convert to DataFrame: one row per article, with publishedAt truncated to
# its first 10 characters (the YYYY-MM-DD part of the timestamp string).
macro_articles = [
    {
        'date': article['publishedAt'][:10],
        'title': article['title'],
        'description': article['description'],
        'source': article['source']['name'],
        'url': article['url'],
        'type': 'macro',
    }
    for article in macro_news['articles']
]

# Explicit columns keep the schema (and the CSV header) stable even when the
# search returns nothing.
macro_news_df = pd.DataFrame(
    macro_articles,
    columns=['date', 'title', 'description', 'source', 'url', 'type'],
)

print(f"✓ Found {len(macro_news_df)} macro news articles")
if macro_news_df.empty:
    print("✓ Date range: n/a (no articles returned)")
else:
    print(f"✓ Date range: {macro_news_df['date'].min()} to {macro_news_df['date'].max()}")

# Save (create the output folder first so to_csv cannot fail on a missing
# directory)
os.makedirs('../data/raw', exist_ok=True)
macro_news_df.to_csv('../data/raw/macro_news.csv', index=False)
print("✓ Saved to data/raw/macro_news.csv")

macro_news_df.head(10)

## Summary

Data collection complete! We now have:
- Stock price data for MSFT, TSLA, and V (Visa)
- Company-specific financial news articles
- Macro-economic news articles

The news data is saved to `../data/raw/` (`company_news.csv` and `macro_news.csv`) for sentiment analysis in the next notebook.