In [19]:
# Import required libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
import logging
from tqdm import tqdm

# Configure logging
log_file = f'../reports/data_collection_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

# Create necessary directories
Path('../data/stocks').mkdir(parents=True, exist_ok=True)
Path('../reports').mkdir(parents=True, exist_ok=True)

# Define parameters
START_DATE = '2015-01-01'
END_DATE = '2025-07-21'
INTERVAL = '1d'

# Required columns for validation
REQUIRED_COLUMNS = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']

logging.info(f"Data collection configured for period: {START_DATE} to {END_DATE}")
print(f"Logs will be saved to: {log_file}")


2025-07-26 04:57:35,306 - INFO - Data collection configured for period: 2015-01-01 to 2025-07-21


Logs will be saved to: ../reports/data_collection_20250726_045735.log


In [20]:
# Load sector mapping
try:
    sector_mapping = pd.read_csv('../data/sector_mapping.csv')
    tickers = sector_mapping['Ticker'].unique()
    logging.info(f"Loaded {len(tickers)} tickers from sector mapping")
    
    print("\nSector distribution:")
    sector_counts = sector_mapping['Sector'].value_counts()
    for sector, count in sector_counts.items():
        print(f"- {sector}: {count} stocks")
        
except Exception as e:
    logging.error(f"Error loading sector mapping: {str(e)}")
    raise Exception("Cannot proceed without sector mapping")


2025-07-26 04:57:35,388 - INFO - Loaded 98 tickers from sector mapping



Sector distribution:
- Technology: 50 stocks
- Consumer Discretionary: 12 stocks
- Healthcare: 11 stocks
- Industrials: 7 stocks
- Consumer Staples: 6 stocks
- Communication Services: 6 stocks
- Utilities: 3 stocks
- Energy: 2 stocks
- Technology : 1 stocks


In [21]:
# Dictionary to store individual dataframes
stock_dfs = {}

# Function to fetch and clean data for a single ticker
def fetch_stock_data(ticker):
    """
    Fetch and clean stock data for a given ticker.
    
    Args:
        ticker (str): Stock ticker symbol
        
    Returns:
        pd.DataFrame or None: Cleaned stock data if successful, None if failed
    """
    try:
        # Fetch data
        stock = yf.Ticker(ticker)
        df = stock.history(
            start=START_DATE,
            end=END_DATE,
            interval=INTERVAL
        )
        
        if len(df) == 0:
            logging.warning(f"{ticker}: No data returned from yfinance")
            return None
            
        # Reset index to make Date a column
        df = df.reset_index()
        
        # Add ticker column
        df['Ticker'] = ticker
        
        # Verify required columns
        missing_cols = set(REQUIRED_COLUMNS) - set(df.columns)
        if missing_cols:
            logging.warning(f"{ticker}: Missing columns: {missing_cols}")
            return None
            
        # Clean data
        df = df.dropna(subset=['Close', 'Volume'])  # Remove rows with missing critical data
        
        if len(df) < 100:  # Minimum data requirement
            logging.warning(f"{ticker}: Insufficient data points ({len(df)})")
            return None
            
        return df
        
    except Exception as e:
        logging.error(f"Error processing {ticker}: {str(e)}")
        return None

# Process each ticker
for ticker in tqdm(tickers, desc="Processing stocks"):
    try:
        logging.info(f"Processing {ticker}")
        
        # Check if file already exists
        output_path = f'../data/stocks/{ticker}.csv'
        if Path(output_path).exists():
            # Load existing file
            df = pd.read_csv(output_path)
            stock_dfs[ticker] = df
            logging.info(f"Loaded existing data for {ticker}")
            continue
        
        # Fetch and process data
        df = fetch_stock_data(ticker)
        
        if df is not None:
            # Store in dictionary and save CSV
            stock_dfs[ticker] = df
            df.to_csv(output_path, index=False)
            logging.info(f"Saved {ticker} data with {len(df)} rows")
            print(f"\nSaved {output_path}")
            print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
            print(f"Columns: {', '.join(df.columns)}")
        else:
            logging.warning(f"Failed to process {ticker}")
            
    except Exception as e:
        logging.error(f"Error processing {ticker}: {str(e)}")
        continue

# Print summary
successful = list(stock_dfs.keys())
failed = [t for t in tickers if t not in successful]

print(f"\nProcessed {len(tickers)} tickers:")
print(f"- Successful: {len(successful)}")
print(f"- Failed: {len(failed)}")

if failed:
    print("\nFailed downloads:")
    for ticker in failed:
        print(f"- {ticker}")


Processing stocks:   0%|          | 0/98 [00:00<?, ?it/s]2025-07-26 04:57:35,481 - INFO - Processing AAPL
2025-07-26 04:57:35,498 - INFO - Loaded existing data for AAPL
2025-07-26 04:57:35,499 - INFO - Processing MSFT
2025-07-26 04:57:35,510 - INFO - Loaded existing data for MSFT
2025-07-26 04:57:35,511 - INFO - Processing AMZN
2025-07-26 04:57:35,522 - INFO - Loaded existing data for AMZN
2025-07-26 04:57:35,523 - INFO - Processing NVDA
2025-07-26 04:57:35,537 - INFO - Loaded existing data for NVDA
2025-07-26 04:57:35,539 - INFO - Processing META
2025-07-26 04:57:35,551 - INFO - Loaded existing data for META
2025-07-26 04:57:35,553 - INFO - Processing GOOGL
2025-07-26 04:57:35,569 - INFO - Loaded existing data for GOOGL
2025-07-26 04:57:35,570 - INFO - Processing GOOG
2025-07-26 04:57:35,581 - INFO - Loaded existing data for GOOG
Processing stocks:   7%|▋         | 7/98 [00:00<00:01, 69.31it/s]2025-07-26 04:57:35,583 - INFO - Processing TSLA
2025-07-26 04:57:35,593 - INFO - Loaded exi

2025-07-26 04:57:35,603 - INFO - Loaded existing data for AVGO
2025-07-26 04:57:35,604 - INFO - Processing PEP
2025-07-26 04:57:35,613 - INFO - Loaded existing data for PEP
2025-07-26 04:57:35,614 - INFO - Processing COST
2025-07-26 04:57:35,623 - INFO - Loaded existing data for COST
2025-07-26 04:57:35,625 - INFO - Processing CSCO
2025-07-26 04:57:35,636 - INFO - Loaded existing data for CSCO
2025-07-26 04:57:35,638 - INFO - Processing TMUS
2025-07-26 04:57:35,650 - INFO - Loaded existing data for TMUS
2025-07-26 04:57:35,651 - INFO - Processing ADBE
2025-07-26 04:57:35,661 - INFO - Loaded existing data for ADBE
2025-07-26 04:57:35,663 - INFO - Processing NFLX
2025-07-26 04:57:35,673 - INFO - Loaded existing data for NFLX
2025-07-26 04:57:35,674 - INFO - Processing CMCSA
2025-07-26 04:57:35,684 - INFO - Loaded existing data for CMCSA
Processing stocks:  16%|█▋        | 16/98 [00:00<00:01, 80.37it/s]2025-07-26 04:57:35,686 - INFO - Processing AMD
2025-07-26 04:57:35,698 - INFO - Loaded


Processed 98 tickers:
- Successful: 97
- Failed: 1

Failed downloads:
- ATVI





In [22]:
# Data Collection Summary
print("\nData Collection Summary")
print("=" * 50)

# Analyze downloaded files
stock_files = list(Path('../data/stocks').glob('*.csv'))
total_rows = 0
date_ranges = []

print(f"\nTotal files generated: {len(stock_files)}")

# Sample a few files to show data quality
sample_files = np.random.choice(stock_files, min(5, len(stock_files)), replace=False)
print("\nSample Data Quality Check:")

for file in sample_files:
    try:
        df = pd.read_csv(file)
        ticker = file.stem
        
        # Convert date with UTC=True to avoid warning
        df['Date'] = pd.to_datetime(df['Date'], utc=True)
        
        # Calculate basic statistics
        print(f"\n{ticker}:")
        print(f"- Rows: {len(df):,}")
        print(f"- Date Range: {df['Date'].min():%Y-%m-%d} to {df['Date'].max():%Y-%m-%d}")
        print(f"- Trading days per year: {len(df) / ((df['Date'].max() - df['Date'].min()).days / 365):.1f}")
        print(f"- Missing values: {df.isnull().sum().sum():,}")
        
        # Check data quality
        price_cols = ['Open', 'High', 'Low', 'Close']
        if all(col in df.columns for col in price_cols):
            invalid_prices = df[df[price_cols] <= 0].shape[0]
            if invalid_prices > 0:
                print(f"  ⚠️ Found {invalid_prices} rows with invalid prices")
        
        total_rows += len(df)
        date_ranges.append((df['Date'].min(), df['Date'].max()))
        
    except Exception as e:
        print(f"\nError reading {ticker}: {str(e)}")

if date_ranges:
    overall_start = min(date[0] for date in date_ranges)
    overall_end = max(date[1] for date in date_ranges)
    
    print(f"\nOverall Statistics:")
    print(f"- Total trading days collected: {total_rows:,}")
    print(f"- Date coverage: {overall_start:%Y-%m-%d} to {overall_end:%Y-%m-%d}")
    print(f"- Average rows per stock: {total_rows / len(stock_files):,.1f}")
    
print(f"\nDetailed logs saved to: {log_file}")
print("\nData collection complete!")



Data Collection Summary

Total files generated: 97

Sample Data Quality Check:

AEP:
- Rows: 2,651
- Date Range: 2015-01-02 to 2025-07-18
- Trading days per year: 251.4
- Missing values: 0
  ⚠️ Found 2651 rows with invalid prices

GOOG:
- Rows: 2,651
- Date Range: 2015-01-02 to 2025-07-18
- Trading days per year: 251.4
- Missing values: 0
  ⚠️ Found 2651 rows with invalid prices

CDNS:
- Rows: 2,651
- Date Range: 2015-01-02 to 2025-07-18
- Trading days per year: 251.4
- Missing values: 0
  ⚠️ Found 2651 rows with invalid prices

DOCU:
- Rows: 1,816
- Date Range: 2018-04-27 to 2025-07-18
- Trading days per year: 251.2
- Missing values: 0
  ⚠️ Found 1816 rows with invalid prices

XEL:
- Rows: 2,651
- Date Range: 2015-01-02 to 2025-07-18
- Trading days per year: 251.4
- Missing values: 0
  ⚠️ Found 2651 rows with invalid prices

Overall Statistics:
- Total trading days collected: 12,420
- Date coverage: 2015-01-02 to 2025-07-18
- Average rows per stock: 128.0

Detailed logs saved to: ../