In [36]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from tqdm import tqdm

# Logging setup: timestamped, level-tagged records at INFO and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Thresholds consulted by the validation helpers
VALIDATION_CONFIG = dict(
    min_trading_days=1000,       # fewest rows a series may have
    max_missing_pct=5,           # largest tolerated share of missing values, in percent
    min_stock_sector_corr=0.3,   # correlation floor between a stock and its sector
)

# Columns each kind of input file must provide
_PRICE_VOLUME_COLUMNS = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
REQUIRED_COLUMNS = {
    'stocks': _PRICE_VOLUME_COLUMNS + ['Sector'],
    'sectors': list(_PRICE_VOLUME_COLUMNS),
}

# Sector ETF ticker -> sector name
SECTOR_ETF_MAP = {
    'XLK': 'Information Technology',
    'XLF': 'Financials',
    'XLV': 'Healthcare',
    'XLE': 'Energy',
    'XLY': 'Consumer Discretionary',
    'XLP': 'Consumer Staples',
    'XLI': 'Industrials',
    'XLB': 'Materials',
    'XLU': 'Utilities',
    'XLRE': 'Real Estate',
    'XLC': 'Communication Services',
}

# Sector spellings found in the stock files -> the names used in SECTOR_ETF_MAP
SECTOR_NAME_MAP = {
    'Technology': 'Information Technology',
    'Technology ': 'Information Technology',  # variant with a trailing space
    'Consumer_Discretionary': 'Consumer Discretionary',
    'Consumer_Staples': 'Consumer Staples',
    'Communication_Services': 'Communication Services',
    'Real_Estate': 'Real Estate',
}

# Inverse lookup: sector name -> ETF ticker
SECTOR_TO_ETF = dict(zip(SECTOR_ETF_MAP.values(), SECTOR_ETF_MAP.keys()))


In [37]:
# Utility functions for data validation
def validate_columns(df, required_cols, name):
    """Check that ``df`` carries every column listed in ``required_cols``.

    Args:
        df: DataFrame whose column labels are inspected.
        required_cols: Iterable of column names that must be present.
        name: Label placed at the front of the error message.

    Returns:
        True when no required column is absent.

    Raises:
        ValueError: If one or more required columns are absent.
    """
    absent = set(required_cols).difference(df.columns)
    if not absent:
        return True
    raise ValueError(f"{name}: Missing columns: {absent}")

def fix_price_relationships(df, name):
    """Repair OHLC rows so that Low <= Open, Close <= High holds.

    The four price columns are rounded to 4 decimals, then High is set to
    the row-wise maximum and Low to the row-wise minimum of Open/High/Low/
    Close. After that step Open and Close lie inside [Low, High] by
    construction, so they need no further adjustment.

    A final sanity check flags rows that still violate the ordering by more
    than a 0.01% relative tolerance. The tolerance *loosens* the check: a row
    whose Open or Close equals its High or Low is valid and is left untouched.

    Args:
        df: DataFrame with 'Open', 'High', 'Low' and 'Close' columns.
            It is modified in place.
        name: Label used as the prefix of log messages.

    Returns:
        The same DataFrame object, with repaired prices.
    """
    price_cols = ['Open', 'High', 'Low', 'Close']

    # Round to 4 decimal places to handle floating point precision
    for col in price_cols:
        df[col] = df[col].round(4)

    # High/Low become the row extremes of all four prices
    df['High'] = df[price_cols].max(axis=1)
    df['Low'] = df[price_cols].min(axis=1)

    # Sanity check with tolerance. The slack is added to High and subtracted
    # from Low so that exact equality (e.g. Open == High) is never flagged;
    # abs() keeps the slack direction correct regardless of sign.
    tolerance = 0.0001  # 0.01% tolerance
    high_limit = df['High'] + df['High'].abs() * tolerance
    low_limit = df['Low'] - df['Low'].abs() * tolerance
    invalid_price = (
        (df['Low'] > high_limit) |
        (df['Open'] > high_limit) |
        (df['Close'] > high_limit) |
        (df['Open'] < low_limit) |
        (df['Close'] < low_limit)
    )

    if invalid_price.any():
        invalid_dates = df[invalid_price].index
        logging.warning(f"{name}: Found {len(invalid_dates)} invalid price relationships")

        # Re-derive the extremes for the flagged rows only, without inflating them
        problem_rows = df.loc[invalid_price, price_cols]
        df.loc[invalid_price, 'High'] = problem_rows.max(axis=1)
        df.loc[invalid_price, 'Low'] = problem_rows.min(axis=1)

    return df

def validate_data_quality(df, name):
    """Run the data-quality checks on a price frame and repair its OHLC rows.

    Checks, in order: minimum row count, share of missing values, strictly
    positive and non-null prices, non-negative volume. The tickers 'GFS' and
    'RIVN' are exempt from the minimum row count only; every other check and
    the price repair still apply to them.

    Args:
        df: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume' columns.
            The caller's frame is not modified; a copy is processed.
        name: Label used in error and log messages; also compared against the
            exempt tickers.

    Returns:
        A validated copy of ``df`` with price relationships repaired.

    Raises:
        ValueError: If any check fails.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Check date range; known newer stocks are only warned about
    if len(df) < VALIDATION_CONFIG['min_trading_days']:
        if name not in ['GFS', 'RIVN']:
            raise ValueError(f"{name}: Insufficient trading days ({len(df)})")
        logging.warning(f"{name}: Newer stock with {len(df)} trading days")

    # Check missing data (worst column decides)
    missing_pct = (df.isnull().sum() / len(df) * 100).max()
    if missing_pct > VALIDATION_CONFIG['max_missing_pct']:
        raise ValueError(f"{name}: High missing data ({missing_pct:.1f}%)")

    # Check for invalid values
    price_cols = ['Open', 'High', 'Low', 'Close']
    for col in price_cols:
        if (df[col] <= 0).any():
            raise ValueError(f"{name}: Invalid {col} values (<=0)")
        # NaN compares False above, so nulls need their own check
        if df[col].isnull().any():
            raise ValueError(f"{name}: Missing {col} values")

    if (df['Volume'] < 0).any():
        raise ValueError(f"{name}: Invalid Volume values (<0)")

    # Fix price relationships
    df = fix_price_relationships(df, name)

    return df


In [38]:
# Main data loading functions
def load_stock_data():
    """Load every stock CSV under ../data/stocks and validate it.

    Each file is read, checked for the required columns, cleaned (UTC dates,
    normalized sector names, ticker column, date ordering, limited
    forward/backward fill) and passed through ``validate_data_quality``.
    A file that fails at any step is logged and recorded as failed; loading
    continues with the next file.

    Returns:
        A tuple ``(stock_data, validation_results)`` where ``stock_data`` maps
        the file stem (ticker) to its validated DataFrame and
        ``validation_results`` is ``{'passed': [...], 'failed': [...]}``.
    """
    print("\nLoading and validating stock data...", flush=True)
    # Sorted so the processing order does not depend on the filesystem
    stock_files = sorted(Path('../data/stocks').glob('*.csv'))

    stock_data = {}
    validation_results = {'passed': [], 'failed': []}

    for file in tqdm(stock_files, desc="Loading stock data"):
        # Bound before the try block so the except handler always reports
        # the file that actually failed, even when reading it raises.
        name = file.stem
        try:
            # Load data
            df = pd.read_csv(file)

            # Validate columns before any of them is accessed
            validate_columns(df, REQUIRED_COLUMNS['stocks'], name)

            # Convert date to datetime with UTC
            df['Date'] = pd.to_datetime(df['Date'], utc=True)

            # Clean sector names
            df['Sector'] = df['Sector'].str.strip()
            df['Sector'] = df['Sector'].replace(SECTOR_NAME_MAP)

            # Add ticker column
            df['Ticker'] = name

            # Sort by date
            df = df.sort_values('Date')

            # Handle missing data
            df = df.ffill(limit=5).bfill(limit=5)

            # Validate and fix data quality
            df = validate_data_quality(df, name)

        except Exception as e:
            logging.warning(f"{name}: {str(e)}")
            validation_results['failed'].append(name)
            continue

        # Store data
        stock_data[name] = df
        validation_results['passed'].append(name)
        logging.info(f"Validated {name} data: {len(df)} rows")

    return stock_data, validation_results

def _load_sector_frame(file_path, etf, sector, fill_limit):
    """Read one sector ETF CSV, clean it, and run the shared quality checks."""
    label = f"{sector} ({etf})"

    df = pd.read_csv(file_path)

    # Validate columns before any of them is accessed
    validate_columns(df, REQUIRED_COLUMNS['sectors'], label)

    # Convert date to datetime with UTC
    df['Date'] = pd.to_datetime(df['Date'], utc=True)

    # Add sector information
    df['Sector'] = sector
    df['Ticker'] = etf

    # Sort by date, fill short gaps, then drop rows that are still incomplete
    df = df.sort_values('Date')
    df = df.ffill(limit=fill_limit).bfill(limit=fill_limit)
    df = df.dropna()

    # validate_data_quality also repairs price relationships
    return validate_data_quality(df, label)


def load_sector_data():
    """Load and validate the CSV of every ETF listed in SECTOR_ETF_MAP.

    Each file ../data/sectors/<ETF>.csv is loaded with a gap-fill limit of 5
    rows. If that attempt fails, it is retried once with a limit of 10. A
    missing file is recorded as failed without a retry, since re-reading it
    cannot succeed.

    Returns:
        A tuple ``(sector_data, validation_results)`` where ``sector_data``
        maps the sector name to its validated DataFrame and
        ``validation_results`` is ``{'passed': [...], 'failed': [...]}``.
    """
    print("\nLoading and validating sector data...", flush=True)
    sector_data = {}
    validation_results = {'passed': [], 'failed': []}

    for etf, sector in SECTOR_ETF_MAP.items():
        label = f"{sector} ({etf})"
        file_path = Path(f'../data/sectors/{etf}.csv')

        if not file_path.exists():
            logging.warning(f"{label}: No data file found for ETF {etf}")
            validation_results['failed'].append(sector)
            continue

        try:
            df = _load_sector_frame(file_path, etf, sector, fill_limit=5)
            logging.info(f"Validated {label} data: {len(df)} rows")
        except Exception as e:
            logging.warning(f"{label}: {str(e)}")

            # Try to fix the data with more aggressive gap filling
            try:
                df = _load_sector_frame(file_path, etf, sector, fill_limit=10)
                logging.info(f"Successfully fixed and validated {label} data: {len(df)} rows")
            except Exception as retry_e:
                logging.error(f"Failed to fix {label} data: {str(retry_e)}")
                validation_results['failed'].append(sector)
                continue

        # Store data
        sector_data[sector] = df
        validation_results['passed'].append(sector)

        # Print summary (also for sectors that only passed on retry)
        print(f"\n{etf} ({sector}) Summary:", flush=True)
        print(f"- Date Range: {df['Date'].min():%Y-%m-%d} to {df['Date'].max():%Y-%m-%d}")
        print(f"- Trading Days: {len(df)}")
        print(f"- File: {file_path}")

    # Print sector coverage
    print("\nSector Coverage:", flush=True)
    print(f"- Total Sectors: {len(SECTOR_ETF_MAP)}")
    print(f"- Loaded Sectors: {len(validation_results['passed'])}")
    if validation_results['failed']:
        print(f"- Failed Sectors: {validation_results['failed']}")

    return sector_data, validation_results


In [39]:
# Cross-validation and merging functions
def cross_validate_stock_sector(stock_df, sector_df, stock_name):
    """Compare a stock's returns with its sector's returns and warn on weak correlation.

    Closing prices are first restricted to the dates present in BOTH frames,
    and only then converted to returns, so each pair of returns covers the
    same interval even when one series has gaps. The mean rolling correlation
    is computed for several window lengths and the largest is compared with a
    threshold that depends on the amount of overlapping data and on the
    ticker. A low correlation only produces a warning; it never fails
    validation.

    Args:
        stock_df: DataFrame with 'Date' (datetime) and 'Close' columns.
        sector_df: DataFrame with 'Date' (datetime) and 'Close' columns.
        stock_name: Ticker, used in log messages and for per-ticker thresholds.

    Returns:
        True whenever the comparison could be carried out.

    Raises:
        ValueError: If the series do not overlap or any step fails; the
            message is prefixed with "Cross-validation failed: ".
    """
    try:
        stock_close = stock_df.set_index('Date')['Close']
        sector_close = sector_df.set_index('Date')['Close']

        # Remove timezone info for alignment
        stock_close.index = stock_close.index.tz_localize(None)
        sector_close.index = sector_close.index.tz_localize(None)

        # Align prices on shared dates BEFORE computing returns
        stock_close, sector_close = stock_close.align(sector_close, join='inner')

        # Fewer than two shared dates cannot produce a single return
        if len(stock_close) < 2:
            raise ValueError("No overlapping data between stock and sector")

        stock_returns = stock_close.pct_change()
        sector_returns = sector_close.pct_change()

        # Keep only the dates where both returns are defined
        usable = stock_returns.notna() & sector_returns.notna()
        stock_returns = stock_returns[usable]
        sector_returns = sector_returns[usable]

        if len(stock_returns) == 0:
            raise ValueError("No overlapping data between stock and sector")

        # Mean rolling correlation per window; windows are counted in rows
        window_sizes = [30, 60, 90, 180, 360]
        correlations = [
            stock_returns.rolling(window=window).corr(sector_returns).mean()
            for window in window_sizes
            if len(stock_returns) >= window
        ]
        # NaN results would make max() order-dependent, so discard them
        correlations = [value for value in correlations if pd.notna(value)]

        # Use the maximum correlation from different windows
        max_correlation = max(correlations) if correlations else stock_returns.corr(sector_returns)

        # Adjust correlation threshold based on data length
        data_length = len(stock_returns)
        if data_length < 180:
            min_correlation = 0.2
        elif data_length < 360:
            min_correlation = 0.25
        elif data_length < 720:
            min_correlation = 0.28
        else:
            min_correlation = VALIDATION_CONFIG['min_stock_sector_corr']

        # Override for specific stocks if needed
        NEWER_STOCKS = {
            'MRNA': {'min_correlation': 0.2},
            'ABNB': {'min_correlation': 0.2},
            'CRWD': {'min_correlation': 0.2},
            'DASH': {'min_correlation': 0.2},
            'DDOG': {'min_correlation': 0.2},
            'CVNA': {'min_correlation': 0.2},
            'COIN': {'min_correlation': 0.2}
        }

        # An override can only lower the threshold, never raise it
        min_correlation = min(
            min_correlation,
            NEWER_STOCKS.get(stock_name, {}).get('min_correlation', min_correlation)
        )

        if pd.isna(max_correlation):
            logging.warning(f"{stock_name}: Could not compute correlation with sector")
        elif max_correlation < min_correlation:
            # Don't fail validation, just warn
            logging.warning(f"{stock_name}: Low correlation with sector ({max_correlation:.3f} < {min_correlation})")

        return True

    except Exception as e:
        raise ValueError(f"Cross-validation failed: {str(e)}") from e

def create_merged_dataset(stock_data, sector_data):
    """Attach each stock's sector prices to its rows and stack all stocks.

    For every ticker the sector is read from the first row of its 'Sector'
    column. The matching sector frame's Open/High/Low/Close/Volume are added
    as 'Sector_<column>' columns, matched on timezone-naive dates. A ticker
    whose merge raises is logged and left out.

    Args:
        stock_data: Mapping of ticker -> DataFrame with 'Date' and 'Sector' columns.
        sector_data: Mapping of sector name -> DataFrame with 'Date' and the
            five price/volume columns.

    Returns:
        One DataFrame holding all merged stocks, ordered by 'Date' then 'Ticker'.

    Raises:
        ValueError: If no ticker could be merged.
    """
    sector_fields = ('Open', 'High', 'Low', 'Close', 'Volume')
    frames = []

    for ticker in stock_data:
        try:
            prices = stock_data[ticker]

            # The first row decides which sector benchmark is attached
            sector = prices['Sector'].iloc[0]
            if sector not in sector_data:
                raise ValueError(f"No sector data found for {sector}")

            # Index both frames by timezone-naive date so columns align on it
            prices = prices.set_index('Date')
            benchmark = sector_data[sector].set_index('Date')
            prices.index = prices.index.tz_localize(None)
            benchmark.index = benchmark.index.tz_localize(None)

            # Prefixed sector columns, aligned to the stock's dates on assignment
            sector_columns = {
                f'Sector_{field}': benchmark[field] for field in sector_fields
            }
            frames.append(prices.assign(**sector_columns).reset_index())

        except Exception as e:
            logging.error(f"Error merging {ticker}: {str(e)}")

    if len(frames) == 0:
        raise ValueError("No data to merge")

    # Stack everything, then order by date and ticker
    combined = pd.concat(frames, ignore_index=True)
    return combined.sort_values(['Date', 'Ticker'])


In [40]:
# Main execution: load stocks and sector ETFs, cross-validate each stock
# against its sector, then merge and save the stocks that passed.
print("Starting complete validation process...\n")

# Load and validate stock data
stock_data, stock_validation = load_stock_data()

# Print sector distribution in stock data
if stock_data:
    print("\nSector Distribution in Stock Data:")
    sector_counts = {}
    for ticker, df in stock_data.items():
        # A stock's sector is taken from the first row of its frame
        sector_counts.setdefault(df['Sector'].iloc[0], []).append(ticker)

    for sector in sorted(sector_counts.keys()):
        print(f"- {sector}: {len(sector_counts[sector])} stocks - {sorted(sector_counts[sector])}")

    # Check for missing sectors
    all_sectors = set(SECTOR_ETF_MAP.values())
    found_sectors = set(sector_counts.keys())
    missing_sectors = all_sectors - found_sectors
    if missing_sectors:
        print(f"\nMissing sectors in stock data: {missing_sectors}")

# Load and validate sector data
sector_data, sector_validation = load_sector_data()

# Perform cross-validation if both validations passed
if stock_data and sector_data:
    print("\nPerforming cross-validation...")
    print("Available sectors:", list(sector_data.keys()), flush=True)

    validation_results = {'passed': [], 'failed': [], 'reasons': {}}

    for stock_name, stock_df in tqdm(stock_data.items(), desc="Cross-validating stocks"):
        sector = stock_df['Sector'].iloc[0]

        if sector not in sector_data:
            validation_results['failed'].append(stock_name)
            validation_results['reasons'][stock_name] = 'sector_not_found'
            continue

        try:
            cross_validate_stock_sector(stock_df, sector_data[sector], stock_name)
            validation_results['passed'].append(stock_name)
        except Exception as e:
            validation_results['failed'].append(stock_name)
            validation_results['reasons'][stock_name] = str(e)

    # Print cross-validation summary
    print("\nCross-validation Summary:")
    print(f"Total stocks processed: {len(stock_data)}")
    print(f"Passed: {len(validation_results['passed'])}")
    print(f"Failed: {len(validation_results['failed'])}")

    if validation_results['failed']:
        print("\nFailed validations by reason:")
        reason_counts = {}
        for stock in validation_results['failed']:
            reason_counts.setdefault(validation_results['reasons'][stock], []).append(stock)

        for reason, stocks in reason_counts.items():
            print(f"- {reason}: {len(stocks)} stocks - {sorted(stocks)}")

    # Create merged dataset if validation passed
    if validation_results['passed']:
        try:
            print("\nCreating merged dataset...")
            # Merge only the stocks that passed cross-validation; stocks that
            # failed must not end up in the saved "validated" dataset.
            validated_stocks = {
                stock_name: stock_data[stock_name]
                for stock_name in validation_results['passed']
            }
            merged_df = create_merged_dataset(validated_stocks, sector_data)

            # Create output directories
            for dir_path in ['../data/processed', '../data/enriched']:
                Path(dir_path).mkdir(parents=True, exist_ok=True)

            # Save merged data to both locations for backward compatibility
            merged_df.to_csv('../data/processed/merged_data.csv', index=False)
            merged_df.to_csv('../data/enriched/nasdaq_validated.csv', index=False)

            print("\nMerged data saved:")
            print(f"- Total rows: {len(merged_df)}")
            print(f"- Unique stocks: {merged_df['Ticker'].nunique()}")
            print(f"- Date range: {merged_df['Date'].min():%Y-%m-%d} to {merged_df['Date'].max():%Y-%m-%d}")

            # Print sector distribution in final dataset
            print("\nSector distribution in final dataset:")
            sector_counts = merged_df.groupby('Sector')['Ticker'].nunique().sort_values(ascending=False)
            for sector, count in sector_counts.items():
                tickers = sorted(merged_df[merged_df['Sector'] == sector]['Ticker'].unique())
                print(f"- {sector}: {count} stocks - {tickers}")

        except Exception as e:
            print(f"\nError creating merged dataset: {str(e)}")

print("\nValidation process complete!")


Starting complete validation process...


Loading and validating stock data...


2025-07-26 07:24:29,431 - INFO - Validated AAPL data: 2651 rows
2025-07-26 07:24:29,470 - INFO - Validated ABNB data: 1155 rows
2025-07-26 07:24:29,523 - INFO - Validated ADBE data: 2651 rows
2025-07-26 07:24:29,560 - INFO - Validated ADI data: 2651 rows
2025-07-26 07:24:29,589 - INFO - Validated ADP data: 2651 rows
2025-07-26 07:24:29,618 - INFO - Validated ADSK data: 2651 rows
2025-07-26 07:24:29,673 - INFO - Validated AEP data: 2651 rows
2025-07-26 07:24:29,748 - INFO - Validated ALGN data: 2651 rows
2025-07-26 07:24:29,800 - INFO - Validated AMAT data: 2651 rows
2025-07-26 07:24:29,841 - INFO - Validated AMD data: 2651 rows
2025-07-26 07:24:29,873 - INFO - Validated AMGN data: 2651 rows
2025-07-26 07:24:29,904 - INFO - Validated AMZN data: 2651 rows
2025-07-26 07:24:29,935 - INFO - Validated ANSS data: 2650 rows
2025-07-26 07:24:29,959 - INFO - Validated APD data: 2656 rows
2025-07-26 07:24:29,983 - INFO - Validated ASML data: 2651 rows
2025-07-26 07:24:30,008 - INFO - Validated AV


Sector Distribution in Stock Data:
- Communication Services: 6 stocks - ['CHTR', 'CMCSA', 'NFLX', 'SIRI', 'TMUS', 'WBD']
- Consumer Discretionary: 14 stocks - ['ABNB', 'AMZN', 'BKNG', 'CVNA', 'DLTR', 'EBAY', 'LCID', 'MAR', 'MELI', 'ORLY', 'RIVN', 'ROST', 'SBUX', 'TSLA']
- Consumer Staples: 6 stocks - ['COST', 'KDP', 'KHC', 'MDLZ', 'MNST', 'PEP']
- Energy: 2 stocks - ['BKR', 'FANG']
- Financials: 2 stocks - ['COIN', 'PYPL']
- Healthcare: 11 stocks - ['ALGN', 'AMGN', 'BIIB', 'DXCM', 'GILD', 'IDXX', 'ILMN', 'ISRG', 'MRNA', 'REGN', 'VRTX']
- Industrials: 7 stocks - ['CPRT', 'CSX', 'CTAS', 'FAST', 'HON', 'ODFL', 'PCAR']
- Information Technology: 51 stocks - ['AAPL', 'ADBE', 'ADI', 'ADP', 'ADSK', 'AMAT', 'AMD', 'ANSS', 'ASML', 'AVGO', 'CDNS', 'CRWD', 'CSCO', 'CTSH', 'DASH', 'DDOG', 'DOCU', 'EA', 'ENPH', 'FTNT', 'GFS', 'GOOG', 'GOOGL', 'INTC', 'INTU', 'KLAC', 'LRCX', 'MCHP', 'META', 'MRVL', 'MSFT', 'MTCH', 'MU', 'NVDA', 'NXPI', 'OKTA', 'ON', 'PANW', 'PAYX', 'QCOM', 'RBLX', 'ROKU', 'SNAP', 'S


2025-07-26 07:24:33,172 - INFO - Validated Information Technology (XLK) data: 2656 rows



XLK (Information Technology) Summary:


2025-07-26 07:24:33,208 - INFO - Validated Financials (XLF) data: 2656 rows


- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLK.csv

XLF (Financials) Summary:


2025-07-26 07:24:33,265 - INFO - Validated Healthcare (XLV) data: 2656 rows


- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLF.csv

XLV (Healthcare) Summary:




- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLV.csv


2025-07-26 07:24:33,307 - INFO - Validated Energy (XLE) data: 2656 rows



XLE (Energy) Summary:


2025-07-26 07:24:33,350 - INFO - Validated Consumer Discretionary (XLY) data: 2656 rows


- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLE.csv

XLY (Consumer Discretionary) Summary:


2025-07-26 07:24:33,387 - INFO - Validated Consumer Staples (XLP) data: 2656 rows


- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLY.csv

XLP (Consumer Staples) Summary:


2025-07-26 07:24:33,423 - INFO - Validated Industrials (XLI) data: 2656 rows


- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLP.csv

XLI (Industrials) Summary:




- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLI.csv


2025-07-26 07:24:33,468 - INFO - Validated Materials (XLB) data: 2656 rows



XLB (Materials) Summary:




- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLB.csv


2025-07-26 07:24:33,508 - INFO - Validated Utilities (XLU) data: 2656 rows



XLU (Utilities) Summary:
- Date Range: 2015-01-02 to 2025-07-25
- Trading Days: 2656
- File: ..\data\sectors\XLU.csv


2025-07-26 07:24:33,550 - INFO - Validated Real Estate (XLRE) data: 2463 rows



XLRE (Real Estate) Summary:




- Date Range: 2015-10-08 to 2025-07-25
- Trading Days: 2463
- File: ..\data\sectors\XLRE.csv


2025-07-26 07:24:33,603 - INFO - Validated Communication Services (XLC) data: 1785 rows



XLC (Communication Services) Summary:
- Date Range: 2018-06-19 to 2025-07-25
- Trading Days: 1785
- File: ..\data\sectors\XLC.csv

Sector Coverage:
- Total Sectors: 11
- Loaded Sectors: 11

Performing cross-validation...
Available sectors: ['Information Technology', 'Financials', 'Healthcare', 'Energy', 'Consumer Discretionary', 'Consumer Staples', 'Industrials', 'Materials', 'Utilities', 'Real Estate', 'Communication Services']


Cross-validating stocks: 100%|██████████| 108/108 [00:00<00:00, 155.40it/s]



Cross-validation Summary:
Total stocks processed: 108
Passed: 108
Failed: 0

Creating merged dataset...

Merged data saved:
- Total rows: 266029
- Unique stocks: 108
- Date range: 2015-01-02 to 2025-07-25

Sector distribution in final dataset:
- Information Technology: 51 stocks - ['AAPL', 'ADBE', 'ADI', 'ADP', 'ADSK', 'AMAT', 'AMD', 'ANSS', 'ASML', 'AVGO', 'CDNS', 'CRWD', 'CSCO', 'CTSH', 'DASH', 'DDOG', 'DOCU', 'EA', 'ENPH', 'FTNT', 'GFS', 'GOOG', 'GOOGL', 'INTC', 'INTU', 'KLAC', 'LRCX', 'MCHP', 'META', 'MRVL', 'MSFT', 'MTCH', 'MU', 'NVDA', 'NXPI', 'OKTA', 'ON', 'PANW', 'PAYX', 'QCOM', 'RBLX', 'ROKU', 'SNAP', 'SNPS', 'TEAM', 'TTD', 'TXN', 'VRSK', 'WDAY', 'ZM', 'ZS']
- Consumer Discretionary: 14 stocks - ['ABNB', 'AMZN', 'BKNG', 'CVNA', 'DLTR', 'EBAY', 'LCID', 'MAR', 'MELI', 'ORLY', 'RIVN', 'ROST', 'SBUX', 'TSLA']
- Healthcare: 11 stocks - ['ALGN', 'AMGN', 'BIIB', 'DXCM', 'GILD', 'IDXX', 'ILMN', 'ISRG', 'MRNA', 'REGN', 'VRTX']
- Industrials: 7 stocks - ['CPRT', 'CSX', 'CTAS', 'FAST', 