In [81]:
import pandas as pd
from rf_stock_annual_macro import (
    load_and_prepare_data, 
    infer_sector_from_data, 
    StockSelectionRF
)

In [82]:
def select_stocks_for_date(csv_path, target_date, n_stocks=30, min_cap='Mid Cap'):
    """
    Generates a list of stocks to buy on a specific target_date using the
    Macro-Aware Random Forest model.

    The model is trained on every row dated at least one year before the
    (month-end snapped) buy date, then used to rank the rows dated exactly
    on the buy date.

    Parameters:
    -----------
    csv_path : str
        Path to the SimFin panel CSV.
    target_date : str
        The date you want to generate picks for (e.g., '2025-12-31').
        Any time-of-day component is discarded and the date is rolled
        forward to its month end.
    n_stocks : int
        Number of top stocks to return.
    min_cap : str
        Minimum market cap filter ('Nano Cap', 'Micro Cap', 'Small Cap',
        'Mid Cap', 'Large Cap', 'Mega Cap'). 'Nano Cap' disables the filter;
        an unrecognised value prints a warning and keeps the full universe.

    Returns:
    --------
    pd.DataFrame
        Top stock picks with predicted ranks and fundamental data, sorted by
        'predicted_rank' in descending order.

    Raises:
    -------
    ValueError
        If no rows exist for the buy date, if there is no training history
        with a realized 1-year return, if no usable features or training
        samples remain after missing-data handling, or if every candidate is
        dropped for incomplete features.
    """
    # 1. Parse and Snap Dates
    # Strip any time-of-day first: MonthEnd(0) keeps the time component, and
    # the prediction set below is selected by exact equality on 'public_date'
    # (presumably midnight month-end stamps -- confirm against the loader).
    buy_date = pd.to_datetime(target_date).normalize() + pd.offsets.MonthEnd(0)

    # We train on data where the 1-year outcome is already known.
    # If we buy on 2024-12-31, the most recent data with a known 1-year
    # return is from 2023-12-31.
    train_end = buy_date - pd.DateOffset(years=1) + pd.offsets.MonthEnd(0)

    print(f"--- Processing Selection for {buy_date.date()} ---")
    print(f"Training Data Cutoff (Known Outcomes): {train_end.date()}")

    # 2. Load and Prepare Data
    df = load_and_prepare_data(csv_path)
    df = infer_sector_from_data(df)

    # 3. Filter Universe by Market Cap
    if 'cap' in df.columns and min_cap != 'Nano Cap':
        cap_hierarchy = ['Nano Cap', 'Micro Cap', 'Small Cap', 'Mid Cap', 'Large Cap', 'Mega Cap']
        if min_cap in cap_hierarchy:
            allowed = cap_hierarchy[cap_hierarchy.index(min_cap):]
            # isin() is False for missing values, so rows without a cap
            # label are dropped by the same filter.
            df = df[df['cap'].isin(allowed)].copy()
            print(f"Universe filtered to {min_cap}+: {len(df)} rows remaining")
        else:
            print(f"Warning: '{min_cap}' not found in hierarchy. Using full universe.")

    # 4. Split Data
    # Training: all history up to the point where 1-year returns are realized
    train_df = df[df['public_date'] <= train_end].copy()

    # Prediction: the specific date we want to trade
    predict_df = df[df['public_date'] == buy_date].copy()

    if predict_df.empty:
        # Help the user find valid dates if the exact date is missing
        window = pd.Timedelta(days=60)
        in_window = (df['public_date'] > buy_date - window) & (df['public_date'] < buy_date + window)
        # Wrap in DatetimeIndex so iteration yields Timestamps regardless of
        # what container Series.unique() returns on the installed pandas.
        nearby_dates = pd.DatetimeIndex(df.loc[in_window, 'public_date'].unique()).sort_values()
        nearby_str = ", ".join(str(d.date()) for d in nearby_dates) or "none within 60 days"
        raise ValueError(f"No data found for {buy_date.date()}. Nearest available dates: {nearby_str}")

    # Fail early with a clear message instead of letting the model choke on
    # an empty (or target-less) training window.
    if train_df.empty or not train_df['1yr_return'].notna().any():
        raise ValueError(
            f"No training data with realized 1-year returns on or before {train_end.date()}."
        )

    # 5. Check for Macro Features (Critical for this Strategy)
    rf_model = StockSelectionRF(random_state=42)
    macro_cols = rf_model.macro_columns

    # A macro column counts as missing when absent or entirely NaN on the buy date
    missing_macro = [c for c in macro_cols if c not in predict_df.columns or predict_df[c].isna().all()]
    if missing_macro:
        print(f"\nWARNING: The following macro features are missing for {buy_date.date()}:")
        print(f"  {missing_macro}")
        print("  The model will degrade to a fundamental-only selection.")
    else:
        print(f"Macro data verified for {buy_date.date()}.")

    # 6. Feature Preparation
    feature_cols = rf_model.prepare_features(df)

    # 7. Clean Training Data
    X_train_raw = train_df[feature_cols]
    y_train = train_df['1yr_return']

    # Handle missing data using the class method. It is expected to populate
    # rf_model.feature_medians_, which is used for imputation below.
    X_train_clean, dropped_features = rf_model.handle_missing_data(X_train_raw)
    if len(X_train_clean) == 0:
        raise ValueError("No training samples left after missing-data handling.")

    # 8. Clean Prediction Data
    # Must use exactly the same features as training (minus dropped ones)
    features_to_use = [f for f in feature_cols if f not in dropped_features]
    if not features_to_use:
        # Also protects the completeness ratio below from dividing by zero
        raise ValueError("No usable features remain after missing-data handling.")
    X_predict_raw = predict_df[features_to_use]

    # Filter for Feature Completeness (Quality Control)
    row_completeness = X_predict_raw.notna().sum(axis=1) / X_predict_raw.shape[1]
    rows_to_keep = row_completeness >= 0.8
    X_predict_filtered = X_predict_raw[rows_to_keep]

    dropped_count = len(X_predict_raw) - len(X_predict_filtered)
    if dropped_count > 0:
        print(f"Dropped {dropped_count} stocks from prediction set due to missing data (<80% complete)")

    if X_predict_filtered.empty:
        raise ValueError("All candidates dropped due to missing data.")

    # Impute with training medians (prevents look-ahead bias); anything still
    # missing after that falls back to 0.
    X_predict = X_predict_filtered.fillna(rf_model.feature_medians_).fillna(0)

    # Align metadata with the rows that survived cleaning
    metadata_train = train_df.loc[X_train_clean.index, ['sector', 'public_date']]
    metadata_predict = predict_df.loc[X_predict.index, ['sector', 'public_date']]

    # 9. Train Model
    print(f"Training model on {len(X_train_clean)} samples...")
    rf_model.train(X_train_clean, y_train.loc[X_train_clean.index], metadata_train)

    # 10. Prediction & Selection
    print(f"Ranking {len(X_predict)} candidates...")

    top_stocks_df = rf_model.get_top_stocks(
        X_predict,
        X_predict.index,
        metadata_predict,
        n=n_stocks
    )

    # Join with original data for display
    result = predict_df.loc[top_stocks_df.index].copy()
    result['predicted_rank'] = top_stocks_df['predicted_rank']

    # Only show the display columns that actually exist in the panel
    display_cols = ['TICKER', 'sector', 'MthCap', 'roe', 'bm', 'pe_inc', 'de_ratio', 'predicted_rank']
    final_cols = [c for c in display_cols if c in result.columns]

    return result[final_cols].sort_values('predicted_rank', ascending=False)

In [83]:
# Select stocks for the April 2023 month-end (2023-04-30).
# `target_date` and `picks` are reused by the next cell to print the tickers.
target_date = '2023-04-30'
picks = select_stocks_for_date(
    csv_path='data/simfin_panel.csv',
    target_date=target_date,
    n_stocks=10,
    min_cap='Mid Cap'
)

print(picks)

--- Processing Selection for 2023-04-30 ---
Training Data Cutoff (Known Outcomes): 2022-04-30
Loading data...
Found existing sector column with 11 sectors
Universe filtered to Mid Cap+: 90740 rows remaining
Macro data verified for 2023-04-30.
Dropped 4 features with >50% missing values
Dropped 17969 instances (33.9%) with <80% feature completeness
Final dataset: 35099 samples, 57 features
Dropped 58 stocks from prediction set due to missing data (<80% complete)
Training model on 35099 samples...
Skipping 1236 rows with missing targets (likely recent data for prediction).
Training Random Forest on 33863 samples with 57 features...
Applied sector neutralization across 11 sectors
Macro features passed through raw (not neutralized): ['FEDFUNDS', 'DGS10', '1mo_inf_rate', '1yr_inf_rate', '1mo_GDP', '1yr_GDP']
Winsorized target: [-0.7530, 1.4547]
Converted to cross-sectional ranks (mean: 0.501)

Macro features in model: 6
  FEDFUNDS: Rank #6, Importance: 0.0314
  DGS10: Rank #12, Importance: 

In [84]:
# One-line summary: the selection date followed by the picked tickers, in ranked order.
print(f"{target_date}: {picks['TICKER'].tolist()}")

2023-04-30: ['WMT', 'UNH', 'KLAC', 'WM', 'FIX', 'MTD', 'KR', 'NSIT', 'PCTY', 'ADP']


In [92]:
# Define the year
year = 2017

# Create date range for month-ends
month_ends = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31', freq='ME')

# Maps 'YYYY-MM-DD' month-end string -> list of selected tickers
tickers_by_date = {}

print(f"Processing {len(month_ends)} months for {year}...\n")

for date_obj in month_ends:
    # Use a loop-local name rather than `target_date`: rebinding that global
    # would leave it out of sync with `picks` from the single-date run above.
    month_end_str = date_obj.strftime('%Y-%m-%d')

    try:
        # Run the selection function
        picks_df = select_stocks_for_date(
            csv_path='data/simfin_panel.csv',
            target_date=month_end_str,
            n_stocks=10,
            min_cap='Mid Cap'
        )

        # Extract the list of tickers from the result DataFrame
        if not picks_df.empty and 'TICKER' in picks_df.columns:
            ticker_list = picks_df['TICKER'].tolist()
            tickers_by_date[month_end_str] = ticker_list
            print(f"  > {month_end_str}: Found {len(ticker_list)} stocks.")
        else:
            print(f"  > {month_end_str}: No stocks returned.")

    except Exception as e:
        # Best effort: report the failing month and keep going so one bad
        # month does not abort the whole year.
        print(f"  > {month_end_str}: Error - {e}")

# --- FINAL OUTPUT GENERATION ---
print("\n" + "="*50)
print("FINAL TICKER DICTIONARY")
print("="*50)

# Emit the result as a copy-pasteable Python dict literal
print("tickers = {")
for date_key, ticker_list in tickers_by_date.items():
    # Print each entry formatted as a Python dictionary item
    print(f"    '{date_key}': {ticker_list},")
print("}")

Processing 12 months for 2017...

--- Processing Selection for 2017-01-31 ---
Training Data Cutoff (Known Outcomes): 2016-01-31
Loading data...
Found existing sector column with 11 sectors
Universe filtered to Mid Cap+: 90740 rows remaining

  ['1yr_GDP']
  The model will degrade to a fundamental-only selection.
Dropped 35 features with >50% missing values
Dropped 1 instances (11.1%) with <80% feature completeness
Final dataset: 8 samples, 26 features
Dropped 24 stocks from prediction set due to missing data (<80% complete)
Training model on 8 samples...
Training Random Forest on 8 samples with 26 features...
Applied sector neutralization across 4 sectors
Macro features passed through raw (not neutralized): ['FEDFUNDS', 'DGS10', '1mo_inf_rate', '1yr_inf_rate', '1mo_GDP']
Winsorized target: [-0.1036, 0.6611]
Converted to cross-sectional ranks (mean: 0.688)

Macro features in model: 5
  FEDFUNDS: Rank #7, Importance: 0.0000
  DGS10: Rank #6, Importance: 0.0000
  1mo_inf_rate: Rank #5, Im