# Build Prediction Bases: Congressional Trading

Constructs monthly prediction bases with comprehensive congressional trading features.

**Outputs:**
- `base_sp500_monthly.csv`: For predicting S&P 500 returns
- `base_sector_monthly.csv`: For predicting sector returns

**Feature Categories:**
1. Direction (CSI, buy ratio, net)
2. Timing (disclosure delay, end of month, day of week)
3. Coordination (same day trades, party, committee)
4. Behavior (frequent traders, first time, direction change)
5. Power (chairs, seniority, net worth)
6. Relevance (committee-sector match)
7. Context (contrarian, volatility, liquidity)
8. Composite signals (smart money, insider ring)

---

## 0. Setup

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'grid.alpha': 0.3,
    'font.size': 11,
    'figure.dpi': 120
})

print('Setup complete')

Setup complete


## 1. Configuration

In [2]:
# Paths
INPUT_TRADES = 'data/outputs/congress_trades_with_committees.csv'
SECTOR_CACHE = 'data/outputs/ticker_sector_cache.csv'

# Output directory
OUTPUT_DIR = 'data/prediction_bases'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Date range
START_DATE = '2012-01-01'
END_DATE = '2024-12-31'

# Sector ETFs
SECTOR_ETFS = {
    'Energy': 'XLE',
    'Financials': 'XLF',
    'Health Care': 'XLV',
    'Industrials': 'XLI',
    'Technology': 'XLK',
    'Consumer Discretionary': 'XLY',
    'Consumer Staples': 'XLP',
    'Materials': 'XLB',
    'Utilities': 'XLU',
    'Real Estate': 'XLRE',
    'Communication Services': 'XLC'
}

In [3]:
# Committee to sector mapping
COMMITTEE_TO_SECTORS = {
    'Armed Services': ['Industrials', 'Technology', 'Communication Services'],
    'Foreign Affairs': ['Industrials', 'Energy', 'Materials', 'Financials', 'Technology'],
    'Homeland Security': ['Industrials', 'Technology', 'Communication Services'],
    'Intelligence': ['Technology', 'Industrials', 'Communication Services'],
    'Select Committee on Intelligence': ['Technology', 'Industrials'],
    'Financial Services': ['Financials', 'Real Estate'],
    'Banking, Housing and Urban Affairs': ['Financials', 'Real Estate'],
    'Finance': ['Financials'],
    'Ways and Means': ['Financials', 'Health Care', 'Consumer Discretionary'],
    'Energy and Commerce': ['Energy', 'Utilities', 'Health Care', 'Communication Services'],
    'Energy and Natural Resources': ['Energy', 'Utilities', 'Materials'],
    'Natural Resources': ['Energy', 'Materials', 'Utilities'],
    'Environment and Public Works': ['Energy', 'Utilities', 'Materials'],
    'Health  Education Labor and Pensions': ['Health Care'],
    'Agriculture': ['Consumer Staples', 'Consumer Defensive', 'Materials'],
    'Agriculture Nutrition and Forestry': ['Consumer Staples', 'Consumer Defensive', 'Materials'],
    'Transportation and Infrastructure': ['Industrials', 'Energy'],
    'Appropriations': ['Industrials', 'Health Care', 'Technology', 'Financials'],
    'Education and Workforce': ['Consumer Discretionary', 'Technology'],
    'Small Business': ['Consumer Discretionary', 'Industrials'],
    'Judiciary': ['Technology', 'Communication Services'],
    'Commerce, Science and Transportation': ['Technology', 'Communication Services', 'Industrials'],
    'Science  Space and Technology': ['Technology', 'Industrials'],
    "Veterans Affairs": ['Health Care', 'Industrials'],
    "Veterans' Affairs": ['Health Care', 'Industrials'],
}

# Informative committees (more likely to have material non-public info)
INFO_COMMITTEES = [
    'Armed Services', 'Financial Services', 'Energy and Commerce',
    'Intelligence', 'Select Committee on Intelligence',
    'Ways and Means', 'Appropriations', 'Health  Education Labor and Pensions',
    'Banking, Housing and Urban Affairs', 'Finance'
]

## 2. Load Data

In [4]:
df = pd.read_csv(INPUT_TRADES)
print(f'Loaded {len(df):,} trades')
print(f'Columns: {len(df.columns)}')

Loaded 99,609 trades
Columns: 97


## 3. Create Trade-Level Features

Before aggregating, we need to create features at the individual trade level.

In [5]:
# Basic date features
df['trade_date'] = pd.to_datetime(df['Traded'])
df['filed_date'] = pd.to_datetime(df['Filed'])
df['trade_month'] = df['trade_date'].dt.to_period('M')
df['trade_year'] = df['trade_date'].dt.year

# Buy/sell
df['is_buy'] = (df['Transaction'] == 'Purchase').astype(int)
df['is_sell'] = df['Transaction'].str.contains('Sale', case=False, na=False).astype(int)

# Amount proxy
size_map = {
    '$1,001 - $15,000': 8000,
    '$15,001 - $50,000': 32500,
    '$50,001 - $100,000': 75000,
    '$100,001 - $250,000': 175000,
    '$250,001 - $500,000': 375000,
    '$500,001 - $1,000,000': 750000,
    '$1,000,001 - $5,000,000': 3000000,
    'Over $5,000,000': 7500000,
}
df['amount_proxy'] = df['Trade_Size_USD'].map(size_map).fillna(8000)
df['is_large_trade'] = (df['amount_proxy'] >= 100000).astype(int)

print(f'Buys: {df["is_buy"].sum():,}, Sells: {df["is_sell"].sum():,}')

Buys: 50,079, Sells: 49,088


### 3.1 Timing Features

In [6]:
# Disclosure delay
df['disclosure_delay'] = (df['filed_date'] - df['trade_date']).dt.days
df['disclosure_delay'] = df['disclosure_delay'].clip(lower=0, upper=365)

# Long delay indicator (> 30 days)
df['long_delay'] = (df['disclosure_delay'] > 30).astype(int)

# End of month (last 5 days)
df['day_of_month'] = df['trade_date'].dt.day
df['days_in_month'] = df['trade_date'].dt.daysinmonth
df['end_of_month'] = (df['days_in_month'] - df['day_of_month'] <= 5).astype(int)

# Day of week
df['day_of_week'] = df['trade_date'].dt.dayofweek
df['is_monday'] = (df['day_of_week'] == 0).astype(int)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)

print(f'Avg disclosure delay: {df["disclosure_delay"].mean():.1f} days')
print(f'Long delay (>30d): {df["long_delay"].mean()*100:.1f}%')
print(f'End of month: {df["end_of_month"].mean()*100:.1f}%')

Avg disclosure delay: 41.9 days
Long delay (>30d): 43.6%
End of month: 18.6%


### 3.2 Sector and Committee Features

In [7]:
# Load sector cache
def get_ticker_sectors(tickers, cache_file):
    try:
        cache_df = pd.read_csv(cache_file)
        cache = dict(zip(cache_df['ticker'], cache_df['sector']))
        print(f'Loaded cache: {len(cache)} tickers')
    except FileNotFoundError:
        cache = {}
        print('No cache found')
    
    tickers_unique = [t for t in set(tickers) if pd.notna(t)]
    missing = [t for t in tickers_unique if t not in cache]
    
    if missing:
        print(f'Fetching {len(missing)} new tickers...')
        for ticker in tqdm(missing[:500]):
            try:
                info = yf.Ticker(ticker).info
                cache[ticker] = info.get('sector', None)
            except:
                cache[ticker] = None
        cache_df = pd.DataFrame({'ticker': list(cache.keys()), 'sector': list(cache.values())})
        cache_df.to_csv(cache_file, index=False)
    
    return cache

sector_cache = get_ticker_sectors(df['Ticker_Clean'].dropna().unique(), SECTOR_CACHE)
df['sector'] = df['Ticker_Clean'].map(sector_cache)

# Normalize
sector_normalize = {
    'Information Technology': 'Technology',
    'Consumer Cyclical': 'Consumer Discretionary',
    'Financial Services': 'Financials',
    'Healthcare': 'Health Care',
    'Basic Materials': 'Materials',
}
df['sector'] = df['sector'].replace(sector_normalize)

print(f'Trades with sector: {df["sector"].notna().sum():,}')

Loaded cache: 600 tickers
Fetching 4083 new tickers...


  1%|          | 3/500 [00:02<07:20,  1.13it/s]HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: 37045XEF9"}}}
  3%|▎         | 14/500 [00:11<05:47,  1.40it/s]HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: SINT1"}}}
  5%|▌         | 27/500 [00:21<05:32,  1.42it/s]HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: 912797LL9"}}}
 10%|█         | 50/500 [00:38<05:07,  1.47it/s]HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: NEE.PRI"}}}
 15%|█▌        | 76/500 [00:56<05:04,  1.39it/s]HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: HAMILTO"}}}
 20%|██        | 102/500 [01:16<04:37,  1.43it/s]HTTP Error 404: {"quoteSummary":{"result":null,"error":{"cod

Trades with sector: 21,408





In [8]:
# Committee match
def get_committee_sectors(committee_name):
    if pd.isna(committee_name):
        return []
    for comm, sectors in COMMITTEE_TO_SECTORS.items():
        if comm.lower() in str(committee_name).lower() or str(committee_name).lower() in comm.lower():
            return sectors
    return []

def check_committee_match(row):
    if pd.isna(row.get('sector')) or pd.isna(row.get('committee_name')):
        return 0
    committee_sectors = get_committee_sectors(row['committee_name'])
    if not committee_sectors:
        return 0
    for cs in committee_sectors:
        if cs.lower() in str(row['sector']).lower() or str(row['sector']).lower() in cs.lower():
            return 1
    return 0

def is_info_committee(committee_name):
    if pd.isna(committee_name):
        return 0
    for ic in INFO_COMMITTEES:
        if ic.lower() in str(committee_name).lower():
            return 1
    return 0

df['committee_related'] = df.apply(check_committee_match, axis=1)
df['is_info_committee'] = df['committee_name'].apply(is_info_committee)

# Chair indicator
if 'committee_role' in df.columns:
    df['is_chair'] = df['committee_role'].str.lower().str.contains('chair|ranking', na=False).astype(int)
else:
    df['is_chair'] = 0

print(f'Committee related: {df["committee_related"].mean()*100:.1f}%')
print(f'Info committee: {df["is_info_committee"].mean()*100:.1f}%')

Committee related: 5.3%
Info committee: 41.4%


### 3.3 Power Features

In [9]:
# Seniority
if 'Years in position' in df.columns:
    df['years_in_position'] = pd.to_numeric(df['Years in position'], errors='coerce').fillna(0)
    df['is_senior'] = (df['years_in_position'] >= 10).astype(int)
else:
    df['years_in_position'] = 0
    df['is_senior'] = 0

# Net worth
if 'Net worth' in df.columns:
    df['net_worth'] = pd.to_numeric(df['Net worth'], errors='coerce').fillna(0)
    df['is_wealthy'] = (df['net_worth'] > df['net_worth'].median()).astype(int)
else:
    df['net_worth'] = 0
    df['is_wealthy'] = 0

# High profile (has social media presence)
if 'has_twitter' in df.columns and 'has_wikipedia' in df.columns:
    df['high_profile'] = ((df['has_twitter'] == True) | (df['has_wikipedia'] == True)).astype(int)
else:
    df['high_profile'] = 0

# Senate indicator
df['is_senator'] = (df['Chamber'] == 'Senate').astype(int)

# Power index (composite)
df['power_index'] = df['is_chair'] + df['is_senator'] + df['is_senior'] + df['is_info_committee']

print(f'Senior (>10 yrs): {df["is_senior"].mean()*100:.1f}%')
print(f'Avg power index: {df["power_index"].mean():.2f}')

Senior (>10 yrs): 5.4%
Avg power index: 0.95


### 3.4 Behavior Features

In [10]:
# Frequent trader (top 25% by trade count)
trader_counts = df.groupby('Name')['trade_id'].count()
frequent_threshold = trader_counts.quantile(0.75)
frequent_traders = trader_counts[trader_counts >= frequent_threshold].index
df['frequent_trader'] = df['Name'].isin(frequent_traders).astype(int)

# First time trading this stock
df = df.sort_values(['Name', 'Ticker_Clean', 'trade_date'])
df['first_time'] = (~df.duplicated(subset=['Name', 'Ticker_Clean'], keep='first')).astype(int)

# Direction change (was buying, now selling or vice versa)
df = df.sort_values(['Name', 'Ticker_Clean', 'trade_date'])
df['prev_is_buy'] = df.groupby(['Name', 'Ticker_Clean'])['is_buy'].shift(1)
df['direction_change'] = ((df['is_buy'] != df['prev_is_buy']) & df['prev_is_buy'].notna()).astype(int)

print(f'Frequent traders: {df["frequent_trader"].mean()*100:.1f}%')
print(f'First time: {df["first_time"].mean()*100:.1f}%')
print(f'Direction change: {df["direction_change"].mean()*100:.1f}%')

Frequent traders: 93.8%
First time: 18.6%
Direction change: 28.0%


### 3.5 Coordination Features

In [11]:
# Coordinated: multiple politicians trading same stock on same day
daily_stock_traders = df.groupby(['trade_date', 'Ticker_Clean'])['Name'].nunique().reset_index()
daily_stock_traders.columns = ['trade_date', 'Ticker_Clean', 'n_traders_same_day']
df = df.merge(daily_stock_traders, on=['trade_date', 'Ticker_Clean'], how='left')
df['coordinated'] = (df['n_traders_same_day'] >= 2).astype(int)

# Party coordinated: multiple from same party on same stock same day
party_stock_traders = df.groupby(['trade_date', 'Ticker_Clean', 'Party'])['Name'].nunique().reset_index()
party_stock_traders.columns = ['trade_date', 'Ticker_Clean', 'Party', 'n_party_traders']
df = df.merge(party_stock_traders, on=['trade_date', 'Ticker_Clean', 'Party'], how='left')
df['party_coordinated'] = (df['n_party_traders'] >= 2).astype(int)

# Committee coordinated: multiple from same committee on same stock same day
if 'committee_name' in df.columns:
    comm_stock_traders = df[df['committee_name'].notna()].groupby(['trade_date', 'Ticker_Clean', 'committee_name'])['Name'].nunique().reset_index()
    comm_stock_traders.columns = ['trade_date', 'Ticker_Clean', 'committee_name', 'n_comm_traders']
    df = df.merge(comm_stock_traders, on=['trade_date', 'Ticker_Clean', 'committee_name'], how='left')
    df['n_comm_traders'] = df['n_comm_traders'].fillna(1)
    df['committee_coordinated'] = (df['n_comm_traders'] >= 2).astype(int)
else:
    df['committee_coordinated'] = 0

print(f'Coordinated: {df["coordinated"].mean()*100:.1f}%')
print(f'Party coordinated: {df["party_coordinated"].mean()*100:.1f}%')
print(f'Committee coordinated: {df["committee_coordinated"].mean()*100:.1f}%')

Coordinated: 6.0%
Party coordinated: 2.0%
Committee coordinated: 0.2%


### 3.6 Context Features

In [12]:
# Using existing market data columns if available
# Contrarian: buying when stock falling, selling when rising
if 'momentum_20d' in df.columns:
    df['contrarian'] = ((df['is_buy'] == 1) & (df['momentum_20d'] < 0) | 
                        (df['is_sell'] == 1) & (df['momentum_20d'] > 0)).astype(int)
else:
    df['contrarian'] = 0

# High volatility
if 'realized_vol_30d' in df.columns:
    vol_median = df['realized_vol_30d'].median()
    df['high_vol'] = (df['realized_vol_30d'] > vol_median).astype(int)
else:
    df['high_vol'] = 0

# Illiquid
if 'amihud_illiq_20d' in df.columns:
    illiq_75 = df['amihud_illiq_20d'].quantile(0.75)
    df['illiquid'] = (df['amihud_illiq_20d'] > illiq_75).astype(int)
else:
    df['illiquid'] = 0

# Small cap
if 'market_cap' in df.columns:
    cap_25 = df['market_cap'].quantile(0.25)
    df['small_cap'] = (df['market_cap'] < cap_25).astype(int)
else:
    df['small_cap'] = 0

print(f'Contrarian: {df["contrarian"].mean()*100:.1f}%')
print(f'High vol: {df["high_vol"].mean()*100:.1f}%')
print(f'Illiquid: {df["illiquid"].mean()*100:.1f}%')
print(f'Small cap: {df["small_cap"].mean()*100:.1f}%')

Contrarian: 39.3%
High vol: 40.6%
Illiquid: 20.2%
Small cap: 19.8%


### 3.7 Composite Features

In [13]:
# Smart money: buy + info committee + chair
df['smart_money'] = ((df['is_buy'] == 1) & 
                     (df['is_info_committee'] == 1) & 
                     (df['is_chair'] == 1)).astype(int)

# Insider ring: committee coordinated (multiple from same committee)
df['insider_ring'] = df['committee_coordinated']

# Hidden trade: illiquid/small cap + info committee
df['hidden_trade'] = (((df['illiquid'] == 1) | (df['small_cap'] == 1)) & 
                      (df['is_info_committee'] == 1)).astype(int)

# Strong signal: committee related + large trade + first time
df['strong_signal'] = ((df['committee_related'] == 1) & 
                       (df['is_large_trade'] == 1) & 
                       (df['first_time'] == 1)).astype(int)

print(f'Smart money: {df["smart_money"].mean()*100:.2f}%')
print(f'Insider ring: {df["insider_ring"].mean()*100:.2f}%')
print(f'Hidden trade: {df["hidden_trade"].mean()*100:.2f}%')
print(f'Strong signal: {df["strong_signal"].mean()*100:.2f}%')

Smart money: 3.72%
Insider ring: 0.25%
Hidden trade: 8.10%
Strong signal: 0.02%


## 4. Download Market Data

In [14]:
# S&P 500
print('Downloading S&P 500...')
sp500 = yf.download('^GSPC', start=START_DATE, end=END_DATE, auto_adjust=True, progress=False)
sp500.columns = sp500.columns.get_level_values(0)
sp500_monthly = sp500['Close'].resample('ME').last()
sp500_ret = sp500_monthly.pct_change()

# VIX
print('Downloading VIX...')
vix = yf.download('^VIX', start=START_DATE, end=END_DATE, auto_adjust=True, progress=False)
vix.columns = vix.columns.get_level_values(0)
vix_monthly = vix['Close'].resample('ME').last()

# Realized volatility
sp500_vol = sp500['Close'].pct_change().resample('ME').std() * np.sqrt(252)

print(f'S&P 500: {len(sp500_monthly)} months')

Downloading S&P 500...
Downloading VIX...
S&P 500: 156 months


In [15]:
# Sector ETFs
print('Downloading sector ETFs...')
sector_returns = {}

for sector, etf in tqdm(SECTOR_ETFS.items()):
    try:
        data = yf.download(etf, start=START_DATE, end=END_DATE, auto_adjust=True, progress=False)
        data.columns = data.columns.get_level_values(0)
        monthly = data['Close'].resample('ME').last()
        sector_returns[sector] = monthly.pct_change()
    except Exception as e:
        print(f'Error {etf}: {e}')

df_sector_ret = pd.DataFrame(sector_returns)
df_sector_ret.index = df_sector_ret.index.to_period('M')

print(f'Sectors: {len(df_sector_ret.columns)}, Months: {len(df_sector_ret)}')

Downloading sector ETFs...


100%|██████████| 11/11 [00:08<00:00,  1.31it/s]

Sectors: 11, Months: 156





## 5. Aggregate to Monthly Level

In [16]:
def aggregate_monthly(df):
    """
    Comprehensive monthly aggregation with all feature categories.
    """
    
    agg = df.groupby('trade_month').agg(
        # === BASIC COUNTS ===
        cong_total_trades=('trade_id', 'count'),
        cong_buy_count=('is_buy', 'sum'),
        cong_sell_count=('is_sell', 'sum'),
        cong_unique_politicians=('Name', 'nunique'),
        cong_unique_stocks=('Ticker_Clean', 'nunique'),
        
        # === AMOUNTS ===
        cong_total_buy_amount=('amount_proxy', lambda x: (x * df.loc[x.index, 'is_buy']).sum()),
        cong_total_sell_amount=('amount_proxy', lambda x: (x * df.loc[x.index, 'is_sell']).sum()),
        cong_large_trades=('is_large_trade', 'sum'),
        
        # === TIMING ===
        cong_avg_disclosure_delay=('disclosure_delay', 'mean'),
        cong_long_delay_trades=('long_delay', 'sum'),
        cong_end_of_month_trades=('end_of_month', 'sum'),
        cong_monday_trades=('is_monday', 'sum'),
        cong_friday_trades=('is_friday', 'sum'),
        
        # === COORDINATION ===
        cong_coordinated_trades=('coordinated', 'sum'),
        cong_party_coordinated_trades=('party_coordinated', 'sum'),
        cong_committee_coordinated_trades=('committee_coordinated', 'sum'),
        
        # === BEHAVIOR ===
        cong_frequent_trader_trades=('frequent_trader', 'sum'),
        cong_first_time_trades=('first_time', 'sum'),
        cong_direction_change_trades=('direction_change', 'sum'),
        
        # === POWER ===
        cong_chair_trades=('is_chair', 'sum'),
        cong_senior_trades=('is_senior', 'sum'),
        cong_wealthy_trades=('is_wealthy', 'sum'),
        cong_high_profile_trades=('high_profile', 'sum'),
        cong_avg_power_index=('power_index', 'mean'),
        cong_avg_seniority=('years_in_position', 'mean'),
        cong_avg_networth=('net_worth', 'mean'),
        
        # === RELEVANCE ===
        cong_committee_related_trades=('committee_related', 'sum'),
        cong_info_committee_trades=('is_info_committee', 'sum'),
        
        # === CONTEXT ===
        cong_contrarian_trades=('contrarian', 'sum'),
        cong_high_vol_trades=('high_vol', 'sum'),
        cong_illiquid_trades=('illiquid', 'sum'),
        cong_small_cap_trades=('small_cap', 'sum'),
        
        # === COMPOSITE ===
        cong_smart_money_trades=('smart_money', 'sum'),
        cong_insider_ring_trades=('insider_ring', 'sum'),
        cong_hidden_trades=('hidden_trade', 'sum'),
        cong_strong_signal_trades=('strong_signal', 'sum'),
        
        # === BY PARTY ===
        cong_dem_trades=('Party', lambda x: (x == 'D').sum()),
        cong_rep_trades=('Party', lambda x: (x == 'R').sum()),
        cong_dem_buys=('is_buy', lambda x: (x & (df.loc[x.index, 'Party'] == 'D')).sum()),
        cong_rep_buys=('is_buy', lambda x: (x & (df.loc[x.index, 'Party'] == 'R')).sum()),
        cong_dem_sells=('is_sell', lambda x: (x & (df.loc[x.index, 'Party'] == 'D')).sum()),
        cong_rep_sells=('is_sell', lambda x: (x & (df.loc[x.index, 'Party'] == 'R')).sum()),
        
        # === BY CHAMBER ===
        cong_senate_trades=('is_senator', 'sum'),
        cong_house_trades=('is_senator', lambda x: (x == 0).sum()),
    )
    
    # === DERIVED FEATURES ===
    
    # Direction
    agg['cong_net'] = agg['cong_buy_count'] - agg['cong_sell_count']
    agg['cong_buy_ratio'] = agg['cong_buy_count'] / (agg['cong_total_trades'] + 1)
    agg['cong_csi'] = (agg['cong_buy_count'] - agg['cong_sell_count']) / (agg['cong_total_trades'] + 1)
    agg['cong_csi_volume'] = (agg['cong_total_buy_amount'] - agg['cong_total_sell_amount']) / \
                             (agg['cong_total_buy_amount'] + agg['cong_total_sell_amount'] + 1)
    
    # Party CSI
    agg['cong_csi_D'] = (agg['cong_dem_buys'] - agg['cong_dem_sells']) / (agg['cong_dem_trades'] + 1)
    agg['cong_csi_R'] = (agg['cong_rep_buys'] - agg['cong_rep_sells']) / (agg['cong_rep_trades'] + 1)
    agg['cong_dem_ratio'] = agg['cong_dem_trades'] / (agg['cong_total_trades'] + 1)
    
    # Chamber
    agg['cong_senate_ratio'] = agg['cong_senate_trades'] / (agg['cong_total_trades'] + 1)
    
    # Intensity and concentration
    agg['cong_intensity'] = agg['cong_total_trades'] / (agg['cong_unique_politicians'] + 1)
    agg['cong_concentration'] = agg['cong_total_trades'] / (agg['cong_unique_stocks'] + 1)
    agg['cong_trading_intensity'] = agg['cong_total_trades'] / agg['cong_total_trades'].rolling(12, min_periods=3).mean()
    
    # Proportions
    total = agg['cong_total_trades'] + 1
    agg['cong_pct_large'] = agg['cong_large_trades'] / total
    agg['cong_pct_long_delay'] = agg['cong_long_delay_trades'] / total
    agg['cong_pct_end_of_month'] = agg['cong_end_of_month_trades'] / total
    agg['cong_pct_coordinated'] = agg['cong_coordinated_trades'] / total
    agg['cong_pct_party_coordinated'] = agg['cong_party_coordinated_trades'] / total
    agg['cong_pct_committee_coordinated'] = agg['cong_committee_coordinated_trades'] / total
    agg['cong_pct_frequent_trader'] = agg['cong_frequent_trader_trades'] / total
    agg['cong_pct_first_time'] = agg['cong_first_time_trades'] / total
    agg['cong_pct_direction_change'] = agg['cong_direction_change_trades'] / total
    agg['cong_pct_chair'] = agg['cong_chair_trades'] / total
    agg['cong_pct_senior'] = agg['cong_senior_trades'] / total
    agg['cong_pct_committee_related'] = agg['cong_committee_related_trades'] / total
    agg['cong_pct_info_committee'] = agg['cong_info_committee_trades'] / total
    agg['cong_pct_contrarian'] = agg['cong_contrarian_trades'] / total
    agg['cong_pct_high_vol'] = agg['cong_high_vol_trades'] / total
    agg['cong_pct_illiquid'] = agg['cong_illiquid_trades'] / total
    agg['cong_pct_small_cap'] = agg['cong_small_cap_trades'] / total
    agg['cong_pct_smart_money'] = agg['cong_smart_money_trades'] / total
    agg['cong_pct_insider_ring'] = agg['cong_insider_ring_trades'] / total
    agg['cong_pct_hidden'] = agg['cong_hidden_trades'] / total
    agg['cong_pct_strong_signal'] = agg['cong_strong_signal_trades'] / total
    
    # Binary signals
    agg['cong_any_activity'] = (agg['cong_total_trades'] > 0).astype(int)
    agg['cong_consensus_buy'] = (agg['cong_buy_ratio'] > 0.7).astype(int)
    agg['cong_consensus_sell'] = (agg['cong_buy_ratio'] < 0.3).astype(int)
    agg['cong_bipartisan_buy'] = ((agg['cong_csi_D'] > 0) & (agg['cong_csi_R'] > 0)).astype(int)
    agg['cong_bipartisan_sell'] = ((agg['cong_csi_D'] < 0) & (agg['cong_csi_R'] < 0)).astype(int)
    agg['cong_bipartisan'] = (agg['cong_bipartisan_buy'] | agg['cong_bipartisan_sell']).astype(int)
    agg['cong_strong_buy'] = ((agg['cong_csi'] > 0) & (agg['cong_info_committee_trades'] > 0)).astype(int)
    
    return agg

In [17]:
congress_monthly = aggregate_monthly(df)

print(f'Monthly observations: {len(congress_monthly)}')
print(f'Features: {len(congress_monthly.columns)}')

Monthly observations: 163
Features: 83


In [18]:
# List all features by category
print('CONGRESSIONAL FEATURES:')
print('='*50)
for col in sorted(congress_monthly.columns):
    print(f'  {col}')

CONGRESSIONAL FEATURES:
  cong_any_activity
  cong_avg_disclosure_delay
  cong_avg_networth
  cong_avg_power_index
  cong_avg_seniority
  cong_bipartisan
  cong_bipartisan_buy
  cong_bipartisan_sell
  cong_buy_count
  cong_buy_ratio
  cong_chair_trades
  cong_committee_coordinated_trades
  cong_committee_related_trades
  cong_concentration
  cong_consensus_buy
  cong_consensus_sell
  cong_contrarian_trades
  cong_coordinated_trades
  cong_csi
  cong_csi_D
  cong_csi_R
  cong_csi_volume
  cong_dem_buys
  cong_dem_ratio
  cong_dem_sells
  cong_dem_trades
  cong_direction_change_trades
  cong_end_of_month_trades
  cong_first_time_trades
  cong_frequent_trader_trades
  cong_friday_trades
  cong_hidden_trades
  cong_high_profile_trades
  cong_high_vol_trades
  cong_house_trades
  cong_illiquid_trades
  cong_info_committee_trades
  cong_insider_ring_trades
  cong_intensity
  cong_large_trades
  cong_long_delay_trades
  cong_monday_trades
  cong_net
  cong_party_coordinated_trades
  cong_pct_

## 6. Aggregate by Sector

In [19]:
def aggregate_by_sector(df):
    """Aggregate by sector x month."""
    
    df_sec = df[df['sector'].notna()].copy()
    
    agg = df_sec.groupby(['trade_month', 'sector']).agg(
        cong_total_trades=('trade_id', 'count'),
        cong_buy_count=('is_buy', 'sum'),
        cong_sell_count=('is_sell', 'sum'),
        cong_unique_politicians=('Name', 'nunique'),
        cong_committee_related_trades=('committee_related', 'sum'),
        cong_info_committee_trades=('is_info_committee', 'sum'),
        cong_chair_trades=('is_chair', 'sum'),
        cong_large_trades=('is_large_trade', 'sum'),
        cong_coordinated_trades=('coordinated', 'sum'),
        cong_first_time_trades=('first_time', 'sum'),
        cong_smart_money_trades=('smart_money', 'sum'),
        cong_avg_disclosure_delay=('disclosure_delay', 'mean'),
    ).reset_index()
    
    # Committee-related buys/sells
    comm_agg = df_sec[df_sec['committee_related'] == 1].groupby(['trade_month', 'sector']).agg(
        cong_buys_committee=('is_buy', 'sum'),
        cong_sells_committee=('is_sell', 'sum'),
    ).reset_index()
    
    agg = agg.merge(comm_agg, on=['trade_month', 'sector'], how='left')
    agg['cong_buys_committee'] = agg['cong_buys_committee'].fillna(0)
    agg['cong_sells_committee'] = agg['cong_sells_committee'].fillna(0)
    
    # Derived
    agg['cong_csi_sector'] = (agg['cong_buy_count'] - agg['cong_sell_count']) / (agg['cong_total_trades'] + 1)
    n_comm = agg['cong_buys_committee'] + agg['cong_sells_committee']
    agg['cong_csi_committee'] = (agg['cong_buys_committee'] - agg['cong_sells_committee']) / (n_comm + 1)
    
    total = agg['cong_total_trades'] + 1
    agg['cong_pct_committee_related'] = agg['cong_committee_related_trades'] / total
    agg['cong_pct_large'] = agg['cong_large_trades'] / total
    agg['cong_pct_coordinated'] = agg['cong_coordinated_trades'] / total
    agg['cong_pct_first_time'] = agg['cong_first_time_trades'] / total
    agg['cong_intensity'] = agg['cong_total_trades'] / (agg['cong_unique_politicians'] + 1)
    
    agg['cong_trading_intensity'] = agg.groupby('sector')['cong_total_trades'].transform(
        lambda x: x / x.rolling(12, min_periods=3).mean()
    )
    
    return agg

In [20]:
sector_monthly = aggregate_by_sector(df)

print(f'Sector-month observations: {len(sector_monthly)}')
print(f'Sectors: {sector_monthly["sector"].nunique()}')

Sector-month observations: 1559
Sectors: 12


## 7. Build Final Bases

In [21]:
# S&P 500 base
market_data = pd.DataFrame({
    'sp500_ret': sp500_ret,
    'sp500_ret_lag': sp500_ret.shift(1),
    'sp500_ret_lag2': sp500_ret.shift(2),
    'sp500_vol': sp500_vol,
    'vix': vix_monthly,
    'sp500_mom_3m': sp500_monthly.pct_change(3),
    'sp500_mom_6m': sp500_monthly.pct_change(6),
    'sp500_mom_12m': sp500_monthly.pct_change(12),
    'ret_sp500_1m': sp500_ret.shift(-1),  # TARGET
})
market_data.index = market_data.index.to_period('M')

base_sp500 = market_data.join(congress_monthly, how='left')

# Fill missing
for col in congress_monthly.columns:
    if col in base_sp500.columns:
        base_sp500[col] = base_sp500[col].fillna(0)

base_sp500 = base_sp500[base_sp500['ret_sp500_1m'].notna()]

print(f'S&P 500 base: {len(base_sp500)} obs, {len(base_sp500.columns)} cols')

S&P 500 base: 155 obs, 92 cols


In [22]:
# Sector base
sector_long = df_sector_ret.stack().reset_index()
sector_long.columns = ['period', 'sector', 'ret_sector']

sector_long['ret_sector_lag'] = sector_long.groupby('sector')['ret_sector'].shift(1)
sector_long['ret_sector_lag2'] = sector_long.groupby('sector')['ret_sector'].shift(2)
sector_long['vol_sector_3m'] = sector_long.groupby('sector')['ret_sector'].transform(
    lambda x: x.rolling(3, min_periods=1).std()
)
sector_long['mom_sector_3m'] = sector_long.groupby('sector')['ret_sector'].transform(
    lambda x: x.rolling(3, min_periods=1).sum()
)
sector_long['mom_sector_6m'] = sector_long.groupby('sector')['ret_sector'].transform(
    lambda x: x.rolling(6, min_periods=1).sum()
)
sector_long['ret_sector_1m'] = sector_long.groupby('sector')['ret_sector'].shift(-1)  # TARGET

sector_monthly['period'] = sector_monthly['trade_month']

base_sector = sector_long.merge(sector_monthly, on=['period', 'sector'], how='left')

# Fill missing
cong_cols = [c for c in sector_monthly.columns if c.startswith('cong_')]
for col in cong_cols:
    if col in base_sector.columns:
        base_sector[col] = base_sector[col].fillna(0)

base_sector = base_sector[base_sector['ret_sector_1m'].notna()]

print(f'Sector base: {len(base_sector)} obs, {len(base_sector.columns)} cols')

Sector base: 1583 obs, 32 cols


## 8. Quality Check

In [23]:
# Correlation with target
print('TOP CORRELATIONS WITH ret_sp500_1m:')
print('='*50)
cong_features = [c for c in base_sp500.columns if c.startswith('cong_')]
corr = base_sp500[cong_features + ['ret_sp500_1m']].corr()['ret_sp500_1m'].drop('ret_sp500_1m')
print(corr.sort_values(key=abs, ascending=False).head(15).round(4))

TOP CORRELATIONS WITH ret_sp500_1m:
cong_csi_R                       0.1604
cong_senior_trades               0.1563
cong_bipartisan_buy              0.1316
cong_consensus_buy               0.1208
cong_pct_high_vol                0.1158
cong_high_vol_trades             0.1146
cong_pct_chair                  -0.1107
cong_net                         0.0990
cong_pct_strong_signal          -0.0949
cong_pct_info_committee          0.0947
cong_dem_ratio                   0.0925
cong_party_coordinated_trades    0.0887
cong_pct_smart_money             0.0846
cong_strong_signal_trades       -0.0844
cong_avg_seniority               0.0840
Name: ret_sp500_1m, dtype: float64


## 9. Save

In [24]:
sp500_path = os.path.join(OUTPUT_DIR, 'base_sp500_monthly.csv')
sector_path = os.path.join(OUTPUT_DIR, 'base_sector_monthly.csv')

base_sp500.to_csv(sp500_path)
base_sector.to_csv(sector_path, index=False)

print(f'Saved: {sp500_path}')
print(f'Saved: {sector_path}')

Saved: data/prediction_bases/base_sp500_monthly.csv
Saved: data/prediction_bases/base_sector_monthly.csv


## 10. Summary

In [25]:
print('='*70)
print('BASE CONSTRUCTION COMPLETE')
print('='*70)

print(f'\nS&P 500 BASE: {len(base_sp500)} months, {len(base_sp500.columns)} features')
print(f'SECTOR BASE: {len(base_sector)} sector-months, {len(base_sector.columns)} features')

print('\nFEATURE CATEGORIES:')
categories = {
    'Direction': ['cong_csi', 'cong_csi_volume', 'cong_net', 'cong_buy_ratio'],
    'Timing': ['cong_avg_disclosure_delay', 'cong_pct_long_delay', 'cong_pct_end_of_month'],
    'Coordination': ['cong_pct_coordinated', 'cong_pct_party_coordinated', 'cong_pct_committee_coordinated'],
    'Behavior': ['cong_pct_frequent_trader', 'cong_pct_first_time', 'cong_pct_direction_change'],
    'Power': ['cong_pct_chair', 'cong_pct_senior', 'cong_avg_power_index'],
    'Relevance': ['cong_pct_committee_related', 'cong_pct_info_committee'],
    'Context': ['cong_pct_contrarian', 'cong_pct_high_vol', 'cong_pct_illiquid', 'cong_pct_small_cap'],
    'Composite': ['cong_pct_smart_money', 'cong_pct_insider_ring', 'cong_pct_hidden', 'cong_strong_buy'],
    'Party': ['cong_csi_D', 'cong_csi_R', 'cong_bipartisan'],
}

for cat, features in categories.items():
    available = [f for f in features if f in base_sp500.columns]
    print(f'  {cat}: {len(available)} features')

print(f'\nFILES: {OUTPUT_DIR}/')
print('='*70)

BASE CONSTRUCTION COMPLETE

S&P 500 BASE: 155 months, 92 features
SECTOR BASE: 1583 sector-months, 32 features

FEATURE CATEGORIES:
  Direction: 4 features
  Timing: 3 features
  Coordination: 3 features
  Behavior: 3 features
  Power: 3 features
  Relevance: 2 features
  Context: 4 features
  Composite: 4 features
  Party: 3 features

FILES: data/prediction_bases/


In [26]:
print(base_sp500.isnull().sum().sum())
print(base_sector.isnull().sum().sum())

27
478


In [27]:
# Ver cuáles tienen NaN
print("S&P 500 NaN:")
print(base_sp500.isnull().sum()[base_sp500.isnull().sum() > 0])

print("\nSector NaN:")
print(base_sector.isnull().sum()[base_sector.isnull().sum() > 0])

S&P 500 NaN:
sp500_ret          1
sp500_ret_lag      2
sp500_ret_lag2     3
sp500_mom_3m       3
sp500_mom_6m       6
sp500_mom_12m     12
dtype: int64

Sector NaN:
ret_sector          11
ret_sector_lag      22
ret_sector_lag2     33
vol_sector_3m       22
mom_sector_3m       11
mom_sector_6m       11
trade_month        368
dtype: int64


In [28]:
# S&P 500: dropear las primeras 12 filas (por mom_12m)
base_sp500_clean = base_sp500.dropna()
print(f"S&P 500: {len(base_sp500)} -> {len(base_sp500_clean)}")

# Sector: dropear filas con NaN en variables públicas
# (trade_month NaN es porque no hubo trades, las cong_ ya son 0)
base_sector_clean = base_sector.dropna(subset=['ret_sector', 'ret_sector_lag', 'ret_sector_lag2', 
                                                'vol_sector_3m', 'mom_sector_3m', 'mom_sector_6m'])
print(f"Sector: {len(base_sector)} -> {len(base_sector_clean)}")

# Verificar
print(f"\nNaN restantes: {base_sp500_clean.isnull().sum().sum()}, {base_sector_clean.isnull().sum().sum()}")

S&P 500: 155 -> 143
Sector: 1583 -> 1550

NaN restantes: 0, 338


In [29]:
# Dropear la columna trade_month de sector (es redundante, ya tenés 'period')
base_sector_clean = base_sector_clean.drop(columns=['trade_month'])

# Verificar
print(f"NaN restantes: {base_sector_clean.isnull().sum().sum()}")

NaN restantes: 0


In [31]:
base_sp500_clean.to_csv('data/prediction_bases/base_sp500_monthly.csv')
base_sector_clean.to_csv('data/prediction_bases/base_sector_monthly.csv', index=False)