## Imports and Setup

In [1]:
import pandas as pd
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as si

## Convert WRDS data to .parquet

In [None]:
# --- 1. Configuration (Updated Path) ---
# Folder that the converter joins with every input and output file name below.
BASE_DIR = r"G:\My Drive\00) Interview Prep\00) Quant\Data Sources\WRDS Data\Returns\Options"

# Each dataset is declared as a pair: the gzipped source file (FILE_*)
# followed by the Parquet file it is converted to (OUT_*).

# Index spot prices
FILE_SPOT = "IndexPrices_alltime.gz"
OUT_SPOT = "SPX_IndexPrices.parquet"

# Zero-coupon yield curve
FILE_YIELD = "ZeroCouponYieldCurve.gz"
OUT_YIELD = "ZeroCouponYieldCurve.parquet"

# Index dividend yields
FILE_DIV = "IndexDividendYields.gz"
OUT_DIV = "SPX_IndexDividendYields.parquet"

# Identifiers used to keep only the target index when filtering is requested.
TARGET_TICKER = "SPX"
TARGET_SECID = 108105  # OptionMetrics SPX SecID

def convert_ancillary_file(input_file, output_file, filter_spx=False):
    """Convert one gzipped CSV export in ``BASE_DIR`` into a Parquet file.

    The CSV is streamed in chunks, column names are lower-cased, rows are
    optionally restricted to the target index, and a ``date`` column (when
    present) is parsed to datetimes before the combined frame is written.

    Args:
        input_file: Name of the gzip-compressed CSV inside ``BASE_DIR``.
        output_file: Name of the Parquet file to write inside ``BASE_DIR``.
        filter_spx: When true, keep only rows whose ``ticker`` equals
            ``TARGET_TICKER``; if there is no ``ticker`` column, keep rows
            whose ``secid`` equals ``TARGET_SECID``. If neither column
            exists, all rows are kept.

    Returns:
        None. Progress and failures are reported via ``print``. Any
        exception raised while reading, transforming or writing is caught
        and printed rather than propagated, so one bad file does not stop
        the remaining conversions.
    """
    input_path = os.path.join(BASE_DIR, input_file)
    output_path = os.path.join(BASE_DIR, output_file)

    if not os.path.exists(input_path):
        print(f"‚ùå Could not find {input_file}")
        return

    print(f"üè≠ Converting {input_file}...")
    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()

    try:
        # Stream the file in chunks so large inputs never have to be held in
        # memory unfiltered. Using the reader as a context manager guarantees
        # the underlying gzip handle is closed even if a chunk fails to parse.
        chunks = []
        with pd.read_csv(
            input_path,
            chunksize=250_000,
            compression='gzip',
            low_memory=False,
        ) as reader:
            for chunk in reader:
                chunk.columns = [c.lower() for c in chunk.columns]

                if filter_spx:
                    # Prefer the ticker column; fall back to secid. Boolean
                    # indexing already returns a new frame, so no extra
                    # copy is needed. With neither column the chunk is kept
                    # as-is.
                    if 'ticker' in chunk.columns:
                        chunk = chunk[chunk['ticker'] == TARGET_TICKER]
                    elif 'secid' in chunk.columns:
                        chunk = chunk[chunk['secid'] == TARGET_SECID]

                if not chunk.empty:
                    chunks.append(chunk)

        if not chunks:
            print(f"‚ùå No data extracted for {input_file}.")
            return

        full_df = pd.concat(chunks, ignore_index=True)

        # Standardize date columns
        if 'date' in full_df.columns:
            date_col = full_df['date']
            is_number = (
                pd.api.types.is_integer_dtype(date_col)
                or pd.api.types.is_float_dtype(date_col)
            )
            if is_number:
                # read_csv infers YYYYMMDD dates (e.g. 20200102) as numbers,
                # and to_datetime would interpret those as nanoseconds since
                # the epoch. Render them as text and parse with an explicit
                # format instead; missing values become NaT via 'coerce'.
                date_text = date_col.round().astype('Int64').astype(str)
                full_df['date'] = pd.to_datetime(
                    date_text, format='%Y%m%d', errors='coerce'
                )
            else:
                full_df['date'] = pd.to_datetime(date_col, errors='coerce')

        full_df.to_parquet(output_path, index=False)
        elapsed = round(time.perf_counter() - start_time, 2)
        print(f"‚úÖ Saved {len(full_df):,} rows to {output_file} in {elapsed}s.\n")

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and return so
        # the caller can continue with the other files.
        print(f"‚ùå Error processing {input_file}: {e}\n")

# --- 2. Run the Conversions ---
# Each entry is (source file, Parquet target, restrict to SPX?).
# Spot prices and dividend yields are narrowed to SPX; the yield curve is
# not index-specific, so it is converted in full.
_CONVERSION_JOBS = (
    (FILE_SPOT, OUT_SPOT, True),
    (FILE_YIELD, OUT_YIELD, False),
    (FILE_DIV, OUT_DIV, True),
)

for _source_name, _parquet_name, _spx_only in _CONVERSION_JOBS:
    convert_ancillary_file(_source_name, _parquet_name, filter_spx=_spx_only)