WRDS Options Cleaner

**Objective:**

1. **Audit:** Peek inside massive CSV files to see what dates/tickers they contain without loading the whole file.
    
2. **Filter:** Extract only the tickers we care about (e.g., `SPX` for S&P 500) to create a manageable dataset.
    
3. **Compress:** Save the cleaned data to `.parquet` format (much faster and smaller than CSV).
    

**Target Ticker:** `SPX` (Standard S&P 500 options) - Best for Heston calibration.

## Imports and Setup

In [6]:
import pandas as pd
import os
import time

# --- 1. Configuration ---
# Ticker symbol whose rows are kept
TARGET_TICKER = "SPX"

# Folder containing the downloaded files -- point this at your own copy
DATA_DIR = r"G:\My Drive\00) Interview Prep\00) Quant\Data Sources\WRDS Data\Returns\Options"

# Input archives: these must match the names of your .gz files exactly
FILE_VOL_SURFACE = "Options_VolatilitySurface_S&PDATAONLY_AllYears.gz"    # UPDATE THIS
FILE_PRICES      = "Options_Price_S&PDATAONLY_AllYears.gz"       # UPDATE THIS

# Names of the cleaned Parquet outputs
OUT_VOL_SURFACE = "SPX_VolSurface_Cleaned.parquet"
OUT_PRICES      = "SPX_OptionPrices_Cleaned.parquet"

# --- 2. Processing Function ---
def _select_target_rows(chunk, fallback_secid):
    """Return the rows of *chunk* that belong to the target underlying.

    Prefers the ``ticker`` column (matched against ``TARGET_TICKER``); falls
    back to ``secid`` when there is no ticker column; returns the chunk
    unchanged when neither column exists (the file is assumed pre-filtered).
    """
    if 'ticker' in chunk.columns:
        return chunk[chunk['ticker'] == TARGET_TICKER]
    if 'secid' in chunk.columns:
        return chunk[chunk['secid'] == fallback_secid]
    return chunk


def _parse_date_column(series):
    """Convert a date column to datetime64, coercing unparseable values to NaT.

    Integer columns are parsed as YYYYMMDD. Without an explicit format,
    ``pd.to_datetime`` treats raw integers as nanoseconds since the epoch and
    silently yields 1970 timestamps.
    NOTE(review): assumes integer-typed dates are YYYYMMDD -- confirm against
    the export settings used for the download.
    """
    if pd.api.types.is_integer_dtype(series):
        return pd.to_datetime(series.astype(str), format='%Y%m%d', errors='coerce')
    return pd.to_datetime(series, errors='coerce')


def process_to_parquet(input_filename, output_filename, target_cols, file_type,
                       chunksize=500_000, fallback_secid=108105):
    """Stream a (possibly compressed) CSV, keep the target rows, save as Parquet.

    The input is read from ``DATA_DIR`` in chunks so the whole file never has
    to fit in memory; only the filtered rows are accumulated. Column names are
    lower-cased before any matching.

    Args:
        input_filename: Name of the CSV/.gz file inside ``DATA_DIR``.
        output_filename: Name of the Parquet file to write inside ``DATA_DIR``.
        target_cols: Columns to keep. Columns missing from the file are
            skipped; the output preserves the order given here.
        file_type: Human-readable label used in progress messages.
        chunksize: Number of CSV rows parsed per chunk.
        fallback_secid: ``secid`` value used for filtering when the file has
            no ``ticker`` column (default 108105, the value the original code
            used for SPX).

    Returns:
        None. Progress and outcome are reported with ``print``. A missing
        input file or an empty filter result ends the call early, and any
        exception raised while reading or writing is caught and printed
        rather than propagated, so one bad file does not abort the run.
    """
    input_path = os.path.join(DATA_DIR, input_filename)
    output_path = os.path.join(DATA_DIR, output_filename)

    if not os.path.exists(input_path):
        print(f"❌ Skipping {file_type}: Could not find {input_filename}")
        return

    print(f"\n🏭 Processing {file_type}: {input_filename}...")
    start_time = time.time()

    # De-duplicate while keeping the caller's order so the output schema is
    # deterministic (a set intersection would give an arbitrary column order).
    wanted_cols = list(dict.fromkeys(target_cols))
    filtered_chunks = []
    rows_scanned = 0

    try:
        # Read in chunks, decompressing on the fly. The context manager makes
        # sure the underlying file handle is closed even if a chunk fails.
        with pd.read_csv(input_path, chunksize=chunksize, compression='infer',
                         low_memory=False) as chunk_iter:
            for chunk_number, chunk in enumerate(chunk_iter, start=1):
                rows_scanned += len(chunk)
                chunk.columns = [c.lower() for c in chunk.columns]

                target_data = _select_target_rows(chunk, fallback_secid)

                if not target_data.empty:
                    # Keep only the requested columns that this file has
                    available_cols = [c for c in wanted_cols if c in target_data.columns]
                    filtered_chunks.append(target_data[available_cols])

                # Report the true number of rows read so far, every 5 chunks
                if chunk_number % 5 == 0:
                    print(f"  > Scanned {rows_scanned:,} rows...")

        if not filtered_chunks:
            print(f"❌ No {TARGET_TICKER} data found in {input_filename}!")
            return

        # Combine and save
        full_df = pd.concat(filtered_chunks, ignore_index=True)

        # Convert dates
        for date_col in ('date', 'exdate'):
            if date_col in full_df.columns:
                full_df[date_col] = _parse_date_column(full_df[date_col])

        full_df.to_parquet(output_path, index=False)

        elapsed = round(time.time() - start_time, 2)
        print(f"✅ Success! Saved {len(full_df):,} rows to {output_filename} in {elapsed}s.")

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and let the
        # caller carry on with the next file.
        print(f"❌ Error processing {input_filename}: {e}")

# --- 3. Execute ---

# Fields kept from the option-price file (used by the Heston Playground)
price_cols = [
    'date',
    'exdate',
    'cp_flag',
    'strike_price',
    'best_bid',
    'best_offer',
    'impl_volatility',
    'ticker',
]

# Fields kept from the volatility-surface file (used for Machine Learning)
surface_cols = [
    'date',
    'days',
    'delta',
    'impl_volatility',
    'ticker',
]

# Convert both archives, prices first
process_to_parquet(
    input_filename=FILE_PRICES,
    output_filename=OUT_PRICES,
    target_cols=price_cols,
    file_type="Option Prices",
)
process_to_parquet(
    input_filename=FILE_VOL_SURFACE,
    output_filename=OUT_VOL_SURFACE,
    target_cols=surface_cols,
    file_type="Volatility Surface",
)


🏭 Processing Option Prices: Options_Price_S&PDATAONLY_AllYears.gz...
  > Scanned 2,500,000 rows...
  > Scanned 5,000,000 rows...
  > Scanned 7,500,000 rows...
  > Scanned 10,000,000 rows...
  > Scanned 12,500,000 rows...
  > Scanned 15,000,000 rows...
  > Scanned 17,500,000 rows...
  > Scanned 20,000,000 rows...
  > Scanned 22,500,000 rows...
  > Scanned 25,000,000 rows...
  > Scanned 27,500,000 rows...
  > Scanned 30,000,000 rows...
  > Scanned 32,500,000 rows...
  > Scanned 35,000,000 rows...
  > Scanned 37,500,000 rows...
  > Scanned 40,000,000 rows...
  > Scanned 42,500,000 rows...
  > Scanned 45,000,000 rows...
  > Scanned 47,500,000 rows...
  > Scanned 50,000,000 rows...
  > Scanned 52,500,000 rows...
  > Scanned 55,000,000 rows...
  > Scanned 57,500,000 rows...
  > Scanned 60,000,000 rows...
  > Scanned 62,500,000 rows...
  > Scanned 65,000,000 rows...
  > Scanned 67,500,000 rows...
  > Scanned 70,000,000 rows...
  > Scanned 72,500,000 rows...
  > Scanned 75,000,000 rows...
