# CRSP Data Exploration

Explore the structure and contents of the CRSP data files.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root directory of the CRSP data set. The path is relative, so it is
# resolved against the notebook's current working directory; the cells below
# build every file path from it.
DATA_PATH = Path("../US_CRSP_NYSE/")

## 1. Available Files

In [None]:
# List all files
# Print every CSV under DATA_PATH (searched recursively) with its size in MB
# (bytes / 1e6). The directory walk yields entries in filesystem order, which
# can differ between machines, so the paths are sorted to keep the listing
# stable across runs.
csv_files = sorted(DATA_PATH.rglob("*.csv"))
if not csv_files:
    # An empty listing usually means DATA_PATH does not point at the data
    # directory from the current working directory; say so instead of
    # printing nothing.
    print(f"No CSV files found under {DATA_PATH.resolve()}")
for f in csv_files:
    print(f"{f.relative_to(DATA_PATH)} - {f.stat().st_size / 1e6:.1f} MB")

## 2. Main Data Files (Matrix Format)

Files:
- `OPCL`: Open-to-Close returns
- `pvCLCL`: Previous Close-to-Close returns  
- `volume`: Trading volume
- `volMM`: Volume in millions

In [None]:
# Close-to-close returns: the main return series used in this notebook.
# The first CSV column is used as the row index.
returns_df = pd.read_csv(
    DATA_PATH / "Matrix_Format_SubsetUniverse/pvCLCL_20000103_20201231.csv",
    index_col=0,
)

# Report the overall shape, then each axis on its own line.
print(f"Shape: {returns_df.shape}")
print(f"Tickers (rows): {returns_df.shape[0]}")
print(f"Dates (cols): {returns_df.shape[1]}")

# Last expression of the cell: preview the first rows.
returns_df.head()

In [None]:
# Check date range
# The column labels of returns_df are taken as the dates, in the order they
# appear in the file. "First" and "last" therefore refer to column position.
# NOTE(review): assumes the columns are already in chronological order - confirm.
dates = returns_df.columns.tolist()
print(f"First date: {dates[0]}")
print(f"Last date: {dates[-1]}")
# One column per date, so the column count is the number of trading days.
print(f"Total trading days: {len(dates)}")

In [None]:
# Sample tickers
# Show the first 20 row labels of returns_df (fewer if it has fewer rows).
print("Sample tickers:")
print(returns_df.index[:20].tolist())

In [None]:
# Share of missing entries across the whole returns matrix, as a percentage.
# Count the NaN cells first (per column, then summed over columns) and divide
# by the total number of cells.
n_missing = returns_df.isna().sum().sum()
missing_pct = n_missing / returns_df.size * 100
print(f"Missing values: {missing_pct:.2f}%")

In [None]:
# Trading volume matrix, read with the first CSV column as the row index.
volume_df = pd.read_csv(
    DATA_PATH / "Matrix_Format_SubsetUniverse/volume_20000103_20201231.csv",
    index_col=0,
)
print(f"Volume shape: {volume_df.shape}")

# Last expression of the cell: preview the first rows.
volume_df.head()

In [None]:
# Open-to-close returns matrix, read with the first CSV column as the row index.
opcl_df = pd.read_csv(
    DATA_PATH / "Matrix_Format_SubsetUniverse/OPCL_20000103_20201231.csv",
    index_col=0,
)
print(f"OPCL shape: {opcl_df.shape}")

# Last expression of the cell: preview the first rows.
opcl_df.head()

## 3. Sector Data

In [None]:
# S&P 500 sector table. No index column is specified, so pandas assigns a
# default integer index and every CSV column stays a regular column.
sectors_sp500 = pd.read_csv(
    DATA_PATH / "Sectors/Sectors_SP500_YahooNWikipedia.csv"
)
print(f"S&P 500 sectors shape: {sectors_sp500.shape}")

# Last expression of the cell: preview the first 10 rows.
sectors_sp500.head(10)

In [None]:
# Column names
# Print the header names of the S&P 500 sector table as a plain list.
print("Columns:", sectors_sp500.columns.tolist())

In [None]:
# Unique sectors
# Print the number of rows per sector, using the first of the known sector
# column names that is present in the table.
print("\nUnique sectors:")
for sector_name in ('Sector', 'GICS Sector'):
    if sector_name in sectors_sp500.columns:
        print(sectors_sp500[sector_name].value_counts())
        break
else:
    # Neither name exists: show the first rows of every column after the
    # first one so the sector column can be identified by eye.
    print(sectors_sp500.iloc[:, 1:].head())

In [None]:
# S&P 1500 sector table, read with pandas' default integer index.
sectors_sp1500 = pd.read_csv(
    DATA_PATH / "Sectors/Sectors_SP1500.csv"
)
print(f"S&P 1500 sectors shape: {sectors_sp1500.shape}")

# Last expression of the cell: preview the first rows.
sectors_sp1500.head()

## 4. Filter Energy Sector Stocks

In [None]:
# Find energy stocks
# Locate the candidate sector and ticker columns by a case-insensitive
# substring match on the header names. Both results are lists and may be
# empty or hold several names.
column_names = list(sectors_sp500.columns)
sector_col = [name for name in column_names if 'sector' in name.lower()]
ticker_col = [
    name
    for name in column_names
    if any(key in name.lower() for key in ('symbol', 'ticker'))
]

print(f"Sector column: {sector_col}")
print(f"Ticker column: {ticker_col}")

In [None]:
# Extract energy tickers
# Uses the first matching sector column and the first matching ticker column.
# A row is kept when its sector value contains "energy" (case-insensitive);
# rows with a missing sector value are treated as non-matches (na=False).
if sector_col and ticker_col:
    energy_mask = sectors_sp500[sector_col[0]].str.contains('Energy', case=False, na=False)
    energy_tickers = sectors_sp500.loc[energy_mask, ticker_col[0]].tolist()
    print(f"Energy sector stocks: {len(energy_tickers)}")
    print(energy_tickers)
else:
    # Without both columns the filter cannot run. Report it explicitly rather
    # than finishing silently with no energy ticker list produced.
    print(
        "Could not identify a sector column and a ticker column in "
        "sectors_sp500; skipping energy filter."
    )

In [None]:
# Check overlap with returns data
# Runs only when a variable named energy_tickers exists in the notebook scope.
if 'energy_tickers' in dir():
    # Keep the energy tickers that appear among the row labels of returns_df,
    # preserving their order in energy_tickers.
    available_energy = []
    for ticker in energy_tickers:
        if ticker in returns_df.index:
            available_energy.append(ticker)
    print(f"Energy tickers in returns data: {len(available_energy)} / {len(energy_tickers)}")
    print(available_energy)

## 5. Summary Statistics

In [None]:
# Basic stats for a few tickers
sample_tickers = returns_df.index[:5].tolist()
returns_df.loc[sample_tickers].T.describe()

In [None]:
# Check if returns are in percentage or decimal form
print("Sample return values:")
print(returns_df.iloc[0, :10].values)
print(f"\nMean of first ticker: {returns_df.iloc[0].mean():.6f}")
print(f"Std of first ticker: {returns_df.iloc[0].std():.6f}")