In [None]:
"""
This file contains our SQL queries to Wharton Research Data Services (WRDS).

We connect to the database, and then submit a query per year. This code was
written by Keyi Wang, and is used to pull data from WRDS for the S&P 500 index
options data, S&P 500 index closing prices, and risk-free rate data.
"""

import time
from pathlib import Path

import pandas as pd
import wrds

from settings import config

# Project settings resolved through `settings.config`: the WRDS account name
# and the two project directories (wrapped in Path for path arithmetic).
WRDS_USERNAME = config("WRDS_USERNAME")
DATA_DIR = Path(config("DATA_DIR"))
OUTPUT_DIR = Path(config("OUTPUT_DIR"))

In [None]:
import wrds
import pandas as pd
from decouple import config
from dateutil.relativedelta import relativedelta

# Download S&P 500 index option quotes from OptionMetrics, one yearly table
# (optionm.opprcdYYYY) at a time, keeping only rows dated the 15th of a month.
# The per-year DataFrames are collected in `option_data_list`.

# Read WRDS_USERNAME from .env
WRDS_USERNAME = config("WRDS_USERNAME")

# SECID for S&P 500 options
secid = 108105

# Define the range of years (1996 through 2021 inclusive)
years = range(1996, 2022)

# Store query results
option_data_list = []

# Connect to WRDS
db = wrds.Connection(wrds_username=WRDS_USERNAME)
try:
    # The table listing does not depend on the year, so fetch it once instead
    # of issuing the same metadata request on every loop iteration.
    tables = set(db.list_tables('optionm'))

    for year in years:
        # Check if the table exists
        if f"opprcd{year}" not in tables:
            print(f"⚠️ Option data table for {year} does not exist, skipping...")
            continue

        # Query S&P 500 options data (keep all records from the 15th of each month)
        option_query = f"""
        SELECT date, secid, strike_price, cp_flag, best_bid, best_offer, volume, 
               open_interest, impl_volatility, exdate
        FROM optionm.opprcd{year}
        WHERE secid = {secid} 
        AND EXTRACT(DAY FROM date) = 15
        """
        option_data_list.append(db.raw_sql(option_query))
finally:
    # Close WRDS connection even if one of the queries raised.
    db.close()




Loading library list...
Done
✅ 数据下载完成！


In [None]:
def _whole_months_between(start, end):
    """Return the whole-month component of the span from ``start`` to ``end``.

    Builds a single ``relativedelta(end, start)`` and folds its ``years``
    component into months; the leftover ``days`` component is ignored.
    """
    delta = relativedelta(end, start)
    return delta.years * 12 + delta.months


# Merge all data
if not option_data_list:
    # pd.concat([]) would raise a ValueError with a generic message; raise the
    # same exception type with a message that points at the actual cause.
    raise ValueError("No option data was downloaded; nothing to merge.")
option_data = pd.concat(option_data_list, ignore_index=True)

# Calculate mid price
option_data["mid_price"] = (option_data["best_bid"] + option_data["best_offer"]) / 2

# Convert date and exdate to datetime format
option_data["date"] = pd.to_datetime(option_data["date"])
option_data["exdate"] = pd.to_datetime(option_data["exdate"])

# Calculate the number of months between date and exdate.
# Iterating the two columns directly builds one relativedelta per row instead
# of two, and avoids the per-row Series construction of DataFrame.apply(axis=1).
option_data["months_to_expiry"] = [
    _whole_months_between(quote_date, expiry_date)
    for quote_date, expiry_date in zip(option_data["date"], option_data["exdate"])
]

# Sort data by date and months_to_expiry in ascending order
option_data = option_data.sort_values(by=["date", "months_to_expiry"]).reset_index(drop=True)

# Save data
option_data.to_csv("sp500_option_data_15th_sorted_1996_2021.csv", index=False)
print("✅ Data has been sorted by months_to_expiry and saved!")



✅ Data has been sorted by months_to_expiry and saved!


In [None]:
# Download the OptionMetrics borrow-rate tables (optionm.borrateYYYY), one
# year at a time, and combine them into `rf_data` with columns
# ["date", "borrowrate"].
# NOTE(review): later cells save this series as the "risk-free rate" — confirm
# that the borrow-rate table is the intended source.

# Connect to WRDS
db = wrds.Connection(wrds_username=WRDS_USERNAME)

# Store data for all years
rf_data_list = []

try:
    for year in range(1996, 2022):
        try:
            rf_query = f"""
            SELECT date, borrowrate
            FROM optionm.borrate{year}
            WHERE borrowrate != -99.990000  -- Filter out invalid data
            """
            rf_year_data = db.raw_sql(rf_query)

            if not rf_year_data.empty:
                rf_data_list.append(rf_year_data)
            else:
                print(f"⚠️ No valid Borrow Rate data for {year}.")

        except Exception as e:
            # Best effort: a missing or failing yearly table is reported and
            # skipped so the remaining years are still downloaded.
            print(f"❌ Failed to retrieve `borrate` data for {year}: {e}")
finally:
    # Close WRDS connection even if something unexpected interrupts the loop.
    db.close()

# Merge data from all years
if rf_data_list:
    rf_data = pd.concat(rf_data_list, ignore_index=True)
else:
    print("❌ No Borrow Rate data retrieved!")
    # Always (re)bind rf_data so later cells never pick up a stale frame left
    # over from a previous run, and never hit a NameError on a fresh kernel.
    rf_data = pd.DataFrame(columns=["date", "borrowrate"])


Loading library list...
Done


In [None]:
# Reduce the borrow-rate observations in `rf_data` to one row per calendar
# month and save the result to CSV.

# `rf_data` is built with ignore_index=True and a plain "date" column, so it
# has no datetime index; resample() requires one. Parse the dates, order the
# rows chronologically (stable, so same-day rows keep their query order) and
# index by date before resampling.
rf_by_date = (
    rf_data
    .assign(date=pd.to_datetime(rf_data["date"]))
    .sort_values("date", kind="stable")
    .set_index("date")
)

# Select the first record of each month (labelled with the month-end date).
# An offset object is passed instead of the 'M' string alias so the cell does
# not depend on which alias spelling the installed pandas version accepts.
monthly_rf_data = rf_by_date.resample(pd.offsets.MonthEnd()).first().reset_index()

# Ensure no invalid data is included
monthly_rf_data = monthly_rf_data[monthly_rf_data["borrowrate"] != -99.990000]

# Save data to a CSV file
monthly_rf_data.to_csv("monthly_risk_free_rate.csv", index=False)

print("✅ Monthly risk-free rate has been saved to `monthly_risk_free_rate.csv`!")
# Display results
print(monthly_rf_data.head())



  monthly_rf_data = rf_data.resample('M').first().reset_index()


✅ Monthly risk-free rate has been saved to `monthly_risk_free_rate.csv`!
        date  borrowrate
0 1996-01-31    0.071522
1 1996-02-29    0.019370
2 1996-03-31    0.026780
3 1996-04-30    0.007053
4 1996-05-31    0.044384


In [9]:
# Download the S&P 500 index series from crsp.msp500 for 1996-2021 and save
# it to CSV.

# Connect to WRDS. Pass the WRDS_USERNAME variable, not the literal string
# 'WRDS_USERNAME', which would be sent to WRDS as the account name.
conn = wrds.Connection(wrds_username=WRDS_USERNAME)

try:
    # Query the S&P 500 index level (spindx) and return (sprtrn) by calendar date
    query = """
    SELECT caldt, sprtrn, spindx
    FROM crsp.msp500
    WHERE caldt BETWEEN '1996-01-01' AND '2021-12-31'
    """
    df_index = conn.raw_sql(query)
finally:
    # Close WRDS connection even if the query raised.
    conn.close()

# Rename and format the date column
df_index = df_index.rename(columns={'caldt': 'date'})
df_index['date'] = pd.to_datetime(df_index['date'])

# Save as CSV
df_index.to_csv("SP500_index_data_1996_2021.csv", index=False)

# Check output
print("CSV file saved: SP500_index_data_1996_2021.csv")
print(df_index.head())



WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done
CSV file saved: SP500_index_data_1996_2021.csv
        date    sprtrn  spindx
0 1996-01-31  0.032617  636.02
1 1996-02-29  0.006934  640.43
2 1996-03-29  0.007917  645.50
3 1996-04-30  0.013432  654.17
4 1996-05-31  0.022853  669.12
