In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read the CG uncertainty dataset, parsing 'date' as datetimes (adjust the path as needed)
d = pd.read_csv("CG Uncertainty.csv", parse_dates=['date'])
# Keep observations whose calendar year lies in 1996..2022 (both ends included);
# .copy() detaches the result so later column assignments do not hit a view
d = d.loc[d['date'].dt.year.between(1996, 2022)].copy()

# Preview the first rows
d.head()


: 

In [None]:
# For_error: 'Annualized Variance' minus 'avg_impl_variance30'
d['For_error'] = d['Annualized Variance'].sub(d['avg_impl_variance30'])
# For_revision: 'avg_impl_variance30' minus the t-1 forecast column
d['For_revision'] = d['avg_impl_variance30'].sub(d['Forecast,t-1 of X(t,t+h)'])

# Preview the two derived columns next to their dates
d.loc[:, ['date', 'For_error', 'For_revision']].head()


In [None]:
print("=== CG Model: Regress For_error on For_revision ===")

# Keep only the regression columns, map +/-inf to NaN, then drop incomplete rows
d_model1 = (
    d.loc[:, ['For_revision', 'For_error']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Dependent variable and regressor matrix (intercept + For_revision)
y1 = d_model1['For_error']
X1 = sm.add_constant(d_model1['For_revision'])

model1 = sm.OLS(endog=y1, exog=X1).fit()
print(model1.summary())


In [None]:
# This model needs the 'h=1' column; report and skip when it is absent
if 'h=1' not in d.columns:
    print("Column 'h=1' not found in the data.")
else:
    print("=== CG Model with Uncertainty Extension (Equation 6) ===")
    # Rebuild both series from the 'Annualized Variance' column so this cell
    # does not depend on what an earlier cell last stored in them
    d['For_error'] = d['Annualized Variance'].sub(d['avg_impl_variance30'])
    d['For_revision'] = d['avg_impl_variance30'].sub(d['Forecast,t-1 of X(t,t+h)'])

    # Interaction regressor: For_revision times the 'h=1' column
    d['interaction'] = d['For_revision'].mul(d['h=1'])

    # Keep only the regression columns, map +/-inf to NaN, drop incomplete rows
    d_model2 = (
        d.loc[:, ['For_revision', 'h=1', 'interaction', 'For_error']]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    y2 = d_model2['For_error']
    X2 = sm.add_constant(d_model2[['For_revision', 'h=1', 'interaction']])

    model2 = sm.OLS(endog=y2, exog=X2).fit()
    print(model2.summary())


In [None]:
# This model needs the 'lag_h1' column; report and skip when it is absent
if 'lag_h1' not in d.columns:
    print("Column 'lag_h1' not found in the data.")
else:
    print("=== CG Model Extension: For_error ~ For_revision + For_revision*lag_h1 ===")

    # Interaction regressor: For_revision times 'lag_h1'.
    # For_error / For_revision are read as currently stored in d.
    d['interaction2'] = d['For_revision'].mul(d['lag_h1'])

    # Keep only the regression columns, map +/-inf to NaN, drop incomplete rows
    d_model3 = (
        d.loc[:, ['For_revision', 'lag_h1', 'interaction2', 'For_error']]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    y3 = d_model3['For_error']
    X3 = sm.add_constant(d_model3[['For_revision', 'lag_h1', 'interaction2']])

    model3 = sm.OLS(endog=y3, exog=X3).fit()
    print(model3.summary())


In [None]:
# This model needs the 'historical variance' column; report and skip when it is absent
if 'historical variance' not in d.columns:
    print("Column 'historical variance' not found in the data.")
else:
    print("=== CG Model with Historical Variance ===")

    # Overwrite For_error with the historical-variance version;
    # For_revision is rebuilt with the same definition as before
    d['For_error'] = d['historical variance'].sub(d['avg_impl_variance30'])
    d['For_revision'] = d['avg_impl_variance30'].sub(d['Forecast,t-1 of X(t,t+h)'])

    # Keep only the regression columns, map +/-inf to NaN, drop incomplete rows
    d_model4 = (
        d.loc[:, ['For_revision', 'For_error']]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    y4 = d_model4['For_error']
    X4 = sm.add_constant(d_model4['For_revision'])

    model4 = sm.OLS(endog=y4, exog=X4).fit()
    print(model4.summary())


In [None]:
if 'historical variance' in d.columns and 'h=1' in d.columns:
    print("=== CG Model with Historical Variance + Uncertainty Extension ===")

    # Recompute the historical-variance forecast error and the revision here
    # instead of trusting whatever an earlier cell last wrote to these columns.
    # Other cells store the Annualized-Variance error under the same name, so
    # running cells out of order would otherwise fit this model on the wrong
    # dependent variable without any warning.
    d['For_error'] = d['historical variance'] - d['avg_impl_variance30']
    d['For_revision'] = d['avg_impl_variance30'] - d['Forecast,t-1 of X(t,t+h)']

    # Interaction regressor: For_revision times the 'h=1' column
    d['interaction_hist'] = d['For_revision'] * d['h=1']

    # NOTE(review): the regressors are For_revision and the interaction only;
    # 'h=1' itself is not included as a main effect. Confirm this matches the
    # intended specification.
    # Map +/-inf to NaN, then drop incomplete rows before fitting
    d_model5 = d[['For_revision', 'interaction_hist', 'For_error']].replace([np.inf, -np.inf], np.nan).dropna()
    X5 = sm.add_constant(d_model5[['For_revision', 'interaction_hist']])
    y5 = d_model5['For_error']

    model5 = sm.OLS(y5, X5).fit()
    print(model5.summary())
else:
    print("Required columns for model 5 not found in the data.")


In [None]:
# Load Stein data and keep calendar years 1996..2022 (both ends included)
df = pd.read_csv("Stein data thesis.csv", parse_dates=['date'])
df = df[(df['date'].dt.year >= 1996) & (df['date'].dt.year <= 2022)].copy()

# shift(1) below builds the lag from row order, so put the rows in
# chronological order first. The stable sort leaves an already-sorted file
# (and the relative order of equal dates) unchanged.
df = df.sort_values('date', kind='mergesort')

# AR(1) regression on avg_impl_variance30
# NOTE(review): the lag is one row, i.e. one observation period of the file;
# confirm the file's frequency matches the "Monthly" label printed below.
df['lag_var30'] = df['avg_impl_variance30'].shift(1)
# Map +/-inf to NaN and drop incomplete rows (this removes the first row, which has no lag)
df_model = df[['avg_impl_variance30', 'lag_var30']].replace([np.inf, -np.inf], np.nan).dropna()

print("=== STEIN AR(1) with Monthly Data (Equation 3) ===")
X = sm.add_constant(df_model['lag_var30'])
y = df_model['avg_impl_variance30']
model_ar1 = sm.OLS(y, X).fit()
print(model_ar1.summary())

# Equation 1: Spread analysis
# Both spreads are deviations from the in-sample mean of 'historical variance'
mean_variance = df['historical variance'].mean(skipna=True)
df['spread_short'] = df['avg_impl_variance30'] - mean_variance
df['spread_long'] = df['avg_impl_variance60'] - mean_variance
df_model2 = df[['spread_short', 'spread_long']].replace([np.inf, -np.inf], np.nan).dropna()

print("=== STEIN Equation 1: spread_long ~ spread_short ===")
X2 = sm.add_constant(df_model2['spread_short'])
y2 = df_model2['spread_long']
model_spread = sm.OLS(y2, X2).fit()
print(model_spread.summary())


In [None]:
# Stein daily analysis: regress the 30-day implied variance on its stored 1-week lag.
# The file is re-read so this cell starts from the raw columns.
df = pd.read_csv("Stein data thesis.csv", parse_dates=['date'])
df = df.loc[df['date'].dt.year.between(1996, 2022)].copy()

print("=== Stein Daily Analysis: 1-week lag ===")
# Keep only the two regression columns, map +/-inf to NaN, drop incomplete rows
df_1w = (
    df.loc[:, ['avg_impl_variance30', 'var30_lag1w']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_1w = df_1w['avg_impl_variance30']
X_1w = sm.add_constant(df_1w['var30_lag1w'])
model_1w = sm.OLS(endog=y_1w, exog=X_1w).fit()
print(model_1w.summary())

# Similar cells can be written for 2-week, 3-week, and 4-week lags.
