In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read the CG uncertainty dataset (path is relative to the working directory;
# adjust as needed) and parse the 'date' column as datetimes.
# Rows are kept only when the calendar year of 'date' lies in 1996..2022,
# both ends inclusive. The trailing .copy() detaches the filtered frame so that
# later column assignments operate on an independent object.
d = (
    pd.read_csv("CG Uncertainty.csv", parse_dates=['date'])
    .loc[lambda frame: frame['date'].dt.year.between(1996, 2022)]
    .copy()
)

# Preview the first rows of the filtered frame
d.head()


Unnamed: 0,date,avg_impl_variance30,avg_impl_variance60,"Forecast,t-1 of X(t,t+h)",Annualized Variance,volatility,historical variance,month_year,Date,h=1,h=3,h=12,lag_h1,h1_change
0,1996-01-04,0.017401,0.017632,,0.016155,0.099637,0.015984,1996-01,1996-01-01,0.626934,0.736389,0.862091,,
1,1996-01-05,0.016994,0.01746,,0.016882,0.098612,0.016664,1996-01,1996-01-01,0.626934,0.736389,0.862091,,
2,1996-01-08,0.016003,0.016644,,0.016379,0.098716,0.016248,1996-01,1996-01-01,0.626934,0.736389,0.862091,,
3,1996-01-09,0.02324,0.019927,,0.013235,0.109438,0.016379,1996-01,1996-01-01,0.626934,0.736389,0.862091,,
4,1996-01-10,0.023445,0.021856,,0.007592,0.12462,0.013235,1996-01,1996-01-01,0.626934,0.736389,0.862091,,


In [11]:
# Build the two regression variables as new columns of d:
#   For_error    = 'Annualized Variance'  - 'avg_impl_variance30'
#   For_revision = 'avg_impl_variance30'  - 'Forecast,t-1 of X(t,t+h)'
# NOTE(review): presumably these are the forecast error and forecast revision
# of the CG regressions printed further down — confirm against the write-up.
d['For_error'] = d['Annualized Variance'].sub(d['avg_impl_variance30'])
d['For_revision'] = d['avg_impl_variance30'].sub(d['Forecast,t-1 of X(t,t+h)'])

# Preview the new columns next to the date
d.loc[:, ['date', 'For_error', 'For_revision']].head()


Unnamed: 0,date,For_error,For_revision
0,1996-01-04,-0.001246,
1,1996-01-05,-0.000112,
2,1996-01-08,0.000375,
3,1996-01-09,-0.010005,
4,1996-01-10,-0.015853,


In [12]:
print("=== CG Model: Regress For_error on For_revision ===")

# Estimation sample: only the two regression columns, with +/-inf mapped to NaN
# and every row containing a missing value removed.
d_model1 = (
    d.loc[:, ['For_revision', 'For_error']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Dependent variable, and regressor matrix with an intercept column added
y1 = d_model1['For_error']
X1 = sm.add_constant(d_model1['For_revision'])

# Ordinary least squares with statsmodels' default covariance estimator
model1 = sm.OLS(endog=y1, exog=X1).fit()
print(model1.summary())


=== CG Model: Regress For_error on For_revision ===
                            OLS Regression Results                            
Dep. Variable:              For_error   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     181.4
Date:                Sun, 16 Mar 2025   Prob (F-statistic):           7.92e-41
Time:                        21:17:00   Log-Likelihood:                 9803.9
No. Observations:                6775   AIC:                        -1.960e+04
Df Residuals:                    6773   BIC:                        -1.959e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

In [13]:
# The extended regression needs the 'h=1' column; report and skip when it is absent
if 'h=1' not in d.columns:
    print("Column 'h=1' not found in the data.")
else:
    print("=== CG Model with Uncertainty Extension (Equation 6) ===")

    # Rebuild both variables from 'Annualized Variance' so the result does not
    # depend on which cell last wrote these columns.
    d['For_error'] = d['Annualized Variance'].sub(d['avg_impl_variance30'])
    d['For_revision'] = d['avg_impl_variance30'].sub(d['Forecast,t-1 of X(t,t+h)'])

    # Product of the revision and the 'h=1' column, used as the interaction regressor
    d['interaction'] = d['For_revision'].mul(d['h=1'])

    # Estimation sample: regressors plus dependent variable, +/-inf treated as
    # missing, incomplete rows dropped.
    d_model2 = (
        d.loc[:, ['For_revision', 'h=1', 'interaction', 'For_error']]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    # Dependent variable, and regressor matrix with an intercept column added
    y2 = d_model2['For_error']
    X2 = sm.add_constant(d_model2[['For_revision', 'h=1', 'interaction']])

    model2 = sm.OLS(endog=y2, exog=X2).fit()
    print(model2.summary())


=== CG Model with Uncertainty Extension (Equation 6) ===
                            OLS Regression Results                            
Dep. Variable:              For_error   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.045
Method:                 Least Squares   F-statistic:                     107.2
Date:                Sun, 16 Mar 2025   Prob (F-statistic):           7.85e-68
Time:                        21:17:00   Log-Likelihood:                 9871.6
No. Observations:                6775   AIC:                        -1.974e+04
Df Residuals:                    6771   BIC:                        -1.971e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [14]:
# CG regression extended with lag_h1 and its interaction with For_revision.
# Runs only when the 'lag_h1' column is present; otherwise prints a notice.
if 'lag_h1' in d.columns:
    print("=== CG Model Extension: For_error ~ For_revision + For_revision*lag_h1 ===")

    # Rebuild both variables from 'Annualized Variance' before using them.
    # Another cell in this notebook overwrites d['For_error'] with a
    # 'historical variance' based definition; without this recalculation,
    # re-running this cell after that one would silently regress the wrong
    # dependent variable while printing the same title.
    d['For_error'] = d['Annualized Variance'] - d['avg_impl_variance30']
    d['For_revision'] = d['avg_impl_variance30'] - d['Forecast,t-1 of X(t,t+h)']

    # Create an interaction term using lag_h1
    d['interaction2'] = d['For_revision'] * d['lag_h1']

    # Clean data for regression: +/-inf treated as missing, incomplete rows dropped
    d_model3 = d[['For_revision', 'lag_h1', 'interaction2', 'For_error']].replace([np.inf, -np.inf], np.nan).dropna()

    # Regressor matrix (with intercept) and dependent variable
    X3 = sm.add_constant(d_model3[['For_revision', 'lag_h1', 'interaction2']])
    y3 = d_model3['For_error']

    model3 = sm.OLS(y3, X3).fit()
    print(model3.summary())
else:
    print("Column 'lag_h1' not found in the data.")


=== CG Model Extension: For_error ~ For_revision + For_revision*lag_h1 ===
                            OLS Regression Results                            
Dep. Variable:              For_error   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     78.39
Date:                Sun, 16 Mar 2025   Prob (F-statistic):           7.68e-50
Time:                        21:17:00   Log-Likelihood:                 9830.0
No. Observations:                6775   AIC:                        -1.965e+04
Df Residuals:                    6771   BIC:                        -1.962e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------

In [15]:
# Baseline CG regression with 'historical variance' in place of 'Annualized Variance'.
# Skipped with a notice when the 'historical variance' column is absent.
if 'historical variance' not in d.columns:
    print("Column 'historical variance' not found in the data.")
else:
    print("=== CG Model with Historical Variance ===")

    # NOTE: this overwrites the d['For_error'] column that earlier cells built
    # from 'Annualized Variance'. Any cell that reads d['For_error'] after this
    # point sees the historical-variance definition.
    d['For_error'] = d['historical variance'].sub(d['avg_impl_variance30'])
    d['For_revision'] = d['avg_impl_variance30'].sub(d['Forecast,t-1 of X(t,t+h)'])

    # Estimation sample: +/-inf treated as missing, incomplete rows dropped
    d_model4 = (
        d.loc[:, ['For_revision', 'For_error']]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    # Dependent variable, and regressor matrix with an intercept column added
    y4 = d_model4['For_error']
    X4 = sm.add_constant(d_model4['For_revision'])

    model4 = sm.OLS(endog=y4, exog=X4).fit()
    print(model4.summary())


=== CG Model with Historical Variance ===
                            OLS Regression Results                            
Dep. Variable:              For_error   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     172.4
Date:                Sun, 16 Mar 2025   Prob (F-statistic):           6.50e-39
Time:                        21:17:00   Log-Likelihood:                 9739.5
No. Observations:                6775   AIC:                        -1.948e+04
Df Residuals:                    6773   BIC:                        -1.946e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const 

In [16]:
# Historical-variance CG regression with a For_revision x 'h=1' interaction.
# Runs only when both 'historical variance' and 'h=1' are present in d;
# otherwise prints a notice.
if 'historical variance' in d.columns and 'h=1' in d.columns:
    print("=== CG Model with Historical Variance + Uncertainty Extension ===")

    # Build For_error / For_revision explicitly from 'historical variance'
    # instead of relying on whichever cell last wrote these columns. Other
    # cells assign d['For_error'] from 'Annualized Variance'; if one of them
    # ran most recently, this regression would not match its printed title.
    d['For_error'] = d['historical variance'] - d['avg_impl_variance30']
    d['For_revision'] = d['avg_impl_variance30'] - d['Forecast,t-1 of X(t,t+h)']

    # Interaction of the revision with the 'h=1' column
    d['interaction_hist'] = d['For_revision'] * d['h=1']

    # NOTE(review): unlike the Annualized Variance interaction model, 'h=1'
    # itself is not included as a regressor here (only the interaction term)
    # — confirm this specification is intended.
    # Clean data for regression: +/-inf treated as missing, incomplete rows dropped
    d_model5 = d[['For_revision', 'interaction_hist', 'For_error']].replace([np.inf, -np.inf], np.nan).dropna()
    X5 = sm.add_constant(d_model5[['For_revision', 'interaction_hist']])
    y5 = d_model5['For_error']

    model5 = sm.OLS(y5, X5).fit()
    print(model5.summary())
else:
    print("Required columns for model 5 not found in the data.")


=== CG Model with Historical Variance + Uncertainty Extension ===
                            OLS Regression Results                            
Dep. Variable:              For_error   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     90.57
Date:                Sun, 16 Mar 2025   Prob (F-statistic):           1.53e-39
Time:                        21:17:00   Log-Likelihood:                 9743.8
No. Observations:                6775   AIC:                        -1.948e+04
Df Residuals:                    6772   BIC:                        -1.946e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------

In [17]:
# Load Stein data and keep rows whose 'date' year lies in 1996..2022 (inclusive)
df = pd.read_csv("Stein data thesis.csv", parse_dates=['date'])
df = df.loc[df['date'].dt.year.between(1996, 2022)].copy()

# --- AR(1) regression on avg_impl_variance30 ---
# The lag is the previous row's value (a one-row shift of the filtered frame).
# NOTE(review): the printed title says "Monthly Data", but the lag here is one
# row of this file — confirm the file's frequency matches the label.
print("=== STEIN AR(1) with Monthly Data (Equation 3) ===")
df['lag_var30'] = df['avg_impl_variance30'].shift(periods=1)
df_model = (
    df.loc[:, ['avg_impl_variance30', 'lag_var30']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y = df_model['avg_impl_variance30']
X = sm.add_constant(df_model['lag_var30'])
model_ar1 = sm.OLS(endog=y, exog=X).fit()
print(model_ar1.summary())

# --- Equation 1: spread analysis ---
# Both spreads are measured against the sample mean of 'historical variance'
# (missing values skipped when averaging).
mean_variance = df['historical variance'].mean(skipna=True)
df['spread_short'] = df['avg_impl_variance30'].sub(mean_variance)
df['spread_long'] = df['avg_impl_variance60'].sub(mean_variance)
df_model2 = (
    df.loc[:, ['spread_short', 'spread_long']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

print("=== STEIN Equation 1: spread_long ~ spread_short ===")
y2 = df_model2['spread_long']
X2 = sm.add_constant(df_model2['spread_short'])
model_spread = sm.OLS(endog=y2, exog=X2).fit()
print(model_spread.summary())


=== STEIN AR(1) with Monthly Data (Equation 3) ===
                             OLS Regression Results                            
Dep. Variable:     avg_impl_variance30   R-squared:                       0.932
Model:                             OLS   Adj. R-squared:                  0.932
Method:                  Least Squares   F-statistic:                 9.285e+04
Date:                 Sun, 16 Mar 2025   Prob (F-statistic):               0.00
Time:                         21:17:00   Log-Likelihood:                 20529.
No. Observations:                 6795   AIC:                        -4.105e+04
Df Residuals:                     6793   BIC:                        -4.104e+04
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

In [18]:
# Stein daily analysis using lag variables and volatility.
# The file is read again from disk, so df is a fresh frame restricted to
# years 1996..2022 (inclusive); columns added to df by earlier cells are gone.
df = pd.read_csv("Stein data thesis.csv", parse_dates=['date'])
df = df.loc[df['date'].dt.year.between(1996, 2022)].copy()

print("=== Stein Daily Analysis: 1-week lag ===")
# Estimation sample: the variance and the 'var30_lag1w' column from the file,
# with +/-inf treated as missing and incomplete rows dropped.
df_1w = (
    df.loc[:, ['avg_impl_variance30', 'var30_lag1w']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
y_1w = df_1w['avg_impl_variance30']
X_1w = sm.add_constant(df_1w['var30_lag1w'])
model_1w = sm.OLS(endog=y_1w, exog=X_1w).fit()
print(model_1w.summary())

# Similar cells can be written for 2-week, 3-week, and 4-week lags.


=== Stein Daily Analysis: 1-week lag ===
                             OLS Regression Results                            
Dep. Variable:     avg_impl_variance30   R-squared:                       0.797
Model:                             OLS   Adj. R-squared:                  0.797
Method:                  Least Squares   F-statistic:                 2.671e+04
Date:                 Sun, 16 Mar 2025   Prob (F-statistic):               0.00
Time:                         21:17:00   Log-Likelihood:                 16815.
No. Observations:                 6791   AIC:                        -3.363e+04
Df Residuals:                     6789   BIC:                        -3.361e+04
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------