# In this file we fit our model with the real data.

In [1]:
from datetime import datetime
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import norm, chi2

from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox  

from Garch import GARCH

#### Define helper functions 

In [2]:
def adf_test(series):
    result = adfuller(series, autolag='AIC')  
    print(f"ADF Statistic: {result[0]}")
    print(f"p-value: {result[1]}")
    print("Critical Values:")
    for key, value in result[4].items():
        print(f"   {key}: {value}")

    if result[1] <= 0.05:
        print(" The series is stationary (reject H0)")
    else:
        print(" The series is non-stationary (fail to reject H0)")

In [3]:
def generate_ar_data(ar_coef, T):
    res = [0]
    for t in range(1,T):
        res.append(ar_coef * res[-1] + random.gauss(0, 1))

    return res

In [4]:
def likelihood_ratio_test(ll_null: float, ll_alternative: float, df: int):
    if df <= 0:
        raise ValueError("Degrees of freedom must be positive.")
    if ll_alternative < ll_null:
        raise ValueError("ll_null should be smaller than ll_alternative.")
    
    # Compute test statistic
    lr_stat = 2 * (ll_alternative - ll_null)
    
    # Compute p-value
    p_value = 1 - chi2.cdf(lr_stat, df)
    
    return p_value

## 1. Load Data
1. S&P500 data
2. Sentiment data (consisting of mean Positive, Negative and Neutral sentiment per day)
3. VIX (for comparison purpose)

In [5]:
# Load S&P data
prices_df = pd.read_csv('../data/tesla_prices.csv')
prices_df.index = pd.to_datetime(prices_df['Date'], format='%Y-%m-%d %H:%M:%S%z', utc=True).dt.date

# Load Sentiment data
sentiment_df = pd.read_csv('../data/tesla_sentiment_gpt_summarised.csv')
sentiment_df.index = pd.DatetimeIndex(sentiment_df['adjusted_date'])

# Load US interest rate data
t_rates_df = pd.read_csv('../data/daily-treasury-rates.csv')
t_rates_df.index = pd.DatetimeIndex(t_rates_df['Date'])
t_rates_df = t_rates_df.drop(columns=['Date','Unnamed: 11',	'Unnamed: 12', '26 WEEKS BANK DISCOUNT', '26 WEEKS COUPON EQUIVALENT'])

In [6]:
sentiment_df

Unnamed: 0_level_0,adjusted_date,mean_pos_sentiment,mean_neg_sentiment,mean_neutral_sentiment,mean_pos_preamble_sentiment,mean_neg_preamble_sentiment,mean_neutral_preamble_sentiment
adjusted_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-01-01,1/1/24,0.257112,0.560906,0.181982,0.481200,0.408570,0.110230
2021-01-10,1/10/21,0.144524,0.786026,0.069450,0.154926,0.794357,0.050718
2023-01-11,1/11/23,0.334139,0.621210,0.044651,0.353742,0.596856,0.049402
2020-01-12,1/12/20,0.450078,0.220043,0.329879,0.695714,0.094795,0.209491
2023-01-12,1/12/23,0.059800,0.904017,0.036183,0.159795,0.797815,0.042389
...,...,...,...,...,...,...,...
2022-09-08,9/8/22,0.026218,0.887631,0.086151,0.029794,0.879863,0.090342
2020-09-09,9/9/20,0.075703,0.606183,0.318115,0.077259,0.710299,0.212441
2021-09-09,9/9/21,0.285432,0.379544,0.335024,0.145217,0.586420,0.268363
2022-09-09,9/9/22,0.826727,0.018678,0.154595,0.913103,0.012185,0.074712


In [7]:
# Merge all into one dataframe
#vix.index = pd.DatetimeIndex(vix.index.tz_localize(None))
data_with_sentiment = prices_df.join(sentiment_df, how='inner')#.join(vix[['VIX Close']], how='left')
data_with_sentiment = data_with_sentiment.drop(columns=['Date', 'adjusted_date'])

data_with_sentiment['neg_sentiment_lag1'] = data_with_sentiment['mean_neg_sentiment'].shift(1)
data_with_sentiment['neg_sentiment_diff'] = data_with_sentiment['mean_neg_sentiment'] - data_with_sentiment['neg_sentiment_lag1']
data_with_sentiment = data_with_sentiment.dropna()

data_with_sentiment = data_with_sentiment.join(t_rates_df)
# Fill missing i/r data. Missing data is sparse, but might have to investigate.
data_with_sentiment = data_with_sentiment.ffill()

log_returns = data_with_sentiment['log_returns']

data_with_sentiment.head()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,returns,log_returns,mean_pos_sentiment,...,neg_sentiment_lag1,neg_sentiment_diff,4 WEEKS BANK DISCOUNT,4 WEEKS COUPON EQUIVALENT,8 WEEKS BANK DISCOUNT,8 WEEKS COUPON EQUIVALENT,13 WEEKS BANK DISCOUNT,13 WEEKS COUPON EQUIVALENT,52 WEEKS BANK DISCOUNT,52 WEEKS COUPON EQUIVALENT
2019-10-24,19.891333,20.328667,19.280001,19.978666,445813500,0.0,0.0,0.176692,0.162707,0.415098,...,0.950202,-0.699335,1.72,1.75,1.7,1.73,1.64,1.67,1.55,1.59
2019-10-29,21.332666,21.620001,20.983334,21.081333,190264500,0.0,0.0,-0.035062,-0.035691,0.916847,...,0.250868,-0.235355,1.63,1.66,1.64,1.67,1.6,1.63,1.55,1.59
2019-10-30,20.866667,21.252666,20.664667,21.000668,144627000,0.0,0.0,-0.003826,-0.003834,0.643225,...,0.015513,0.065621,1.59,1.62,1.57,1.6,1.59,1.62,1.55,1.59
2019-11-13,23.666668,23.755333,23.011999,23.073999,126301500,0.0,0.0,-0.010916,-0.010977,0.740729,...,0.081134,-0.028947,1.53,1.56,1.54,1.57,1.54,1.57,1.53,1.57
2019-11-15,23.375999,23.52,23.224001,23.478001,72135000,0.0,0.0,0.008072,0.00804,0.017608,...,0.052188,0.856669,1.56,1.59,1.53,1.56,1.54,1.57,1.5,1.54


## 2. Fit a baseline GARCH(1,1) model without the exogenous term. 

In [34]:
garch_baseline = GARCH(p=1, q=1, z=0, verbose=True)
garch_baseline.train(100*log_returns)

garch_baseline.summary()

Optimising...
Optimising finished in 2.715s
  message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
  success: True
   status: 0
      fun: 3.776631928359827
        x: [ 6.358e-01 -2.818e+00 -1.928e-01]
      nit: 47
      jac: [-7.105e-07 -1.021e-06 -1.865e-06]
     nfev: 404
     njev: 101
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
{'omega': 1.8884843215979803, 'alpha': 0.05973117017806405, 'beta': 0.8246766543091436}


Unnamed: 0,coef,std err,t,P>|t|
omega,1.888484,0.631966,2.988269,0.001454512
alpha,0.059731,0.01817,3.287272,0.0005324304
beta,0.824677,0.047244,17.45585,9.323287e-57


In [35]:
baseline_log_like = garch_baseline.loglikelihood
print(f"Log likelihood: {baseline_log_like:.3f}")

Log likelihood: -1875.246


## 2. Fit model with the exogenous variables. 

In [36]:
exo_sentiment = data_with_sentiment[['mean_pos_sentiment',	'mean_neg_sentiment', 'mean_neutral_sentiment']].to_numpy()

#### Try alternative data using PCA since the three sentiments features are correlated.  

In [38]:
# ! pip install scikit-learn
# from sklearn.decomposition import PCA

# pca = PCA(n_components=1)
# pca_exo_sentiment = pca.fit_transform(exo_sentiment)

In [42]:
garch_with_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_sentiment.train(100*log_returns, x=exo_sentiment)

garch_with_sentiment.summary()

Optimising...
Optimising finished in 4.367s
  message: ABNORMAL_TERMINATION_IN_LNSRCH
  success: False
   status: 2
      fun: 4.123578725434396
        x: [-5.667e-01 -7.423e-01 -6.464e-01 -5.887e-01 -5.456e-01
            -6.772e-01]
      nit: 4
      jac: [-1.659e-01 -2.764e-01 -1.213e+00 -4.009e-02 -6.378e-02
            -4.956e-03]
     nfev: 651
     njev: 93
 hess_inv: <6x6 LbfgsInvHessProduct with dtype=float64>
{'omega': 0.5673948188539584, 'alpha': 0.47601655963128386, 'beta': 0.5239517564999061, 'gamma': array([[0.55505981, 0.57946627, 0.50802628]])}


Unnamed: 0,coef,std err,t,P>|t|
omega,0.567395,0.651216,0.871285,0.191957
alpha,0.476017,0.004224,112.699045,0.0
beta,0.523952,0.004226,123.973858,0.0
gamma[0],0.55506,0.97501,0.569286,0.284677
gamma[1],0.579466,0.869162,0.666696,0.252599
gamma[2],0.508026,3.801789,0.133628,0.446869


In [43]:
with_sentiment_log_like = garch_with_sentiment.loglikelihood
print(f"Log likelihood: {with_sentiment_log_like:.3f}")

Log likelihood: -1991.126


## 2.2 Fit with only negative sentiment.

In [104]:
data_with_sentiment

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,returns,log_returns,mean_pos_sentiment,...,neg_sentiment_lag1,neg_sentiment_diff,4 WEEKS BANK DISCOUNT,4 WEEKS COUPON EQUIVALENT,8 WEEKS BANK DISCOUNT,8 WEEKS COUPON EQUIVALENT,13 WEEKS BANK DISCOUNT,13 WEEKS COUPON EQUIVALENT,52 WEEKS BANK DISCOUNT,52 WEEKS COUPON EQUIVALENT
2019-10-24,19.891333,20.328667,19.280001,19.978666,445813500,0.0,0.0,0.176692,0.162707,0.415098,...,0.950202,-0.699335,1.72,1.75,1.70,1.73,1.64,1.67,1.55,1.59
2019-10-29,21.332666,21.620001,20.983334,21.081333,190264500,0.0,0.0,-0.035062,-0.035691,0.916847,...,0.250868,-0.235355,1.63,1.66,1.64,1.67,1.60,1.63,1.55,1.59
2019-10-30,20.866667,21.252666,20.664667,21.000668,144627000,0.0,0.0,-0.003826,-0.003834,0.643225,...,0.015513,0.065621,1.59,1.62,1.57,1.60,1.59,1.62,1.55,1.59
2019-11-13,23.666668,23.755333,23.011999,23.073999,126301500,0.0,0.0,-0.010916,-0.010977,0.740729,...,0.081134,-0.028947,1.53,1.56,1.54,1.57,1.54,1.57,1.53,1.57
2019-11-15,23.375999,23.520000,23.224001,23.478001,72135000,0.0,0.0,0.008072,0.008040,0.017608,...,0.052188,0.856669,1.56,1.59,1.53,1.56,1.54,1.57,1.50,1.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-05,223.490005,235.000000,222.250000,230.169998,119355000,0.0,0.0,0.049041,0.047876,0.377149,...,0.346941,-0.176921,5.08,5.17,5.04,5.15,4.94,5.07,4.59,4.76
2024-09-13,228.000000,232.669998,226.320007,230.289993,59515100,0.0,0.0,0.002089,0.002086,0.266633,...,0.170020,0.323076,4.94,5.03,4.94,5.05,4.78,4.90,4.48,4.65
2024-09-18,230.089996,235.679993,226.880005,227.199997,78010200,0.0,0.0,-0.002940,-0.002945,0.949651,...,0.493096,-0.480494,4.71,4.79,4.70,4.80,4.65,4.77,4.37,4.53
2024-09-23,242.610001,250.000000,241.919998,250.000000,86927200,0.0,0.0,0.049318,0.048140,0.489154,...,0.012602,0.306459,4.66,4.74,4.65,4.75,4.53,4.65,4.26,4.41


In [105]:
neg_sentiment = data_with_sentiment[['mean_pos_sentiment']].to_numpy()#exo_sentiment[:, [1]]  # mean_pos_preamble_sentiment
#neg_sentiment_normalised = (neg_sentiment - np.mean(neg_sentiment)) / np.var(neg_sentiment)

#### Run Ljung-Box test (default lags=10)


In [93]:
#result = acorr_ljungbox(neg_sentiment, lags=[x for x in range(11)], return_df=True)
#print(result)

#### Test exogenous data for stationarity


In [94]:
print("ADF Test for X:")
adf_test(neg_sentiment)

ADF Test for X:
ADF Statistic: -9.330690266665046
p-value: 9.346411145092255e-16
Critical Values:
   1%: -3.4402516575519346
   5%: -2.8659091963995573
   10%: -2.569096752341498
 The series is stationary (reject H0)


In [106]:
garch_with_neg_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_neg_sentiment.train(log_returns, x=neg_sentiment)

Recommendation: pass in 100*y
Optimising...
Optimising finished in 2.776s
  message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
  success: True
   status: 0
      fun: -5.415433351739128
        x: [-6.911e+00 -2.345e+00 -1.646e+00 -7.246e+00]
      nit: 32
      jac: [-2.753e-06  6.217e-07 -3.553e-07 -5.329e-07]
     nfev: 190
     njev: 38
 hess_inv: <4x4 LbfgsInvHessProduct with dtype=float64>
{'omega': 0.000996460788982849, 'alpha': 0.09587217543605371, 'beta': 0.1927340964349931, 'gamma': array([[0.00071285]])}


In [107]:
garch_with_neg_sentiment.summary()

Unnamed: 0,coef,std err,t,P>|t|
omega,0.000996,0.000278,3.578223,0.000186
alpha,0.095872,0.033042,2.901512,0.001918
beta,0.192734,0.196428,0.981195,0.163427
gamma[0],0.000713,0.000382,1.866885,0.031179


In [49]:
with_neg_sentiment_log_like = garch_with_sentiment.loglikelihood
print(f"Log likelihood: {with_neg_sentiment_log_like:.3f}")

Log likelihood: -1991.126


## 2.3 Fit with other data.

In [32]:
rf_data = data_with_sentiment[['4 WEEKS BANK DISCOUNT', 'mean_neg_sentiment']]   # '8 WEEKS BANK DISCOUNT',

In [33]:
garch_with_neg_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_neg_sentiment.train(100*log_returns, x=rf_data)

Optimising...
Optimising finished in 10.487s
{'omega': 0.0015569817707142767, 'alpha': 0.16578033629485042, 'beta': 0.8014779205806908, 'gamma': array([[6.34962711e-07, 2.07599812e-01]])}


In [34]:
garch_with_neg_sentiment.summary()

Unnamed: 0,coef,std err,t,P>|t|
omega,0.001556982,0.011936,0.130439,0.4481201
alpha,0.1657803,0.032517,5.098203,1.984559e-07
beta,0.8014779,0.034868,22.986158,8.475967e-98
gamma[0],6.349627e-07,0.000551,0.001152,0.4995407
gamma[1],0.2075998,0.053287,3.895872,5.156809e-05


In [23]:
with_neg_sentiment_log_like = garch_with_neg_sentiment.loglikelihood
print(f"Log likelihood: {with_neg_sentiment_log_like:.3f}")

Log likelihood: -1766.900


In [54]:
likelihood_ratio_test(garch_baseline.loglikelihood, 
                      garch_with_neg_sentiment.loglikelihood, 2)

0.9625447383437402

---

In [15]:
def mse(actual, pred):
    return np.sum((actual-pred) ** 2)

mse_baseline = mse(data_with_sentiment['VIX Close'], garch_baseline.sigma2)
mse_sentiment = mse(data_with_sentiment['VIX Close'], garch_with_sentiment.sigma2)

print(f"MSE baseline: {mse_baseline:.3f}")
print(f"MSE with sentiment: {mse_sentiment:.3f}")

MSE baseline: 509949.092
MSE with sentiment: 511752.741


## Checking with arch library to make sure we are correct.

In [99]:
from arch import arch_model

model = arch_model(100*log_returns, vol='GARCH', mean='ARX', p=1, q=1)
garch_fit = model.fit(disp='off')

In [100]:
garch_fit.summary()

0,1,2,3
Dep. Variable:,log_returns,R-squared:,0.0
Mean Model:,AR,Adj. R-squared:,0.0
Vol Model:,GARCH,Log-Likelihood:,-1766.91
Distribution:,Normal,AIC:,3541.82
Method:,Maximum Likelihood,BIC:,3562.27
,,No. Observations:,1229.0
Date:,"Sat, Mar 01 2025",Df Residuals:,1228.0
Time:,05:23:16,Df Model:,1.0

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
Const,0.0974,2.528e-02,3.854,1.163e-04,"[4.788e-02, 0.147]"

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
omega,0.0359,1.396e-02,2.569,1.020e-02,"[8.504e-03,6.323e-02]"
alpha[1],0.1567,3.461e-02,4.527,5.980e-06,"[8.886e-02, 0.225]"
beta[1],0.8226,3.520e-02,23.368,9.138e-121,"[ 0.754, 0.892]"
