# In this file we fit our model with the real data.

In [72]:
from datetime import datetime
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import norm, chi2

from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox  

from Garch import GARCH

#### Define helper functions 

In [73]:
def adf_test(series):
    result = adfuller(series, autolag='AIC')  
    print(f"ADF Statistic: {result[0]}")
    print(f"p-value: {result[1]}")
    print("Critical Values:")
    for key, value in result[4].items():
        print(f"   {key}: {value}")

    if result[1] <= 0.05:
        print(" The series is stationary (reject H0)")
    else:
        print(" The series is non-stationary (fail to reject H0)")

In [74]:
def generate_ar_data(ar_coef, T):
    res = [0]
    for t in range(1,T):
        res.append(ar_coef * res[-1] + random.gauss(0, 1))

    return res

In [75]:
def likelihood_ratio_test(ll_null: float, ll_alternative: float, df: int):
    if df <= 0:
        raise ValueError("Degrees of freedom must be positive.")
    if ll_alternative < ll_null:
        raise ValueError("ll_null should be smaller than ll_alternative.")
    
    # Compute test statistic
    lr_stat = 2 * (ll_alternative - ll_null)
    
    # Compute p-value
    p_value = 1 - chi2.cdf(lr_stat, df)
    
    return p_value

## 1. Load Data
1. S&P500 data
2. Sentiment data (consisting of mean Positive, Negative and Neutral sentiment per day)
3. VIX (for comparison purpose)

In [76]:
# Load S&P data
prices_df = pd.read_csv('../data/tesla_prices.csv')
prices_df.index = pd.to_datetime(prices_df['Date'], format='%Y-%m-%d %H:%M:%S%z', utc=True).dt.date

# Load Sentiment data
sentiment_df = pd.read_csv('../data/tesla_sentiment_gpt_summarised.csv')
sentiment_df.index = pd.DatetimeIndex(sentiment_df['adjusted_date'])

# Load US interest rate data
t_rates_df = pd.read_csv('../data/daily-treasury-rates.csv')
t_rates_df.index = pd.DatetimeIndex(t_rates_df['Date'])
t_rates_df = t_rates_df.drop(columns=['Date','Unnamed: 11',	'Unnamed: 12', '26 WEEKS BANK DISCOUNT', '26 WEEKS COUPON EQUIVALENT'])

In [77]:
sentiment_df

Unnamed: 0_level_0,adjusted_date,mean_pos_sentiment,mean_neg_sentiment,mean_neutral_sentiment,mean_pos_preamble_sentiment,mean_neg_preamble_sentiment,mean_neutral_preamble_sentiment
adjusted_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-01-01,1/1/24,0.257112,0.560906,0.181982,0.481200,0.408570,0.110230
2021-01-10,1/10/21,0.144524,0.786026,0.069450,0.154926,0.794357,0.050718
2023-01-11,1/11/23,0.334139,0.621210,0.044651,0.353742,0.596856,0.049402
2020-01-12,1/12/20,0.450078,0.220043,0.329879,0.695714,0.094795,0.209491
2023-01-12,1/12/23,0.059800,0.904017,0.036183,0.159795,0.797815,0.042389
...,...,...,...,...,...,...,...
2022-09-08,9/8/22,0.026218,0.887631,0.086151,0.029794,0.879863,0.090342
2020-09-09,9/9/20,0.075703,0.606183,0.318115,0.077259,0.710299,0.212441
2021-09-09,9/9/21,0.285432,0.379544,0.335024,0.145217,0.586420,0.268363
2022-09-09,9/9/22,0.826727,0.018678,0.154595,0.913103,0.012185,0.074712


In [19]:
# Merge all into one dataframe
#vix.index = pd.DatetimeIndex(vix.index.tz_localize(None))
data_with_sentiment = prices_df.join(sentiment_df, how='inner')#.join(vix[['VIX Close']], how='left')
data_with_sentiment = data_with_sentiment.drop(columns=['Date', 'adjusted_date'])

data_with_sentiment['neg_sentiment_lag1'] = data_with_sentiment['mean_neg_sentiment'].shift(1)
data_with_sentiment['neg_sentiment_diff'] = data_with_sentiment['mean_neg_sentiment'] - data_with_sentiment['neg_sentiment_lag1']
data_with_sentiment = data_with_sentiment.dropna()

data_with_sentiment = data_with_sentiment.join(t_rates_df)
# Fill missing i/r data. Missing data is sparse, but might have to investigate.
data_with_sentiment = data_with_sentiment.ffill()

log_returns = data_with_sentiment['log_returns']

data_with_sentiment.head()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,returns,log_returns,mean_pos_sentiment,...,neg_sentiment_lag1,neg_sentiment_diff,4 WEEKS BANK DISCOUNT,4 WEEKS COUPON EQUIVALENT,8 WEEKS BANK DISCOUNT,8 WEEKS COUPON EQUIVALENT,13 WEEKS BANK DISCOUNT,13 WEEKS COUPON EQUIVALENT,52 WEEKS BANK DISCOUNT,52 WEEKS COUPON EQUIVALENT
2019-10-24,19.891333,20.328667,19.280001,19.978666,445813500,0.0,0.0,0.176692,0.162707,0.415098,...,0.950202,-0.699335,1.72,1.75,1.7,1.73,1.64,1.67,1.55,1.59
2019-10-29,21.332666,21.620001,20.983334,21.081333,190264500,0.0,0.0,-0.035062,-0.035691,0.916847,...,0.250868,-0.235355,1.63,1.66,1.64,1.67,1.6,1.63,1.55,1.59
2019-10-30,20.866667,21.252666,20.664667,21.000668,144627000,0.0,0.0,-0.003826,-0.003834,0.643225,...,0.015513,0.065621,1.59,1.62,1.57,1.6,1.59,1.62,1.55,1.59
2019-11-13,23.666668,23.755333,23.011999,23.073999,126301500,0.0,0.0,-0.010916,-0.010977,0.740729,...,0.081134,-0.028947,1.53,1.56,1.54,1.57,1.54,1.57,1.53,1.57
2019-11-15,23.375999,23.52,23.224001,23.478001,72135000,0.0,0.0,0.008072,0.00804,0.017608,...,0.052188,0.856669,1.56,1.59,1.53,1.56,1.54,1.57,1.5,1.54


## 2. Fit a baseline GARCH(1,1) model without the exogenous term. 

In [20]:
garch_baseline = GARCH(p=1, q=1, z=0, verbose=True)
garch_baseline.train(100*log_returns)

garch_baseline.summary()

Optimising...
Optimising finished in 1.144s
  message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
  success: True
   status: 0
      fun: 3.7766319283422862
        x: [ 6.358e-01 -2.818e+00 -1.928e-01]
      nit: 53
      jac: [-1.243e-06  4.441e-08 -6.262e-06]
     nfev: 460
     njev: 115
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
{'omega': 1.8885643766986113, 'alpha': 0.059734202639481274, 'beta': 0.8246689621981558}


Unnamed: 0,coef,std err,t,P>|t|
omega,1.888564,0.631978,2.98834,0.001454178
alpha,0.059734,0.018171,3.287306,0.0005323667
beta,0.824669,0.047244,17.455376,9.376497999999999e-57


In [21]:
baseline_log_like = garch_baseline.loglikelihood
print(f"Log likelihood: {baseline_log_like:.3f}")

Log likelihood: -1875.246


## 2. Fit model with the exogenous variables. 

In [22]:
exo_sentiment = data_with_sentiment[['mean_pos_sentiment',	'mean_neg_sentiment', 'mean_neutral_sentiment']].to_numpy()

#### Try alternative data using PCA since the three sentiments features are correlated.  

In [23]:
# ! pip install scikit-learn
#from sklearn.decomposition import PCA

# pca = PCA(n_components=1)
# pca_exo_sentiment = pca.fit_transform(exo_sentiment)

In [24]:
garch_with_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_sentiment.train(100*log_returns, x=exo_sentiment)

garch_with_sentiment.summary()

Optimising...
Optimising finished in 1.774s
  message: ABNORMAL_TERMINATION_IN_LNSRCH
  success: False
   status: 2
      fun: 4.123565025143794
        x: [-5.667e-01 -7.423e-01 -6.464e-01 -5.887e-01 -5.457e-01
            -6.772e-01]
      nit: 4
      jac: [-1.659e-01 -2.765e-01 -1.213e+00 -4.010e-02 -6.379e-02
            -4.957e-03]
     nfev: 658
     njev: 94
 hess_inv: <6x6 LbfgsInvHessProduct with dtype=float64>
{'omega': 0.5673913318159649, 'alpha': 0.47601397300750264, 'beta': 0.5239341056494212, 'gamma': array([[0.55505902, 0.57946488, 0.5080262 ]])}


Unnamed: 0,coef,std err,t,P>|t|
omega,0.567391,0.651334,0.871122,0.192001
alpha,0.476014,0.005322,89.441066,0.0
beta,0.523934,0.005327,98.350422,0.0
gamma[0],0.555059,0.97495,0.569321,0.284666
gamma[1],0.579465,0.869123,0.666724,0.25259
gamma[2],0.508026,3.801052,0.133654,0.446858


In [25]:
with_sentiment_log_like = garch_with_sentiment.loglikelihood
print(f"Log likelihood: {with_sentiment_log_like:.3f}")

Log likelihood: -1991.122


## 2.2 Fit with only negative sentiment.

In [26]:
data_with_sentiment

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,returns,log_returns,mean_pos_sentiment,...,neg_sentiment_lag1,neg_sentiment_diff,4 WEEKS BANK DISCOUNT,4 WEEKS COUPON EQUIVALENT,8 WEEKS BANK DISCOUNT,8 WEEKS COUPON EQUIVALENT,13 WEEKS BANK DISCOUNT,13 WEEKS COUPON EQUIVALENT,52 WEEKS BANK DISCOUNT,52 WEEKS COUPON EQUIVALENT
2019-10-24,19.891333,20.328667,19.280001,19.978666,445813500,0.0,0.0,0.176692,0.162707,0.415098,...,0.950202,-0.699335,1.72,1.75,1.70,1.73,1.64,1.67,1.55,1.59
2019-10-29,21.332666,21.620001,20.983334,21.081333,190264500,0.0,0.0,-0.035062,-0.035691,0.916847,...,0.250868,-0.235355,1.63,1.66,1.64,1.67,1.60,1.63,1.55,1.59
2019-10-30,20.866667,21.252666,20.664667,21.000668,144627000,0.0,0.0,-0.003826,-0.003834,0.643225,...,0.015513,0.065621,1.59,1.62,1.57,1.60,1.59,1.62,1.55,1.59
2019-11-13,23.666668,23.755333,23.011999,23.073999,126301500,0.0,0.0,-0.010916,-0.010977,0.740729,...,0.081134,-0.028947,1.53,1.56,1.54,1.57,1.54,1.57,1.53,1.57
2019-11-15,23.375999,23.520000,23.224001,23.478001,72135000,0.0,0.0,0.008072,0.008040,0.017608,...,0.052188,0.856669,1.56,1.59,1.53,1.56,1.54,1.57,1.50,1.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-05,223.490005,235.000000,222.250000,230.169998,119355000,0.0,0.0,0.049041,0.047876,0.377149,...,0.346941,-0.176921,5.08,5.17,5.04,5.15,4.94,5.07,4.59,4.76
2024-09-13,228.000000,232.669998,226.320007,230.289993,59515100,0.0,0.0,0.002089,0.002086,0.266633,...,0.170020,0.323076,4.94,5.03,4.94,5.05,4.78,4.90,4.48,4.65
2024-09-18,230.089996,235.679993,226.880005,227.199997,78010200,0.0,0.0,-0.002940,-0.002945,0.949651,...,0.493096,-0.480494,4.71,4.79,4.70,4.80,4.65,4.77,4.37,4.53
2024-09-23,242.610001,250.000000,241.919998,250.000000,86927200,0.0,0.0,0.049318,0.048140,0.489154,...,0.012602,0.306459,4.66,4.74,4.65,4.75,4.53,4.65,4.26,4.41


In [27]:
neg_sentiment = data_with_sentiment[['mean_pos_sentiment']].to_numpy()#exo_sentiment[:, [1]]  # mean_pos_preamble_sentiment
#neg_sentiment_normalised = (neg_sentiment - np.mean(neg_sentiment)) / np.var(neg_sentiment)

#### Run Ljung-Box test (default lags=10)


In [28]:
#result = acorr_ljungbox(neg_sentiment, lags=[x for x in range(11)], return_df=True)
#print(result)

#### Test exogenous data for stationarity


In [29]:
print("ADF Test for X:")
adf_test(neg_sentiment)

ADF Test for X:
ADF Statistic: -9.154977118460776
p-value: 2.6246398261107458e-15
Critical Values:
   1%: -3.4402516575519346
   5%: -2.8659091963995573
   10%: -2.569096752341498
 The series is stationary (reject H0)


In [30]:
garch_with_neg_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_neg_sentiment.train(log_returns, x=neg_sentiment)

Recommendation: pass in 100*y
Optimising...
Optimising finished in 1.140s
  message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
  success: True
   status: 0
      fun: -5.415433348773559
        x: [-6.912e+00 -2.345e+00 -1.645e+00 -7.246e+00]
      nit: 31
      jac: [ 3.739e-05  1.688e-06  1.430e-05  5.418e-06]
     nfev: 205
     njev: 41
 hess_inv: <4x4 LbfgsInvHessProduct with dtype=float64>
{'omega': 0.00099626198175752, 'alpha': 0.09585820777749968, 'beta': 0.1929270926242445, 'gamma': array([[0.0007127]])}


In [31]:
garch_with_neg_sentiment.summary()

Unnamed: 0,coef,std err,t,P>|t|
omega,0.000996,0.000279,3.575195,0.000188
alpha,0.095858,0.033041,2.901204,0.00192
beta,0.192927,0.196585,0.981391,0.163379
gamma[0],0.000713,0.000382,1.865608,0.031268


In [32]:
with_neg_sentiment_log_like = garch_with_sentiment.loglikelihood
print(f"Log likelihood: {with_neg_sentiment_log_like:.3f}")

Log likelihood: -1991.122


## 2.3 Fit with other data.

In [33]:
rf_data = data_with_sentiment[['4 WEEKS BANK DISCOUNT', 'mean_neg_sentiment']]   # '8 WEEKS BANK DISCOUNT',

In [34]:
garch_with_neg_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_neg_sentiment.train(100*log_returns, x=rf_data)

Optimising...
Optimising finished in 0.860s
  message: ABNORMAL_TERMINATION_IN_LNSRCH
  success: False
   status: 2
      fun: 4.092270075996755
        x: [-7.016e-01 -7.819e-01 -6.180e-01 -7.082e-01 -6.326e-01]
      nit: 2
      jac: [-6.010e-02 -7.476e-02 -3.282e-01  1.556e-01 -1.671e-02]
     nfev: 306
     njev: 51
 hess_inv: <5x5 LbfgsInvHessProduct with dtype=float64>
{'omega': 0.49577406929720225, 'alpha': 0.4575243769258762, 'beta': 0.538994845794549, 'gamma': array([[0.49253389, 0.53119737]])}


In [35]:
garch_with_neg_sentiment.summary()

  diagnosis_df = pd.DataFrame(data={'coef': coef, 'std err': np.sqrt(info_mat_inv)}, index=index)


Unnamed: 0,coef,std err,t,P>|t|
omega,0.495774,0.146679,3.379987,0.000384
alpha,0.457524,,,
beta,0.538995,,,
gamma[0],0.492534,,,
gamma[1],0.531197,,,


In [36]:
with_neg_sentiment_log_like = garch_with_neg_sentiment.loglikelihood
print(f"Log likelihood: {with_neg_sentiment_log_like:.3f}")

Log likelihood: -1980.669


In [37]:
likelihood_ratio_test(garch_baseline.loglikelihood, 
                      garch_with_neg_sentiment.loglikelihood, 2)

ValueError: ll_null should be smaller than ll_alternative.

---

In [None]:
def mse(actual, pred):
    return np.sum((actual-pred) ** 2)

mse_baseline = mse(data_with_sentiment['VIX Close'], garch_baseline.sigma2)
mse_sentiment = mse(data_with_sentiment['VIX Close'], garch_with_sentiment.sigma2)

print(f"MSE baseline: {mse_baseline:.3f}")
print(f"MSE with sentiment: {mse_sentiment:.3f}")

MSE baseline: 509949.092
MSE with sentiment: 511752.741


## Checking with arch library to make sure we are correct.

In [304]:
from arch import arch_model

model = arch_model(100*log_returns, vol='GARCH', mean='ARX', p=1, q=1)
garch_fit = model.fit(disp='off')

In [305]:
garch_fit.summary()

0,1,2,3
Dep. Variable:,log_returns,R-squared:,0.0
Mean Model:,AR,Adj. R-squared:,0.0
Vol Model:,GARCH,Log-Likelihood:,-1881.7
Distribution:,Normal,AIC:,3771.41
Method:,Maximum Likelihood,BIC:,3789.42
,,No. Observations:,668.0
Date:,"Thu, Mar 13 2025",Df Residuals:,667.0
Time:,07:23:01,Df Model:,1.0

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
Const,0.1394,0.150,0.928,0.354,"[ -0.155, 0.434]"

0,1,2,3,4,5
,coef,std err,t,P>|t|,95.0% Conf. Int.
omega,2.1555,1.493,1.444,0.149,"[ -0.771, 5.082]"
alpha[1],0.0662,3.338e-02,1.984,4.729e-02,"[7.918e-04, 0.132]"
beta[1],0.8032,0.108,7.426,1.120e-13,"[ 0.591, 1.015]"


### Loading in Tesla Data

In [306]:
tesla_v1 = pd.read_csv('../data/tesla_gpt_summarised_v1.csv')
tesla_v2 = pd.read_csv('../data/tesla_gpt_summarised_v2(strictprompt).csv')
market_sentiment = pd.read_csv('../data/nyt_snp_headlines_with_sentiment.csv')

In [307]:
tesla_v1['News'] = 'tesla'
tesla_v2['News'] = 'tesla'
market_sentiment['News'] = 'Market'
relevant_columns = ['pos_sentiment','neg_sentiment','neutral_sentiment','pos_sentiment_w_preamb','neg_sentiment_w_preamb,neutral_sentiment_w_preamb','News','adjusted_date']

In [308]:
tesla_v1['adjusted_date'] = pd.to_datetime(tesla_v1['adjusted_date'], format="%d/%m/%y").dt.strftime("%Y-%m-%d")
tesla_v2['adjusted_date'] = pd.to_datetime(tesla_v2['adjusted_date'], format="%d/%m/%y").dt.strftime("%Y-%m-%d")


In [309]:
cols = ['adjusted_date','pos_sentiment','neg_sentiment','neutral_sentiment','pos_sentiment_w_preamb','neg_sentiment_w_preamb','neutral_sentiment_w_preamb','News']
tesla_v1_with_market = pd.concat([tesla_v1, market_sentiment],ignore_index=True)
tesla_v2_with_market = pd.concat([tesla_v2, market_sentiment],ignore_index=True)

In [310]:
tesla_v1_with_market = tesla_v1_with_market[cols]
tesla_v2_with_market = tesla_v2_with_market[cols]
tesla_v1_with_market.index = pd.DatetimeIndex(tesla_v1_with_market['adjusted_date'])
tesla_v2_with_market.index = pd.DatetimeIndex(tesla_v2_with_market['adjusted_date'])
tesla_v1_with_market = tesla_v1_with_market.drop(columns = 'adjusted_date')
tesla_v2_with_market = tesla_v2_with_market.drop(columns = 'adjusted_date')

In [327]:
def merge_cleanup(prices_df, union_sentiment_df, option, stock_name):
    """
    Cleans and merges stock price data with sentiment data based on the specified option.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        DataFrame containing stock price data with a datetime index.
    union_sentiment_df : pandas.DataFrame
        DataFrame containing sentiment data with a datetime index.
    option : {1, 2, 3, 4}
        An integer specifying the merging strategy:
        - 1: Drop days without stock sentiment.
        - 2: Use stock sentiment when available; otherwise, replace it with market sentiment.
        - 3: Compute a 50-50 weighted aggregate for days with both stock and market sentiment.
        - 4: Aggregate based on the number of news articles when both stock and market sentiment are present.
    stock_name : str
        The stock name (e.g., "Tesla", "Microsoft") as found in the "News" column of `union_sentiment_df`. 
        This is used to filter stock-specific sentiment data.

    Returns
    -------
    pandas.DataFrame
        A merged DataFrame with sentiment data adjusted according to the selected option.
    """
    stocksentiment_df = union_sentiment_df[union_sentiment_df['News'] == stock_name] 
    marketsentiment_df = union_sentiment_df[union_sentiment_df['News'] == 'Market']

    #Option 1: Drop the days without the stocks_df
    if option == 1:
        data_with_sentiment = prices_df.join(stocksentiment_df, how='inner')#.join(vix[['VIX Close']], how='left')
        data_with_sentiment = data_with_sentiment.drop(columns=['Date'])
        data_with_sentiment['neg_sentiment_lag1'] = data_with_sentiment['neg_sentiment'].shift(1)
        data_with_sentiment['neg_sentiment_diff'] = data_with_sentiment['neg_sentiment'] - data_with_sentiment['neg_sentiment_lag1']
        data_with_sentiment = data_with_sentiment.dropna()
        data_with_sentiment = data_with_sentiment.join(t_rates_df)
        # Fill missing i/r data. Missing data is sparse, but might have to investigate.
        data_with_sentiment = data_with_sentiment.ffill()
        log_returns = data_with_sentiment['log_returns']
        data_with_sentiment = data_with_sentiment.reset_index()
        relevant_columns = ['Date','Stock Splits','log_returns','pos_sentiment','neg_sentiment','neutral_sentiment', 'pos_sentiment_w_preamb','neg_sentiment_w_preamb','neutral_sentiment_w_preamb','neg_sentiment_lag1','neg_sentiment_diff','4 WEEKS BANK DISCOUNT']
        data_with_sentiment = data_with_sentiment[relevant_columns]
        agg_func = {
            'pos_sentiment': 'mean',
            'neg_sentiment': 'mean',
            'neutral_sentiment': 'mean',
            'pos_sentiment_w_preamb': 'mean',
            'neg_sentiment_w_preamb': 'mean',
            'neutral_sentiment_w_preamb': 'mean',
            'Stock Splits': 'mean',
            'log_returns': 'mean',
            'neg_sentiment_lag1': 'mean',
            'neg_sentiment_diff': 'mean',
            '4 WEEKS BANK DISCOUNT': 'mean'}
        column_rename = {
            'pos_sentiment': 'mean_pos_sentiment',
            'neg_sentiment': 'mean_neg_sentiment',
            'neutral_sentiment': 'mean_neutral_sentiment',
            'pos_sentiment_w_preamb': 'mean_pos_preamble_sentiment',
            'neg_sentiment_w_preamb': 'mean_neg_preamble_sentiment',
            'neutral_sentiment_w_preamb': 'mean_neutral_preamble_sentiment'}
        final_df = data_with_sentiment.groupby(by='Date').agg(agg_func).rename(columns=column_rename).reset_index()
        return final_df

        
    
    #Option 2: 
    # On days where there is no stock sentiment, we replace it with market sentiment. 
    # On days with, we just use stock sentiment alone
    if option == 2:
        agg_func = {
            'pos_sentiment': 'mean',
            'neg_sentiment': 'mean',
            'neutral_sentiment': 'mean',
            'pos_sentiment_w_preamb': 'mean',
            'neg_sentiment_w_preamb': 'mean',
            'neutral_sentiment_w_preamb': 'mean'}
        column_rename = {
            'pos_sentiment': 'mean_pos_sentiment',
            'neg_sentiment': 'mean_neg_sentiment',
            'neutral_sentiment': 'mean_neutral_sentiment',
            'pos_sentiment_w_preamb': 'mean_pos_preamble_sentiment',
            'neg_sentiment_w_preamb': 'mean_neg_preamble_sentiment',
            'neutral_sentiment_w_preamb': 'mean_neutral_preamble_sentiment'}
        #stocksentiment_df = stocksentiment_df.drop(columns=['adjusted_date'])
        #marketsentiment_df = marketsentiment_df.drop(columns=['adjusted_date'])
        stocksentiment_df_agg = stocksentiment_df.groupby(by='adjusted_date').agg(agg_func).rename(columns=column_rename).reset_index()
        stocksentiment_df_agg['News'] = stock_name
        marketsentiment_df_agg = marketsentiment_df.groupby(by='adjusted_date').agg(agg_func).rename(columns=column_rename).reset_index()
        marketsentiment_df_agg['News'] = 'Market'
        stocksentiment_df_agg.index = pd.DatetimeIndex(stocksentiment_df_agg['adjusted_date'])
        marketsentiment_df_agg.index = pd.DatetimeIndex(marketsentiment_df_agg['adjusted_date'])
        data_with_sentiment = prices_df.join(stocksentiment_df_agg, how='left')#.join(vix[['VIX Close']], how='left')
        data_with_sentiment = data_with_sentiment.join(marketsentiment_df_agg, how='left',lsuffix= stock_name, rsuffix= 'market')#.join(vix[['VIX Close']], how='left')
        #print(data_with_sentiment.columns)
        data_with_sentiment.rename(columns={
            'mean_pos_sentimenttesla': 'mean_pos_sentiment',
            'mean_neg_sentimenttesla': 'mean_neg_sentiment',
            'mean_neutral_sentimenttesla': 'mean_neutral_sentiment',
            'mean_pos_preamble_sentimenttesla': 'mean_pos_preamble_sentiment',
            'mean_neg_preamble_sentimenttesla': 'mean_neg_preamble_sentiment',
            'mean_neutral_preamble_sentimenttesla': 'mean_neutral_preamble_sentiment',
            'Newstesla':'News'}, inplace=True)
        
        sentiment_columns = ["mean_pos_sentiment", "mean_neg_sentiment", "mean_neutral_sentiment",
                         "mean_pos_preamble_sentiment", "mean_neg_preamble_sentiment", "mean_neutral_preamble_sentiment",'News']
        
        stock_columns = [col for col in sentiment_columns]
        market_columns = [col + "market" for col in sentiment_columns]
        # Ensure stock_columns and market_columns are correctly aligned
        for stock_col, market_col in zip(stock_columns, market_columns):
            data_with_sentiment[stock_col] = data_with_sentiment[stock_col].where(
            data_with_sentiment[stock_col].notna(),  # Keep stock sentiment if it's NOT NaN
            data_with_sentiment[market_col]  # Otherwise, replace with market sentiment
            )

        #rename_columns = {col + stock_name : col for col in sentiment_columns}
        #data_with_sentiment.rename(columns=rename_columns, inplace=True)
        market_sentiment_cols = [col + "market" for col in sentiment_columns]
        data_with_sentiment.drop(columns=market_sentiment_cols, inplace=True)
        return data_with_sentiment

    #Option 3: 
    # On days where there is no stock sentiment, we replace it with market sentiment.
    # On days where there is stock sentiment, we will do the aggregation (50% stock sentiment, 50% market sentiment)
    if option == 3:
        agg_func = {
            'pos_sentiment': 'mean',
            'neg_sentiment': 'mean',
            'neutral_sentiment': 'mean',
            'pos_sentiment_w_preamb': 'mean',
            'neg_sentiment_w_preamb': 'mean',
            'neutral_sentiment_w_preamb': 'mean'}
        column_rename = {
            'pos_sentiment': 'mean_pos_sentiment',
            'neg_sentiment': 'mean_neg_sentiment',
            'neutral_sentiment': 'mean_neutral_sentiment',
            'pos_sentiment_w_preamb': 'mean_pos_preamble_sentiment',
            'neg_sentiment_w_preamb': 'mean_neg_preamble_sentiment',
            'neutral_sentiment_w_preamb': 'mean_neutral_preamble_sentiment'}
        stocksentiment_df_agg = stocksentiment_df.groupby(by='adjusted_date').agg(agg_func).rename(columns=column_rename).reset_index()
        marketsentiment_df_agg = marketsentiment_df.groupby(by='adjusted_date').agg(agg_func).rename(columns=column_rename).reset_index()
        #This will give me a df where there are two entries per day if there exists both market and stock sentiment
        unionsentiment_df_agg =  pd.concat([stocksentiment_df_agg, marketsentiment_df_agg], ignore_index=True)
        unionsentiment_df_agg.index = pd.DatetimeIndex(unionsentiment_df_agg['adjusted_date'])
        data_with_sentiment = prices_df.join(unionsentiment_df_agg, how='inner')#.join(vix[['VIX Close']], how='left')
        data_with_sentiment = data_with_sentiment.drop(columns=['Date', 'adjusted_date'])
        data_with_sentiment['neg_sentiment_lag1'] = data_with_sentiment['mean_neg_sentiment'].shift(1)
        data_with_sentiment['neg_sentiment_diff'] = data_with_sentiment['mean_neg_sentiment'] - data_with_sentiment['neg_sentiment_lag1']
        data_with_sentiment = data_with_sentiment.dropna()
        data_with_sentiment = data_with_sentiment.join(t_rates_df)
        data_with_sentiment = data_with_sentiment.reset_index()
        relevant_columns = ['Date','Stock Splits','log_returns','mean_pos_sentiment','mean_neg_sentiment','mean_neutral_sentiment', 'mean_pos_preamble_sentiment','mean_neg_preamble_sentiment','mean_neutral_preamble_sentiment','neg_sentiment_lag1','neg_sentiment_diff','4 WEEKS BANK DISCOUNT']
        data_with_sentiment = data_with_sentiment[relevant_columns]
        agg_func = {
            'mean_pos_sentiment': 'mean',
            'mean_neg_sentiment': 'mean',
            'mean_neutral_sentiment': 'mean',
            'mean_pos_preamble_sentiment': 'mean',
            'mean_neg_preamble_sentiment': 'mean',
            'mean_neutral_preamble_sentiment': 'mean',
            'Stock Splits': 'mean',
            'log_returns': 'mean',
            'neg_sentiment_lag1': 'mean',
            'neg_sentiment_diff': 'mean',
            '4 WEEKS BANK DISCOUNT': 'mean'}

        data_with_sentiment_agg = data_with_sentiment.groupby(by='Date').agg(agg_func).reset_index()
        return data_with_sentiment_agg

    #Option 4: 
    # On days where there is no stock sentiment, we replace it with market sentiment.
    # On days where there is stock sentiment, we aggregate based on the number of news articles.
    if option == 4:
        total_sentiment =  pd.concat([stocksentiment_df.reset_index(), marketsentiment_df.reset_index()], ignore_index=True)
        agg_func = {
            'pos_sentiment': 'mean',
            'neg_sentiment': 'mean',
            'neutral_sentiment': 'mean',
            'pos_sentiment_w_preamb': 'mean',
            'neg_sentiment_w_preamb': 'mean',
            'neutral_sentiment_w_preamb': 'mean'}
        column_rename = {
            'pos_sentiment': 'mean_pos_sentiment',
            'neg_sentiment': 'mean_neg_sentiment',
            'neutral_sentiment': 'mean_neutral_sentiment',
            'pos_sentiment_w_preamb': 'mean_pos_preamble_sentiment',
            'neg_sentiment_w_preamb': 'mean_neg_preamble_sentiment',
            'neutral_sentiment_w_preamb': 'mean_neutral_preamble_sentiment'}

        total_sentiment_agg = total_sentiment.groupby(by='adjusted_date').agg(agg_func).reset_index()
        total_sentiment_agg.index = pd.DatetimeIndex(total_sentiment_agg['adjusted_date'])
        data_with_sentiment = prices_df.join(total_sentiment_agg, how='left')#.join(vix[['VIX Close']], how='left')
        return data_with_sentiment

In [328]:
tesla_v1_1 = merge_cleanup(prices_df, tesla_v1_with_market,1,'tesla')
tesla_v1_2 = merge_cleanup(prices_df, tesla_v1_with_market,2,'tesla')
tesla_v1_3 = merge_cleanup(prices_df, tesla_v1_with_market,3,'tesla')
tesla_v1_4 = merge_cleanup(prices_df, tesla_v1_with_market,4,'tesla')

In [329]:
tesla_v2_1 = merge_cleanup(prices_df, tesla_v2_with_market,1,'tesla')
tesla_v2_2 = merge_cleanup(prices_df, tesla_v2_with_market,2,'tesla')
tesla_v2_3 = merge_cleanup(prices_df, tesla_v2_with_market,3,'tesla')
tesla_v2_4 = merge_cleanup(prices_df, tesla_v2_with_market,4,'tesla')

In [None]:
garch_with_sentiment = GARCH(p=1, q=1, z=1, verbose=True)
garch_with_sentiment.train(100*log_returns, x=exo_sentiment)

garch_with_sentiment.summary()