In [1]:
import pandas as pd

In [2]:
# Source workbook for the unweighted sentiment index (v7), read directly from the thesis repository
SENTIMENT_INDEX_URL = 'https://raw.githubusercontent.com/inga-maria01/master_thesis/main/index/sentiment_index_unweighted_v7.xlsx'
sentiment_idx_df = pd.read_excel(SENTIMENT_INDEX_URL)


In [3]:
# Preview the loaded sentiment index (a bare last expression is rendered as a table)
sentiment_idx_df

Unnamed: 0,date,sentiment_score,sentiment_slope
0,2015-01-01,17.132027,-1.195109
1,2015-01-02,14.827878,-0.130807
2,2015-01-03,18.088204,0.938619
3,2015-01-04,16.568480,1.079692
4,2015-01-05,18.532825,-1.688521
...,...,...,...
1821,2019-12-27,22.540491,2.444858
1822,2019-12-28,25.024275,-1.818651
1823,2019-12-29,26.507670,-0.658612
1824,2019-12-30,23.692474,-0.751765


In [6]:
# Inspect column types; the date-based slicing in later cells needs `date` to be datetime64
sentiment_idx_df.dtypes

date               datetime64[ns]
sentiment_score           float64
sentiment_slope           float64
dtype: object

In [7]:
# Check for missing days
import pandas as pd

def find_missing_days(df, date_col='date', start_date='2015-01-01', end_date='2019-12-31'):
    """
    Find calendar days within a date range that are absent from a DataFrame.

    The input DataFrame is never modified: dates are parsed into a separate
    object instead of being written back to ``df``.

    Args:
        df (pd.DataFrame): DataFrame holding the dates either in the column
            ``date_col`` or in an index named ``date_col``.
        date_col (str): The name of the date column (or of the date index).
        start_date (str): The start date in 'YYYY-MM-DD' format (inclusive).
        end_date (str): The end date in 'YYYY-MM-DD' format (inclusive).

    Returns:
        pd.DatetimeIndex: The days of the daily range ``start_date``..``end_date``
        that do not appear among the DataFrame's dates.

    Raises:
        KeyError: If ``date_col`` is neither a column nor the index name of ``df``.
    """
    # Locate the dates without touching the caller's frame
    if date_col in df.columns:
        raw_dates = df[date_col]
    elif df.index.name == date_col:
        raw_dates = df.index
    else:
        raise KeyError(date_col)

    # Parse into a new DatetimeIndex (no write-back to df)
    present_dates = pd.DatetimeIndex(pd.to_datetime(raw_dates))

    # Generate a complete date range with daily frequency
    full_range = pd.date_range(start=start_date, end=end_date, freq='D')

    # Dates outside the range cannot affect the difference, so no pre-filtering is needed
    missing_dates = full_range.difference(present_dates)

    return missing_dates

In [8]:
# Scan the 2015-01-01 .. 2019-12-31 sample for calendar days absent from the index
missing_days = find_missing_days(
    sentiment_idx_df,
    date_col='date',
    start_date='2015-01-01',
    end_date='2019-12-31',
)
print(missing_days)

DatetimeIndex([], dtype='datetime64[ns]', freq='D')


There are no missing dates between 2015-01-01 and 2019-12-31.

Next, fit a two-regime Markov regime-switching (MRS) regression model to the sentiment score.

In [None]:
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression

# Date-indexed frame; set_index returns a new object, so sentiment_idx_df is left as loaded
df = sentiment_idx_df.set_index('date')

# Two-regime Markov-switching regression with a regime-specific constant (trend='c')
# and a regime-specific error variance (switching_variance=True)
model = MarkovRegression(df['sentiment_score'], k_regimes=2, trend='c', switching_variance=True)

# Fit the model to the data
result = model.fit()

# One-step-ahead regime probabilities.
# The results object's predict() takes start/end (it has no `n_periods` argument)
# and returns in-sample predictions of the series, not regimes. The next-day
# regime distribution is therefore built from the fitted transition matrix:
#   P(S_{T+1} = i) = sum_j P(S_{T+1} = i | S_T = j) * P(S_T = j | data up to T)
transition_matrix = result.regime_transition[:, :, 0]  # entry [i, j] = P(S_t = i | S_{t-1} = j)
last_filtered_probs = result.filtered_marginal_probabilities.iloc[-1].to_numpy()
predicted_regimes = transition_matrix @ last_filtered_probs

# Most likely regime for the next day (regime label 0 or 1)
next_day_regime = int(predicted_regimes.argmax())

# Get full summary of results
summary = result.summary()
print(summary)


In [15]:
# Create copy of sentiment index
# Build a date-indexed frame; set_index returns a new object, so the loaded frame is not altered
df = sentiment_idx_df.set_index('date')

# Chronological hold-out split. Label slices on the date index include both
# endpoints: training runs through 2018-06-30, testing starts on 2018-07-01.
train_data = df.loc[:'2018-06-30'].copy()
test_data = df.loc['2018-07-01':].copy()


In [18]:
# Display the training split (rows up to and including 2018-06-30)
train_data

Unnamed: 0_level_0,sentiment_score,sentiment_slope
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,17.132027,-1.195109
2015-01-02,14.827878,-0.130807
2015-01-03,18.088204,0.938619
2015-01-04,16.568480,1.079692
2015-01-05,18.532825,-1.688521
...,...,...
2018-06-26,23.830125,-0.589143
2018-06-27,19.477187,-0.789443
2018-06-28,26.278540,1.807002
2018-06-29,19.547577,-1.919690


In [19]:
# Display the test split (rows from 2018-07-01 onward)
test_data

Unnamed: 0_level_0,sentiment_score,sentiment_slope
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-07-01,24.215623,1.497774
2018-07-02,24.988730,0.397401
2018-07-03,19.706582,1.138870
2018-07-04,25.836597,-0.953103
2018-07-05,21.223874,-0.973658
...,...,...
2019-12-27,22.540491,2.444858
2019-12-28,25.024275,-1.818651
2019-12-29,26.507670,-0.658612
2019-12-30,23.692474,-0.751765


In [None]:

# Rolling-window one-step-ahead ARIMA forecasts over the test period.
# The window length stays fixed at the initial training size: after each
# forecast the realised value is appended and the oldest observation is dropped.
# The ARIMA(10, 0, 10) model is refit once per test date, so this cell is slow.

import warnings

import numpy as np
from statsmodels.tsa.arima.model import ARIMA

# Ignore all warnings (presumably to silence per-refit statsmodels warnings;
# note this suppresses warnings for the rest of the session)
warnings.filterwarnings("ignore")


# Per-step residuals, point forecasts and 95% confidence intervals
res_list = []
forecast_list = []
conf = []

# Initial training and test sets.
# NOTE(review): portfolio_df_sent is not created in the cells shown here; it is
# assumed to be a date-indexed frame with a 'moving_average_10day' column.
# Label slices include both endpoints, so training must stop on 2018-06-30;
# otherwise 2018-07-01 would sit in both the training and the test set.
ma_series = portfolio_df_sent['moving_average_10day']
train_data = ma_series.loc[:'2018-06-30'].dropna().copy()
test_data = ma_series.loc['2018-07-01':].copy()

for i in test_data.index:
    actual = test_data.loc[i]

    model = ARIMA(train_data, order=(10, 0, 10))
    model_fit = model.fit()

    # One-step-ahead forecast with its 95% confidence interval
    get_forecast = model_fit.get_forecast(steps=1)
    forecast = get_forecast.predicted_mean
    forecast_conf = get_forecast.conf_int(alpha=0.05)

    forecast_list.append(forecast)
    conf.append(forecast_conf)

    # Forecast error for this date (a one-element Series, like `forecast`)
    res = actual - forecast
    res_list.append(res)

    # Roll the estimation window: append the realised value, drop the oldest one
    test_to_train = pd.Series([actual], index=[i])
    train_data = pd.concat([train_data, test_to_train])
    train_data = train_data.iloc[1:]

# Mean squared error over ALL one-step forecasts (not only the last residual)
residuals = np.array([float(r.iloc[0]) for r in res_list])
mse = np.mean(np.square(residuals)) if residuals.size else float('nan')
print("Overall average MSE:", mse)