In [1]:
import pandas as pd

In [49]:
sentiment_index_df = pd.read_excel('https://raw.githubusercontent.com/inga-maria01/master_thesis/main/index/sentiment_index_unweighted_v5.xlsx').set_index('date')


In [51]:
sentiment_index_df.head()

Unnamed: 0_level_0,sentiment_score,sentiment_slope
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02,14.185966,-0.127143
2015-01-05,18.631666,-1.688521
2015-01-06,10.882382,0.48448
2015-01-07,17.620721,-1.852799
2015-01-08,15.960441,0.916068


In [59]:
sentiment_index_lag_df = sentiment_index_df.shift(1).drop(sentiment_index_df.index[0]).rename(columns={'sentiment_score': 'sentiment_score_lag', 'sentiment_slope': 'sentiment_slope_lag'})

In [60]:
sentiment_index_lag_df.head()

Unnamed: 0_level_0,sentiment_score_lag,sentiment_slope_lag
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-05,14.185966,-0.127143
2015-01-06,18.631666,-1.688521
2015-01-07,10.882382,0.48448
2015-01-08,17.620721,-1.852799
2015-01-09,15.960441,0.916068


## Define input and target variables

In [61]:
input = sentiment_index_lag_df.copy()
target = pd.DataFrame(sentiment_index_df['sentiment_score']).drop(sentiment_index_df.index[0])

In [66]:
input.head()

Unnamed: 0_level_0,sentiment_score_lag,sentiment_slope_lag
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-05,14.185966,-0.127143
2015-01-06,18.631666,-1.688521
2015-01-07,10.882382,0.48448
2015-01-08,17.620721,-1.852799
2015-01-09,15.960441,0.916068


In [67]:
target.head()

Unnamed: 0_level_0,sentiment_score
date,Unnamed: 1_level_1
2015-01-05,18.631666
2015-01-06,10.882382
2015-01-07,17.620721
2015-01-08,15.960441
2015-01-09,20.128338


## Random Forest

In [68]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Assuming 'input' and 'target' are your DataFrames with lagged input variables and current sentiment scores, respectively
# Assuming the input variables are sentiment_score_lag1 and sentiment_slope_lag1
# Assuming the current sentiment score to be predicted is sentiment_score

# Define the size of the rolling window
window_size = 30  # Adjust as needed

# Initialize lists to store predictions and actual values
predictions = []
actual_values = []

# Iterate over each date in the target DataFrame, using a rolling window
for date in target.index[window_size:]:
    # Get the start and end dates for the rolling window
    window_start = date - pd.DateOffset(days=window_size)
    window_end = date - pd.DateOffset(days=1)
    
    # Extract the input variables for the current rolling window
    window_input = input.loc[window_start:window_end]
    
    # Extract the target variable (current sentiment score) for the current date
    target_value = target.loc[date, 'sentiment_score']
    
    # Prepare the input features and target variable for training
    X = window_input[['sentiment_score_lag1', 'sentiment_slope_lag1']]
    y = window_input['sentiment_score']
    
    # Initialize and train the Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    # Make a one-day-ahead prediction for the current sentiment score
    prediction = model.predict([[window_input.loc[date, 'sentiment_score_lag1'], 
                                  window_input.loc[date, 'sentiment_slope_lag1']]])  
    
    # Append the prediction and actual value to the respective lists
    predictions.append(prediction)
    actual_values.append(target_value)

# Calculate the mean squared error for the predictions
mse = mean_squared_error(actual_values, predictions)

# Calculate the R² score
r2 = r2_score(actual_values, predictions)

print("Mean Squared Error:", mse)
print("R² Score:", r2)


KeyError: "None of [Index(['sentiment_score_lag1', 'sentiment_slope_lag1'], dtype='object')] are in the [columns]"

In [38]:
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.metrics import R2Score
from sklearn.preprocessing import MinMaxScaler


# Set seeds to ensure reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Load your data
data = sentiment_index_df

# Normalize the data
scaler = MinMaxScaler()
data[['sentiment_score', 'sentiment_slope']] = scaler.fit_transform(data[['sentiment_score', 'sentiment_slope']])

# Function to create sequences
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data)):
        end_ix = i + n_steps
        if end_ix > len(data)-1:
            break
        seq_x, seq_y = data[i:end_ix], data[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

# Define the number of steps
n_steps = 5
sequence_data = data[['sentiment_score', 'sentiment_slope']].values
X, y = create_sequences(sequence_data, n_steps)

# Split the data into training and test sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size, 0], y[train_size:, 0]

# Build the LSTM model
model = Sequential([
    LSTM(100, activation='relu', input_shape=(n_steps, 2)),
    # Dropout(0.2),
    Dense(1)
])

# Compile the model using the built-in R2Score metric
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[R2Score()])

# Train the model
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), verbose=1)

# Evaluate the model
train_loss, train_r2 = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_r2 = model.evaluate(X_test, y_test, verbose=0)
print('Train loss:', train_loss, 'Train R²:', train_r2)
print('Test loss:', test_loss, 'Test R²:', test_r2)


In [39]:
y

array([[0.59201808, 0.16814289],
       [0.32958076, 0.46549857],
       [0.5939269 , 0.25969023],
       ...,
       [0.55896241, 0.60681399],
       [0.78599471, 0.35262893],
       [0.75149786, 0.31008715]])

In [40]:
X

array([[[0.39259014, 0.41510382],
        [0.54178927, 0.2589343 ],
        [0.28172074, 0.47627857],
        [0.50786165, 0.24250312],
        [0.45214207, 0.51944616]],

       [[0.54178927, 0.2589343 ],
        [0.28172074, 0.47627857],
        [0.50786165, 0.24250312],
        [0.45214207, 0.51944616],
        [0.59201808, 0.16814289]],

       [[0.28172074, 0.47627857],
        [0.50786165, 0.24250312],
        [0.45214207, 0.51944616],
        [0.59201808, 0.16814289],
        [0.32958076, 0.46549857]],

       ...,

       [[0.78634132, 0.5311224 ],
        [0.57455571, 0.21753637],
        [0.80706174, 0.36190239],
        [0.5279942 , 0.21987101],
        [0.95618685, 0.60240313]],

       [[0.57455571, 0.21753637],
        [0.80706174, 0.36190239],
        [0.5279942 , 0.21987101],
        [0.95618685, 0.60240313],
        [0.55896241, 0.60681399]],

       [[0.80706174, 0.36190239],
        [0.5279942 , 0.21987101],
        [0.95618685, 0.60240313],
        [0.55896241, 0.60

In [34]:
y_train

array([0.59201808, 0.32958076, 0.5939269 , ..., 0.35304032, 0.64506056,
       0.19722688])

## Load more input data
Trying to include more input variables to improve the prediction

In [28]:
further_inputs_df = pd.read_csv('https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data/model_data_no_sentiment.csv').drop(columns=['Unnamed: 0'])
further_inputs_df['Date'] = pd.to_datetime(further_inputs_df['Date'])

In [29]:
further_inputs_df.head()

Unnamed: 0,Date,Volatility,stock_returns,stock_volume,US_log_inflation,US_interest_rate,consumer_barometer,Volatility_lag1,stock_returns_lag1,stock_volume_lag1,US_log_inflation_lag1,US_interest_rate_lag1,consumer_barometer_lag1
0,2015-01-06,19.05,-1.340836,66205500,-0.639069,0.75,0.353886,18.0,-1.466868,36521300.0,-0.639069,0.75,0.353886
1,2015-01-07,17.2,1.289125,37577400,-0.639069,0.75,0.353886,19.05,-1.340836,66205500.0,-0.639069,0.75,0.353886
2,2015-01-08,16.61,1.91396,40212600,-0.639069,0.75,0.353886,17.2,1.289125,37577400.0,-0.639069,0.75,0.353886
3,2015-01-09,16.6,-0.658282,41410100,-0.639069,0.75,0.353886,16.61,1.91396,40212600.0,-0.639069,0.75,0.353886
4,2015-01-12,17.88,-1.042701,34129800,-0.639069,0.75,0.353886,16.6,-0.658282,41410100.0,-0.639069,0.75,0.353886


In [31]:
QQQ_df = pd.read_csv('https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data/QQQ.csv')

In [32]:
QQQ_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-02,103.760002,104.199997,102.440002,102.940002,95.704712,31314600
1,2015-01-05,102.489998,102.610001,101.139999,101.430000,94.300850,36521300
2,2015-01-06,101.580002,101.750000,99.620003,100.070000,93.036430,66205500
3,2015-01-07,100.730003,101.599998,100.489998,101.360001,94.235786,37577400
4,2015-01-08,102.220001,103.500000,102.110001,103.300003,96.039421,40212600
...,...,...,...,...,...,...,...
1252,2019-12-23,212.000000,212.149994,211.630005,211.809998,206.367172,23774700
1253,2019-12-24,212.000000,212.089996,211.440002,211.919998,206.474350,7089000
1254,2019-12-26,212.259995,213.809998,212.229996,213.789993,208.296280,17067500
1255,2019-12-27,214.539993,214.559998,213.039993,213.610001,208.120926,18134100


### lag the sentiment_score and sentiment_slope

In [41]:
sentiment_index_lag_df = sentiment_index_df.shift(1)

In [43]:
sentiment_index_df

Unnamed: 0,date,sentiment_score,sentiment_slope
0,2015-01-02,0.392590,0.415104
1,2015-01-05,0.541789,0.258934
2,2015-01-06,0.281721,0.476279
3,2015-01-07,0.507862,0.242503
4,2015-01-08,0.452142,0.519446
...,...,...,...
1253,2019-12-24,0.527994,0.219871
1254,2019-12-26,0.956187,0.602403
1255,2019-12-27,0.558962,0.606814
1256,2019-12-30,0.785995,0.352629


In [42]:
sentiment_index_lag_df

Unnamed: 0,date,sentiment_score,sentiment_slope
0,NaT,,
1,2015-01-02,0.392590,0.415104
2,2015-01-05,0.541789,0.258934
3,2015-01-06,0.281721,0.476279
4,2015-01-07,0.507862,0.242503
...,...,...,...
1253,2019-12-23,0.807062,0.361902
1254,2019-12-24,0.527994,0.219871
1255,2019-12-26,0.956187,0.602403
1256,2019-12-27,0.558962,0.606814
