In [1]:
# Load data and define sharpe function to evaluate the models: 
#need volatility data and the data from the risk_adjusted_returns file:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Get volatility historical data:
url_volatility = 'https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data/volatility_data.csv'
volatility_df = pd.read_csv(url_volatility)
#make the date a datetime object:
volatility_df['Date'] = pd.to_datetime(volatility_df['Date'])

# first we need the historical data: 
url_nasdaq_price = 'https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data/QQQ.csv'
price_df = pd.read_csv(url_nasdaq_price)
price_df['Date'] = pd.to_datetime(price_df['Date'])
price_df.head()
#this dataset includes the prices and the volume!
# we now need to calculate the returns:
price_df['r_ndq'] = (price_df['Adj Close'] - price_df['Adj Close'].shift(1)) / price_df['Adj Close'].shift(1)
price_df.dropna(inplace=True)
price_df.head()

rf_df = pd.read_csv('https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data/IEF.csv')
rf_df['r_rf'] = (rf_df['Adj Close'] - rf_df['Adj Close'].shift(1)) / rf_df['Adj Close'].shift(1)
rf_df.dropna(inplace=True)
rf_df['Date'] = pd.to_datetime(rf_df['Date'])

portfolio_df = pd.merge(price_df[['Date', 'r_ndq']], 
                     rf_df[['Date', 'r_rf']], 
                     on='Date', 
                     how='inner')

portfolio_df = pd.merge(portfolio_df,  
                     volatility_df[['Date', 'Price']],
                     on='Date', 
                     how='inner')

portfolio_df.rename(columns={'Price' : 'Volatility'}, inplace = True)
portfolio_df.head()

r_rf_df = pd.read_csv('https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data/risk_free_rate.csv')

#divide by 100 because it was already in % -> we dont want that here!
#then divide by 252 since its on a yearly basis but everything else is on a daily basis
#(252 ~ number of us trading days a year)
r_rf_df['daily_rf'] = r_rf_df['rf_rate']/(100*250)
r_rf_df['Date'] = pd.to_datetime(r_rf_df['Date'])
#r_rf_df.set_index('Date', inplace=True)
r_rf_df.head()

portfolio_df = pd.merge(portfolio_df, 
                        r_rf_df[['Date', 'rf_rate', 'daily_rf']], 
                        on='Date', 
                        how='inner')

sentiment_index_df = pd.read_excel('https://raw.githubusercontent.com/inga-maria01/master_thesis/main/index/sentiment_index_unweighted_v7.xlsx')
sentiment_index_df.rename(columns={'date':'Date'}, inplace=True)
# sentiment_index_df['lag_sentiment'] = sentiment_index_df['sentiment_score'].shift(1)
sentiment_index_df.head()

portfolio_df_sent = pd.merge(portfolio_df, sentiment_index_df, on='Date', how='inner')
portfolio_df_sent.set_index('Date', inplace=True)
portfolio_df_sent['moving_average_10day'] = portfolio_df_sent['sentiment_score'].rolling(window = 10).mean() # maybe try exponential?
portfolio_df_sent['moving_average_5day'] = portfolio_df_sent['sentiment_score'].rolling(window = 5).mean() # maybe try exponential?
portfolio_df_sent['moving_average_20day'] = portfolio_df_sent['sentiment_score'].rolling(window = 20).mean() # maybe try exponential?
portfolio_df_sent['moving_average_30day'] = portfolio_df_sent['sentiment_score'].rolling(window = 30).mean() # maybe try exponential?
portfolio_df_sent['moving_average_15day'] = portfolio_df_sent['sentiment_score'].rolling(window = 15).mean() # maybe try exponential?

#want to just look at the training period for threshold optimization
#and then just at the test period for calculating the sharpe
portfolio_df_sent_train = portfolio_df_sent[:'2018-06-30'].copy()
portfolio_df_sent_test = portfolio_df_sent['2018-07-01':].copy()


data_df = pd.merge(portfolio_df_sent, price_df[['Date', 'Close', 'Adj Close']], on='Date', how='inner')
data_df = data_df.rename(columns={'Close': 'close_qqq', 'Adj Close': 'adj_close_qqq'})



In [2]:
# future MA includes today as today's sentiment is still unknown before the trading day 
portfolio_df_sent['sentiment_ma_next_10'] = portfolio_df_sent['sentiment_score'].rolling(window=10).mean().shift(-9)

portfolio_df_sent['sentiment_ma_next_5'] = portfolio_df_sent['sentiment_score'].rolling(window=5).mean().shift(-4)

portfolio_df_sent['sentiment_ma_next_15'] = portfolio_df_sent['sentiment_score'].rolling(window=15).mean().shift(-14)


In [3]:
# ma_past = portfolio_df_sent.copy()
# ma_past = ma_past.drop(columns='sentiment_ma_next_10')
# ma_past = ma_past.dropna()

ma_future_5day = portfolio_df_sent.copy()
ma_future_5day = ma_future_5day[ma_future_5day['sentiment_ma_next_5'].notna()]

In [4]:
def sharpe_correct_5day(sentiment, version): # version is either sentiment_score or moving_average_10day
    w_ndq = []
    w_rf = []

    # Assuming 'sentiment' is a pandas Series
    
    upper = ma_future_5day[version].shift(1).rolling(63).quantile(0.75) #apply(lambda x: weighted_quantile(x, quantile=0.75), raw=True)
    lower = ma_future_5day[version].shift(1).rolling(63).quantile(0.25) #apply(lambda x: weighted_quantile(x, quantile=0.25), raw=True)
    test_sent = sentiment['2018-07-01':]
    test_df = ma_future_5day['2018-07-01':].copy()
    for i in test_sent.index:
        score = sentiment[i]
        if score > upper[i]: #23: changing to see if it improves
            w_ndq.append(0.8)
            w_rf.append(0.2)
        elif score <= upper[i] and score > lower[i]: # was 20:
            w_ndq.append(0.6)
            w_rf.append(0.4)
        elif score <= lower[i]:
            w_ndq.append(0.2)
            w_rf.append(0.8)
    test_df['w_ndq'] = w_ndq
    test_df['w_rf'] = w_rf
    test_df['r_portfolio'] = test_df['w_ndq'] * test_df['r_ndq'] + test_df['w_rf'] * test_df['r_rf']
    test_df['excess returns'] = test_df['r_portfolio'] - test_df['daily_rf']
    test_df['trading day'] = (test_df['w_ndq'] != test_df['w_ndq'].shift(1)).astype(int)
    
    test_df.reset_index(inplace=True)
    # Identify maximum index in the dataset
    max_index = test_df.index.max()

    # Initialize starting portfolio value
    initial_portfolio_value = 1

    # Create a new DataFrame to store calculated values
    df_calc = pd.DataFrame(index=test_df.index, columns=['portfolio_value', 'ndq_start', 'rf_start', 'ndq_end', 'rf_end'])
    
    # Initialize first values from row 0
    df_calc.loc[0, 'portfolio_value'] = initial_portfolio_value
    df_calc.loc[0, 'ndq_start'] = 0
    df_calc.loc[0, 'rf_start'] = 0
    df_calc.loc[0, 'ndq_end'] = w_ndq[0] * initial_portfolio_value
    df_calc.loc[0, 'rf_end'] = w_rf[0] * initial_portfolio_value

    # Loop through rows from 1 to max_index to calculate the required columns
    for i in range(1, max_index + 1):
        # Calculate ndq_start and rf_start
        df_calc.loc[i, 'ndq_start'] = df_calc.loc[i - 1, 'ndq_end'] * (1 + test_df.loc[i, 'r_ndq'])
        df_calc.loc[i, 'rf_start'] = df_calc.loc[i - 1, 'rf_end'] * (1 + test_df.loc[i, 'r_rf'])

        # Calculate the new portfolio value
        df_calc.loc[i, 'portfolio_value'] = df_calc.loc[i, 'ndq_start'] + df_calc.loc[i, 'rf_start']

        # Calculate ndq_end and rf_end based on updated portfolio value
        df_calc.loc[i, 'ndq_end'] =  w_ndq[i] * df_calc.loc[i, 'portfolio_value']
        df_calc.loc[i, 'rf_end'] = w_rf[i] * df_calc.loc[i, 'portfolio_value']
    
    return2019 = (df_calc['portfolio_value'][-1:]/df_calc['portfolio_value'][0]) **(1/(365/250)) - 1
    risk_free2019 = test_df['rf_rate'].mean()/100
    std2019 = np.std(test_df['excess returns'])*np.sqrt(250)

    test_df.set_index('Date', inplace=True)
    
    return (return2019 - risk_free2019)/std2019, df_calc, test_df, upper, lower



In [5]:
data_df.set_index('Date', inplace=True)
ma_future_only = ma_future_5day.copy()
ma_future_only = pd.DataFrame(ma_future_only['sentiment_ma_next_5'])
merged_df = ma_future_only.join(data_df, how='inner')
original_features = merged_df.columns

original_features = original_features.drop(['r_rf', 'daily_rf'])
original_features
# want to use lagged values of everything but the future moving average itself

import pandas as pd

df = merged_df.copy()
# Create lagged features
for feature in original_features:
    for lag in range(1, 4):  # 3 lags
        df[f'{feature}_lag_{lag}'] = df[feature].shift(lag)

# Drop rows with NaN values that were created due to lagging
df = df.dropna()

# Split the data into training and testing sets based on the year
train_ma_df = df[:'2018-06-30']
test_ma_df = df['2018-07-01':]

# Prepare features and target variables
features_ma = [f for f in df.columns if 'lag' in f and 'sentiment_ma_next' not in f]

X_train_ma = train_ma_df[features_ma]
y_train_ma = train_ma_df['sentiment_ma_next_5']
y_train_sc = train_ma_df['sentiment_score']

X_test_ma = test_ma_df[features_ma]
y_test_ma = test_ma_df['sentiment_ma_next_5']
y_test_sc = test_ma_df['sentiment_score']

In [13]:
print(features_ma)

['r_ndq_lag_1', 'r_ndq_lag_2', 'r_ndq_lag_3', 'Volatility_lag_1', 'Volatility_lag_2', 'Volatility_lag_3', 'rf_rate_lag_1', 'rf_rate_lag_2', 'rf_rate_lag_3', 'sentiment_score_lag_1', 'sentiment_score_lag_2', 'sentiment_score_lag_3', 'sentiment_slope_lag_1', 'sentiment_slope_lag_2', 'sentiment_slope_lag_3', 'moving_average_10day_lag_1', 'moving_average_10day_lag_2', 'moving_average_10day_lag_3', 'moving_average_5day_lag_1', 'moving_average_5day_lag_2', 'moving_average_5day_lag_3', 'moving_average_20day_lag_1', 'moving_average_20day_lag_2', 'moving_average_20day_lag_3', 'moving_average_30day_lag_1', 'moving_average_30day_lag_2', 'moving_average_30day_lag_3', 'moving_average_15day_lag_1', 'moving_average_15day_lag_2', 'moving_average_15day_lag_3', 'close_qqq_lag_1', 'close_qqq_lag_2', 'close_qqq_lag_3', 'adj_close_qqq_lag_1', 'adj_close_qqq_lag_2', 'adj_close_qqq_lag_3']


In [6]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Models to be used
models = {
    "Random Forest": RandomForestRegressor(random_state=42),
    # "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Hyperparameters for tuning (simplified for rolling window example)
parameters = {
    "Random Forest": {'n_estimators': 30, 'max_depth': 7, 'min_samples_leaf' : 3},
    #"Gradient Boosting": {'n_estimators': 120, 'max_depth': 3, 'learning_rate': 0.1, 'min_samples_leaf' : 3}
    # "Gradient Boosting": {'n_estimators': 98, 'max_depth': 2, 'learning_rate': 0.23333333333333334} # not better than current one
}

# Initialize dictionaries to store results
model_performance_rw_ma = {}
model_predictions_rw_ma = {}

# Create test and train data set with date index
X_train_idx = X_train_ma.copy()
# X_train_idx.set_index()
y_train_idx = pd.DataFrame(y_train_ma).copy()

X_test_idx = X_test_ma.copy()
y_test_idx = pd.DataFrame(y_test_ma).copy()

# Implementing rolling window forecasting
for model_name in models:
    print('Model:  ', model_name)
    model = models[model_name]
    params = parameters[model_name]

    # Initialize storage for predictions and actuals
    predictions = []
    feature_importances = []
    # actuals = y_test_idx
    #print(actuals.head())

    # Start rolling window forecast
    for i in X_test_idx.index:

        # Scale x train:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_idx)
        X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = X_train_idx.columns)
        model.set_params(**params)
        # model.fit(X_train_scaled_df, y_train_idx)
        model.fit(X_train_scaled_df, y_train_idx.values.ravel())


        # Obtain feature importance
        importance = model.feature_importances_
        feature_importances.append(importance)

        # Scale x test for the current point:
        X_test_scaled = scaler.transform(X_test_idx.loc[[i]])
        X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = X_test_idx.columns)
    
        # Make a prediction for the next point
        next_point_prediction = model.predict(X_test_scaled_df)
        predictions.append(next_point_prediction[0])

        # Rolling feature window
        # X_test_to_train = pd.Series([X_test[i]], index=[i])
        X_test_to_train = X_test_idx.loc[[i]]
        print(X_test_to_train)
        X_train_idx = pd.concat([X_train_idx, X_test_to_train])
        X_train_idx = X_train_idx.iloc[1:]

        # Rolling target variable window
        y_test_to_train = y_test_idx.loc[[i]]
        print(y_test_to_train)
        y_train_idx = pd.concat([y_train_idx, y_test_to_train])
        y_train_idx = y_train_idx.iloc[1:]
    
    # Calculate performance metrics
    rmse = np.sqrt(mean_squared_error(y_test_ma, predictions))
    r2 = r2_score(y_test_ma, predictions)

    # Store results
    model_performance_rw_ma[model_name] = {'RMSE': rmse, 'R2': r2}
    model_predictions_rw_ma[model_name] = {'y_pred': predictions, 'feature_importance': feature_importances}

# Output model performance
print(model_performance_rw_ma)


Model:   Random Forest
            r_ndq_lag_1  r_ndq_lag_2  r_ndq_lag_3  Volatility_lag_1  \
Date                                                                  
2018-07-02     0.002687     0.008602    -0.013599             19.26   

            Volatility_lag_2  Volatility_lag_3  rf_rate_lag_1  rf_rate_lag_2  \
Date                                                                           
2018-07-02             20.06             19.67           1.93           1.93   

            rf_rate_lag_3  sentiment_score_lag_1  ...  \
Date                                              ...   
2018-07-02           1.93              19.547577  ...   

            moving_average_30day_lag_3  moving_average_15day_lag_1  \
Date                                                                 
2018-07-02                   22.803948                   21.953136   

            moving_average_15day_lag_2  moving_average_15day_lag_3  \
Date                                                                 

In [8]:
rf_ma_predictions = model_predictions_rw_ma['Random Forest']['y_pred']
rf_ma_pred_series = pd.Series(rf_ma_predictions)
rf_ma_pred_series.index = pd.to_datetime(X_test_ma.index)
# 0.25/75, 1.24, 44 trading days / 0.33/66, many inputs

sharpe_rf_ma, df_calc_rf_ma, test_df_rf_ma, upper_rf_ma, lower_rf_ma = sharpe_correct_5day(rf_ma_pred_series, 'moving_average_10day')
print(sharpe_rf_ma)
test_df_rf_ma['trading day'].sum()

368    2.014898
Name: portfolio_value, dtype: object


83

## Implement a for loop to test several hyperparameter combinations

In [9]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from itertools import product

# Models to be used
models = {
    "Random Forest": RandomForestRegressor(random_state=42),
    # "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Hyperparameters for tuning
n_estimators = [30, 40, 50]
max_depth = [None, 5, 10]
min_samples_leaf = [1, 3, 5]

# Initialize dictionaries to store results
model_performance_rw_ma = {}
model_predictions_rw_ma = {}

# Create test and train data set with date index
X_train_idx = X_train_ma.copy()
y_train_idx = pd.DataFrame(y_train_ma).copy()
X_test_idx = X_test_ma.copy()
y_test_idx = pd.DataFrame(y_test_ma).copy()

# Implementing rolling window forecasting
for model_name in models:
    print('Model:  ', model_name)
    model = models[model_name]

    # Initialize storage for performance metrics
    best_rmse = float('inf')
    best_r2 = -float('inf')
    best_params = None
    best_predictions = None
    best_feature_importances = None

    # Iterate through all combinations of hyperparameters
    for params in product(n_estimators, max_depth, min_samples_leaf):
        param_dict = {
            'n_estimators': params[0],
            'max_depth': params[1],
            'min_samples_leaf': params[2]
        }
        
        # Initialize storage for predictions and actuals
        predictions = []
        feature_importances = []

        # Start rolling window forecast
        for i in X_test_idx.index:

            # Scale x train:
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_idx)
            X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_idx.columns)
            model.set_params(**param_dict)
            model.fit(X_train_scaled_df, y_train_idx.values.ravel())

            # Obtain feature importance
            importance = model.feature_importances_
            feature_importances.append(importance)

            # Scale x test for the current point:
            X_test_scaled = scaler.transform(X_test_idx.loc[[i]])
            X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_idx.columns)

            # Make a prediction for the next point
            next_point_prediction = model.predict(X_test_scaled_df)
            predictions.append(next_point_prediction[0])

            # Rolling feature window
            X_test_to_train = X_test_idx.loc[[i]]
            X_train_idx = pd.concat([X_train_idx, X_test_to_train])
            X_train_idx = X_train_idx.iloc[1:]

            # Rolling target variable window
            y_test_to_train = y_test_idx.loc[[i]]
            y_train_idx = pd.concat([y_train_idx, y_test_to_train])
            y_train_idx = y_train_idx.iloc[1:]

        # Calculate performance metrics
        rmse = np.sqrt(mean_squared_error(y_test_ma, predictions))
        r2 = r2_score(y_test_ma, predictions)

        # Store results if performance is better
        if rmse < best_rmse and r2 > best_r2:
            best_rmse = rmse
            best_r2 = r2
            best_params = param_dict
            best_predictions = predictions
            best_feature_importances = feature_importances

    # Store best results for the model
    model_performance_rw_ma[model_name] = {'RMSE': best_rmse, 'R2': best_r2, 'Best Params': best_params}
    model_predictions_rw_ma[model_name] = {'y_pred': best_predictions, 'feature_importance': best_feature_importances}

# Output model performance
print(model_performance_rw_ma)


Model:   Random Forest
{'Random Forest': {'RMSE': 0.18975805398015316, 'R2': 0.9948238905728287, 'Best Params': {'n_estimators': 40, 'max_depth': None, 'min_samples_leaf': 1}}}


In [10]:
rf_ma_predictions = model_predictions_rw_ma['Random Forest']['y_pred']
rf_ma_pred_series = pd.Series(rf_ma_predictions)
rf_ma_pred_series.index = pd.to_datetime(X_test_ma.index)
# 0.25/75, 1.24, 44 trading days / 0.33/66, many inputs

sharpe_rf_ma, df_calc_rf_ma, test_df_rf_ma, upper_rf_ma, lower_rf_ma = sharpe_correct_5day(rf_ma_pred_series, 'moving_average_10day')
print(sharpe_rf_ma)
test_df_rf_ma['trading day'].sum()

368    4.333296
Name: portfolio_value, dtype: object


71