In [None]:
import pandas as pd
import os
import glob
import re 
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import mean_squared_error
from math import sqrt
from pathlib import Path
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping

# Load Data

In [None]:
# Get root directory
def get_root_dir() -> Path:
    """Return the resolved parent of the current working directory.

    The notebook treats this directory as the root under which 'data' lives.
    """
    working_dir = Path(os.getcwd()).resolve()
    return working_dir.parent

# Get data directory
def get_data():
    """Return the path of the 'data' folder directly under the root directory."""
    root = get_root_dir()
    data_dir = root / 'data'
    return data_dir

# Concat all embedding files
def concat_files(folder_path, file_pattern):
    """Read every parquet file matching a glob pattern and stack them row-wise.

    Files are ordered by the integer that follows ``doc_`` in the *file name*
    (not the full path, so a parent directory such as ``doc_3/`` cannot
    influence the ordering).

    Parameters
    ----------
    folder_path : str or Path
        Folder that contains the files.
    file_pattern : str
        Glob pattern, relative to `folder_path` (e.g. ``'doc_*'``).

    Returns
    -------
    pandas.DataFrame
        The files concatenated along axis 0, in ascending ``doc_<n>`` order.

    Raises
    ------
    ValueError
        If no file matches the pattern, or a matched file name does not
        contain ``doc_<number>``.
    """
    full_pattern = f'{folder_path}/{file_pattern}'
    file_list = glob.glob(full_pattern)
    if not file_list:
        raise ValueError(f'No files match {full_pattern!r}')

    def doc_number(path):
        # Only the file name is inspected; directories may also contain 'doc_<n>'
        found = re.search(r'doc_(\d+)', os.path.basename(path))
        if found is None:
            raise ValueError(f'Cannot extract a doc_<number> from file name: {path!r}')
        return int(found.group(1))

    ordered_files = sorted(file_list, key=doc_number)
    return pd.concat([pd.read_parquet(file) for file in ordered_files], axis=0)

In [None]:
# US Daily News Index
us_news = pd.read_csv(get_data() / 'All_Daily_Policy_Data.csv')

In [None]:
# Categorical EPU Data
epu_cat = pd.read_excel(get_data() / 'Categorical_EPU_Data.xlsx')

In [None]:
# Conference Call Embeddings
cc_emb = pd.read_parquet(get_data() / 'cc' / 'doc.pq')

In [None]:
# NYT Embeddings
nyt_emb = concat_files(get_data() / 'nyt', 'doc_*')

In [None]:
# WSJ Embeddings
wsj_emb = concat_files(get_data() / 'wsj', 'doc_*')

# Format Data

##### Format data into a multi-index ('permno', 'date')

In [None]:
# Function to create a multi-index ('permno', 'date') for a dataframe that has only a 'date' index
# Parameters: 
    # data: the dataframe with only a 'date' index
    # stock: a list of stock identifiers (i.e., permnos, tickers, etc.)
def create_multi_index(data, stock):
    """Repeat a date-indexed frame once per stock under a ('permno', 'date') index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index holds the dates.
    stock : iterable
        Stock identifiers; each one receives a full copy of `data`'s rows.

    Returns
    -------
    pandas.DataFrame
        Frame with `data`'s columns and a MultiIndex named ('permno', 'date')
        built as the product stock x data.index. An empty `stock` yields an
        empty frame with the same columns.
    """
    stock = list(stock)
    # Tile the raw values instead of concatenating len(stock) frame copies;
    # this also works when `stock` is empty.
    factor_values = np.tile(data.values, (len(stock), 1))
    multi_index = pd.MultiIndex.from_product([stock, data.index], names=['permno', 'date'])
    return pd.DataFrame(factor_values, columns=data.columns, index=multi_index)

# Function to get the stock list (i.e., permnos, tickers, etc.) from a multi-index dataframe ('stock', 'date')
def get_stock_list(data):
    """Return the group keys of the first index level of `data` as a list.

    The keys come from grouping `data` by the name of its first index level,
    so each key appears once, in the order the groupby yields them.
    """
    first_level = data.index.names[0]
    stocks = []
    for key, _ in data.groupby(first_level, group_keys=False):
        stocks.append(key)
    return stocks

#### Get Stock List

In [None]:
# Get stock list from Conference Call Embeddings:
stock_list_cc = get_stock_list(cc_emb)

#### Daily US News Index

In [None]:
# View Present Data
us_news.head(5)

In [None]:
# Set Date Column
# Build a datetime index from the year/month/day columns, keep only the
# daily policy index and give it a shorter column name
us_news = (
    us_news
    .assign(date=pd.to_datetime(us_news[['year', 'month', 'day']]))
    .set_index('date')
    .loc[:, ['daily_policy_index']]
    .rename(columns={'daily_policy_index': 'daily_pol'})
)

In [None]:
# Create Multindex
us_news_multi = create_multi_index(us_news, stock_list_cc)

In [None]:
# View Formatted Data
us_news.head(5)

In [None]:
# View Formatted Data
us_news_multi.head(5)

#### Categorical EPU Data

In [None]:
# View Present Data
epu_cat.head(5)

In [None]:
# Rename columns
column_names = ['date', 'epu', 'mon_pol', 'fisc_pol', 'tax', 'gov_spend', 'health_care', 
                'nat_sec', 'ent_prog', 'reg', 'fin_reg', 'trade_pol', 'debt']
epu_cat.columns = column_names

In [None]:
# View Last Row
epu_cat.tail(3)

In [None]:
# Remove Last Row
epu_cat = epu_cat.iloc[:-1]

In [None]:
# Convert date column to pd.datetime and use it as the index.
# `assign` builds a new frame, so nothing is written into a frame that may be a
# slice of another one (which can raise pandas' SettingWithCopyWarning).
epu_cat = epu_cat.assign(date=pd.to_datetime(epu_cat['date'])).set_index('date')

In [None]:
# Create Multindex
epu_cat_multi = create_multi_index(epu_cat, stock_list_cc)

In [None]:
# View Formatted Data
epu_cat.head(5)

In [None]:
# View Formatted Data
epu_cat_multi.head(5)

### Conference Call Embeddings

In [None]:
# View Present Data
cc_emb.head(5)

In [None]:
# Set index to (permno)
cc_emb = cc_emb.reset_index(level='fid', drop=True).reset_index(level='date')

In [None]:
# Set date column to pd.datetime
# The round trip through a '%Y-%m-%d' string drops the time-of-day component,
# so every value ends up as a date-only timestamp.
cc_emb['date'] = pd.to_datetime(cc_emb['date'])
cc_emb['date'] = cc_emb['date'].dt.strftime('%Y-%m-%d')
cc_emb['date'] = pd.to_datetime(cc_emb['date'])

In [None]:
# Set index to (permno, date) 
cc_emb = cc_emb.reset_index().set_index(['permno', 'date']).sort_index(level=['permno', 'date'])

In [None]:
# For duplicate (permno, date) indices, keep only the last occurrence (rows are sorted by permno, then date)
cc_emb = cc_emb.loc[~cc_emb.index.duplicated(keep='last')]

In [None]:
# View Formatted Data
cc_emb.head(5)

### NYT Embeddings

In [None]:
# View Present Data 
nyt_emb.head(5)

In [None]:
# Set index to date
nyt_emb['date'] = pd.to_datetime(nyt_emb['date'])
nyt_emb = nyt_emb.set_index('date')

In [None]:
# Remove duplicate date indices, keeping the last occurrence of each date.
# The explicit copy makes nyt_emb an independent frame, so the column updates
# performed on it afterwards do not raise pandas' SettingWithCopyWarning.
nyt_emb = nyt_emb.loc[~nyt_emb.index.duplicated(keep='last')].copy()

In [None]:
# Embeddings Columns
nyt_emb_col = nyt_emb.filter(regex='^c').columns
# Calculate the average of the embeddings
nyt_emb[nyt_emb_col] = nyt_emb[nyt_emb_col].div(nyt_emb['tcount'], axis=0)

In [None]:
# Rename embedding columns from the 'c' prefix to the 'nyt' prefix.
# Reassigning instead of inplace=True keeps the cell free of in-place mutation.
rename_dict_nyt = {col: 'nyt' + col[1:] for col in nyt_emb.columns if col.startswith('c')}
nyt_emb = nyt_emb.rename(columns=rename_dict_nyt)

In [None]:
# View Formatted Data
nyt_emb.head(5)

### WSJ Embeddings

In [None]:
# View Present Data 
wsj_emb.head(5)

In [None]:
# Set index to date
wsj_emb['date'] = pd.to_datetime(wsj_emb['date'])
wsj_emb = wsj_emb.set_index('date')

In [None]:
# Remove duplicate date indices, keeping the last occurrence of each date.
# The explicit copy makes wsj_emb an independent frame, so the column updates
# performed on it afterwards do not raise pandas' SettingWithCopyWarning.
wsj_emb = wsj_emb.loc[~wsj_emb.index.duplicated(keep='last')].copy()

In [None]:
# Embeddings Columns
wsj_emb_col = wsj_emb.filter(regex='^c').columns
# Calculate the average of the embeddings
wsj_emb[wsj_emb_col] = wsj_emb[wsj_emb_col].div(wsj_emb['tcount'], axis=0)

In [None]:
# Rename embedding columns from the 'c' prefix to the 'wsj' prefix.
# Reassigning instead of inplace=True keeps the cell free of in-place mutation.
rename_dict_wsj = {col: 'wsj' + col[1:] for col in wsj_emb.columns if col.startswith('c')}
wsj_emb = wsj_emb.rename(columns=rename_dict_wsj)

In [None]:
# View Formatted Data
wsj_emb.head(5)

# Merge Data

In [None]:
# # Merge Conference Call Embeddings with US News Daily Index
# all_data = pd.merge(cc_emb, us_news, left_index=True, right_index=True, how='left')
# # Create a temporary YYYY-MM index to merge with monthly interval Data
# all_data['year_month'] = all_data.index.get_level_values('date').strftime('%Y-%m')
# epu_cat['year_month'] = epu_cat.index.get_level_values('date').strftime('%Y-%m')
# # Set index to (permno, year_month)
# all_data = all_data.reset_index().set_index(['permno', 'year_month'])
# epu_cat = epu_cat.reset_index().set_index(['permno', 'year_month'])
# # Drop the date column in Categorical EPU Data to prevent merging _x and _y
# epu_cat = epu_cat.drop('date', axis=1)
# # Merge all_data with Categorical EPU Data
# all_data = all_data.merge(epu_cat, left_index=True, right_index=True, how='left')
# # Remove temporary YYYY-MM index and reformat to (permno, date) index
# all_data = all_data.reset_index().drop('year_month', axis=1).set_index(['permno', 'date'])

In [None]:
# Merge nyt embeddings with daily EPU
nyt_merge = pd.merge(us_news, nyt_emb, left_index=True, right_index=True, how='left').dropna()

In [None]:
# Merge wsj embeddings with daily EPU
wsj_merge = pd.merge(us_news, wsj_emb, left_index=True, right_index=True, how='left').dropna()

# Plotting Data

In [None]:
# Plot scatter
def scatter(actual_values, predictions, x_axis, y_axis, scale):
    """Draw a scatter plot of `predictions` against `actual_values`.

    When `scale` equals False, a plain scatter titled 'Scatter' is shown.
    Otherwise the plot uses an 8x8 figure, identical x/y limits spanning both
    inputs, an equal aspect ratio and a red dashed 45-degree reference line.

    `x_axis` and `y_axis` are the axis labels.
    """
    equal_scale = not (scale == False)
    if equal_scale:
        plt.figure(figsize=(8, 8))
    plt.scatter(actual_values, predictions)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    if not equal_scale:
        plt.title('Scatter')
    else:
        plt.title('Scatter Plot with Equal Scale and 45 Degree Line')
        # One shared range, taken over both inputs, for the two axes
        pooled = np.concatenate([actual_values, predictions])
        low, high = pooled.min(), pooled.max()
        plt.xlim(low, high)
        plt.ylim(low, high)
        # 45-degree reference line (red, dashed)
        plt.plot([low, high], [low, high], 'r--')
        plt.gca().set_aspect('equal', adjustable='box')
    plt.show()
    plt.close()

# Plot time-series
def time_series(actual_values, predictions):
    """Plot actual and predicted values as two labelled lines on one wide figure.

    `predictions` is drawn against `actual_values.index`, so both lines share
    the same x positions.
    """
    plt.figure(figsize=(40, 10))
    x_positions = actual_values.index
    plt.plot(actual_values, label='Actual Values')
    plt.plot(x_positions, predictions, label='Predicted Values')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.title('My Combined Plot')
    plt.legend()
    plt.show()
    plt.close()

### NYT

In [None]:
# Plot Mean of embeddings
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns
nyt_merge[nyt_beta_col].mean().hist(bins=200)

In [None]:
# Plot STD of embeddings
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns
nyt_merge[nyt_beta_col].std().hist(bins=200)

### WSJ

In [None]:
# Plot Mean of embeddings
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns
wsj_merge[wsj_beta_col].mean().hist(bins=200)

In [None]:
# Plot STD of embeddings
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns
wsj_merge[wsj_beta_col].std().hist(bins=200)

# Insample OLS

In [None]:
# Function to run standard OLS
def exec_ols(data, beta_cols, y_col):
    """Fit an in-sample OLS regression of `y_col` on `beta_cols` plus a constant.

    Prints a custom R-squared whose total sum of squares is not centred on the
    mean, then shows the scatter and time-series plots of the fitted values.

    Returns
    -------
    tuple
        (fitted statsmodels results, target series, in-sample predictions).
    """
    design = sm.add_constant(data[beta_cols])
    target = data[y_col]
    model = sm.OLS(target, design).fit()
    predictions = model.predict(design)
    # Custom R2 = 1 - SSE / sum(Y^2), i.e. the total sum of squares is uncentred
    residual_ss = np.sum((target - predictions) ** 2)
    uncentred_ss = np.sum(target ** 2)
    MyR2 = 1 - (residual_ss / uncentred_ss)
    print("Custom R-squared:", MyR2)
    scatter(target, predictions, 'Actual Value', 'Predicted Value', True)
    time_series(target, predictions)
    return model, target, predictions

### NYT

In [None]:
# View Data
nyt_merge.head(5)

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns

In [None]:
# Run Regression
nyt_standard_model, Y, predictions = exec_ols(data=nyt_merge, beta_cols=nyt_beta_col, y_col='daily_pol')

#### Aggregate by Month

In [None]:
y_month = Y.to_frame()
pred_month = predictions.to_frame()
y_month = y_month.resample('M').mean()
pred_month = pred_month.resample('M').mean()

In [None]:
# Calculate SSE (Sum of Squared Errors)
SSE = np.sum((y_month.values - pred_month.values) ** 2)
# Calculate TSS (Total Sum of Squares) without centering around the mean
TSS = np.sum(y_month.values ** 2)
# Calculate custom R2
MyR2 = 1 - (SSE / TSS)
# Print out custom R2
print("Custom R-squared:", MyR2)

In [None]:
scatter(y_month, pred_month, 'Actual Value', 'Predicted Value', True)
time_series(y_month, pred_month)

### WSJ

In [None]:
# View Data
wsj_merge.head(5)

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
wsj_standard_model, Y, predictions = exec_ols(data=wsj_merge, beta_cols=wsj_beta_col, y_col='daily_pol')

#### Aggregate by Month

In [None]:
y_month = Y.to_frame()
pred_month = predictions.to_frame()
y_month = y_month.resample('M').mean()
pred_month = pred_month.resample('M').mean()

In [None]:
# Calculate SSE (Sum of Squared Errors)
SSE = np.sum((y_month.values - pred_month.values) ** 2)
# Calculate TSS (Total Sum of Squares) without centering around the mean
TSS = np.sum(y_month.values ** 2)
# Calculate custom R2
MyR2 = 1 - (SSE / TSS)
# Print out custom R2
print("Custom R-squared:", MyR2)

In [None]:
scatter(y_month, pred_month, 'Actual Value', 'Predicted Value', True)
time_series(y_month, pred_month)

# Split OLS (Train/Test)

In [None]:
# Execute predictions and calculate R^2
def exec_ols_test(model, data, beta_cols, y_col):
    """Score an already-fitted OLS model on `data` and plot its predictions.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted model; it is called on `beta_cols` with a constant column added.
    data : pandas.DataFrame
        Evaluation data containing `beta_cols` and `y_col`.
    beta_cols : list-like
        Regressor column names.
    y_col : str
        Target column name.

    Returns
    -------
    The predictions for `data`.

    Prints the custom (uncentred) R-squared, the standard R-squared, the MSE
    and the RMSE, then shows the scatter and time-series plots.
    """
    X_test = sm.add_constant(data[beta_cols])
    predictions = model.predict(X_test)
    actual_values = data[y_col]
    mse = mean_squared_error(actual_values, predictions)
    rmse = sqrt(mse)
    # The residual sum of squares is shared by both R-squared variants;
    # np.sum is vectorised, unlike the builtin sum over a Series.
    SSE = np.sum((actual_values - predictions) ** 2)
    # Standard R-squared: total sum of squares centred on the mean
    ss_total = np.sum((actual_values - actual_values.mean()) ** 2)
    r_squared = 1 - (SSE / ss_total)
    # Custom R-squared: total sum of squares without centring around the mean
    TSS = np.sum(actual_values ** 2)
    MyR2 = 1 - (SSE / TSS)
    print("Custom R-squared:", MyR2)
    print("R-squared:", r_squared)
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)
    # Plot graphs
    scatter(actual_values, predictions, 'Actual Value', 'Predicted Value', True)
    time_series(actual_values, predictions)
    return predictions

### NYT

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns

In [None]:
# Split data into Train and Test
split = 0.5
nyt_train = nyt_merge.iloc[:int(len(nyt_merge)*split)]
nyt_test = nyt_merge.iloc[int(len(nyt_merge)*split):]

In [None]:
# Run Regression
# exec_ols returns (model, Y, predictions); keep only the fitted model so that
# it can be handed to exec_ols_test, which calls model.predict
nyt_train_model = exec_ols(data=nyt_train, beta_cols=nyt_beta_col, y_col='daily_pol')[0]

In [None]:
nyt_pred = exec_ols_test(model=nyt_train_model, data=nyt_test, beta_cols=nyt_beta_col, y_col='daily_pol')

### WSJ

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
# Split data into Train and Test
split = 0.5
wsj_train = wsj_merge.iloc[:int(len(wsj_merge)*split)]
wsj_test = wsj_merge.iloc[int(len(wsj_merge)*split):]

In [None]:
# Run Regression
# exec_ols returns (model, Y, predictions); keep only the fitted model so that
# it can be handed to exec_ols_test, which calls model.predict
wsj_train_model = exec_ols(data=wsj_train, beta_cols=wsj_beta_col, y_col='daily_pol')[0]

In [None]:
wsj_pred = exec_ols_test(model=wsj_train_model, data=wsj_test, beta_cols=wsj_beta_col, y_col='daily_pol')

# L1 Split OLS (Train/Test)

In [None]:
# Function to run a Lasso (L1-regularised) regression on a positional train/test split
def exec_l1_ols(data, beta_cols, y_col, lasso_alpha, split):
    """Fit a Lasso model on the leading rows of `data` and evaluate it on the rest.

    The first ``int(len(data) * split)`` rows are the training set; the
    remaining rows are the test set. Prints the custom (uncentred) R-squared,
    the estimator's own R-squared, the MSE and the RMSE on the test set, then
    shows the scatter and time-series plots.

    Returns
    -------
    tuple
        (custom R-squared, test targets, test predictions).
    """
    cutoff = int(len(data) * split)
    features, target = data[beta_cols], data[y_col]
    X_train, X_test = features.iloc[:cutoff], features.iloc[cutoff:]
    Y_train, Y_test = target.iloc[:cutoff], target.iloc[cutoff:]

    lasso = Lasso(alpha=lasso_alpha)
    lasso.fit(X_train, Y_train)
    r_squared = lasso.score(X_test, Y_test)
    predictions = lasso.predict(X_test)

    mse = mean_squared_error(Y_test, predictions)
    rmse = np.sqrt(mse)

    # Custom R2 = 1 - SSE / sum(Y^2): the total sum of squares is not centred
    residual_ss = np.sum((Y_test - predictions) ** 2)
    uncentred_ss = np.sum(Y_test ** 2)
    MyR2 = 1 - (residual_ss / uncentred_ss)

    for label, metric in (
        ("Custom R-squared:", MyR2),
        ("R-squared:", r_squared),
        ("Mean Squared Error:", mse),
        ("Root Mean Squared Error:", rmse),
    ):
        print(label, metric)

    scatter(Y_test, predictions, 'Actual Value', 'Predicted Value', True)
    time_series(Y_test, predictions)
    return MyR2, Y_test, predictions

### NYT

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns

In [None]:
# Run Regression
# Candidate alphas: 0, then 0.5, 1.0, ... up to and including 15.5
grid = [0] + [0.5 * step for step in range(1, 32)]
value_collect, r2_collect = [], []
for value in grid:
    print('-'*60)
    print(value)
    r2, Y, predictions = exec_l1_ols(
        data=nyt_merge, beta_cols=nyt_beta_col, y_col='daily_pol', lasso_alpha=value, split=0.5
    )
    # Record the alpha together with its custom R-squared
    value_collect.append(value)
    r2_collect.append(r2)

In [None]:
# Plot Distribution
scatter(value_collect, r2_collect, 'alpha', 'R2', False)

### WSJ

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
# Run Regression
# Candidate alphas: 0, then 0.5, 1.0, ... up to and including 15.5
grid = [0] + [0.5 * step for step in range(1, 32)]
value_collect, r2_collect = [], []
for value in grid:
    print('-'*60)
    print(value)
    r2, Y, predictions = exec_l1_ols(
        data=wsj_merge, beta_cols=wsj_beta_col, y_col='daily_pol', lasso_alpha=value, split=0.5
    )
    # Record the alpha together with its custom R-squared
    value_collect.append(value)
    r2_collect.append(r2)

In [None]:
# Plot Distribution
scatter(value_collect, r2_collect, 'alpha', 'R2', False)

# L2 Split OLS (Train/Test)

In [None]:
# Function to run a Ridge (L2-regularised) regression on a positional train/test split
def exec_l2_ols(data, beta_cols, y_col, ridge_alpha, split):
    """Fit a Ridge model on the leading rows of `data` and evaluate it on the rest.

    The first ``int(len(data) * split)`` rows are the training set; the
    remaining rows are the test set. Prints the custom (uncentred) R-squared,
    the estimator's own R-squared, the MSE and the RMSE on the test set, then
    shows the scatter and time-series plots.

    Returns
    -------
    tuple
        (custom R-squared, test targets, test predictions).
    """
    cutoff = int(len(data) * split)
    features, target = data[beta_cols], data[y_col]
    X_train, X_test = features.iloc[:cutoff], features.iloc[cutoff:]
    Y_train, Y_test = target.iloc[:cutoff], target.iloc[cutoff:]

    ridge = Ridge(alpha=ridge_alpha)
    ridge.fit(X_train, Y_train)
    r_squared = ridge.score(X_test, Y_test)
    predictions = ridge.predict(X_test)

    mse = mean_squared_error(Y_test, predictions)
    rmse = np.sqrt(mse)

    # Custom R2 = 1 - SSE / sum(Y^2): the total sum of squares is not centred
    residual_ss = np.sum((Y_test - predictions) ** 2)
    uncentred_ss = np.sum(Y_test ** 2)
    MyR2 = 1 - (residual_ss / uncentred_ss)

    for label, metric in (
        ("Custom R-squared:", MyR2),
        ("R-squared:", r_squared),
        ("Mean Squared Error:", mse),
        ("Root Mean Squared Error:", rmse),
    ):
        print(label, metric)

    scatter(Y_test, predictions, 'Actual Value', 'Predicted Value', True)
    time_series(Y_test, predictions)
    return MyR2, Y_test, predictions

### NYT

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns

In [None]:
# Run Regression
# Candidate alphas: 0, then 0.5, 1.0, ... up to and including 30.5
grid = [0] + [0.5 * step for step in range(1, 62)]
value_collect, r2_collect = [], []
for value in grid:
    print('-'*60)
    print(value)
    r2, Y, predictions = exec_l2_ols(
        data=nyt_merge, beta_cols=nyt_beta_col, y_col='daily_pol', ridge_alpha=value, split=0.5
    )
    # Record the alpha together with its custom R-squared
    value_collect.append(value)
    r2_collect.append(r2)

In [None]:
# Plot Distribution
scatter(value_collect, r2_collect, 'alpha', 'R2', False)

### WSJ

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
# Run Regression
# Candidate alphas: 0, then 0.5, 1.0, ... up to and including 30.5
grid = [0] + [0.5 * step for step in range(1, 62)]
value_collect, r2_collect = [], []
for value in grid:
    print('-'*60)
    print(value)
    r2, Y, predictions = exec_l2_ols(
        data=wsj_merge, beta_cols=wsj_beta_col, y_col='daily_pol', ridge_alpha=value, split=0.5
    )
    # Record the alpha together with its custom R-squared
    value_collect.append(value)
    r2_collect.append(r2)

In [None]:
# Plot Distribution
scatter(value_collect, r2_collect, 'alpha', 'R2', False)

#### Aggregate by Month

##### Daily to Month

In [None]:
r2, Y, predictions = exec_l2_ols(data=wsj_merge, beta_cols=wsj_beta_col, y_col='daily_pol', ridge_alpha=0.5, split=0.5)

In [None]:
y_month = Y.to_frame()
pred_month = pd.DataFrame({'predictions':predictions}, index=Y.index)
y_month = y_month.resample('M').mean()
pred_month = pred_month.resample('M').mean()

In [None]:
# Calculate SSE (Sum of Squared Errors)
SSE = np.sum((y_month.values - pred_month.values) ** 2)
# Calculate TSS (Total Sum of Squares) without centering around the mean
TSS = np.sum((y_month.values)** 2)
# Calculate custom R2
MyR2 = 1 - (SSE / TSS)
# Print out custom R2
print("Custom R-squared:", MyR2)

In [None]:
scatter(y_month, pred_month, 'Actual Value', 'Predicted Value', True)
time_series(y_month, pred_month)

##### Month

In [None]:
wsj_merge_month = wsj_merge.resample('M').mean()

In [None]:
r2, Y, predictions = exec_l2_ols(data=wsj_merge_month, beta_cols=wsj_beta_col, y_col='daily_pol', ridge_alpha=0.5, split=0.5)

# Elastic Net OLS (Train/Test)

In [None]:
# Function to run an Elastic Net regression on a positional train/test split
def exec_en_ols(data, beta_cols, y_col, en_alpha, l1_ratio, split):
    """Fit an ElasticNet model on the leading rows of `data` and evaluate it on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `beta_cols` and `y_col`.
    beta_cols : list-like
        Regressor column names.
    y_col : str
        Target column name.
    en_alpha : float
        Passed to ``ElasticNet(alpha=...)``.
    l1_ratio : float
        Passed to ``ElasticNet(l1_ratio=...)``.
    split : float
        Fraction of rows (from the top) used for training.

    Returns
    -------
    The custom R-squared on the test rows, computed with a total sum of
    squares that is not centred on the mean.
    """
    # Setup LR Data
    X = data[beta_cols]
    Y = data[y_col]
    # The split point comes from the frame that was passed in, not from the
    # global nyt_merge, so frames of any length are split at the right row.
    cutoff = int(len(data) * split)
    X_train = X.iloc[:cutoff]
    X_test = X.iloc[cutoff:]
    Y_train = Y.iloc[:cutoff]
    Y_test = Y.iloc[cutoff:]
    # Run LR
    elastic_net = ElasticNet(alpha=en_alpha, l1_ratio=l1_ratio)
    elastic_net.fit(X_train, Y_train)
    r_squared = elastic_net.score(X_test, Y_test)
    # Make predictions
    predictions = elastic_net.predict(X_test)
    # Calculate Mean Squared Error
    mse = mean_squared_error(Y_test, predictions)
    rmse = np.sqrt(mse)
    # Custom R-squared
    # Calculate SSE (Sum of Squared Errors)
    SSE = np.sum((Y_test - predictions) ** 2)
    # Calculate TSS (Total Sum of Squares) without centering around the mean
    TSS = np.sum(Y_test ** 2)
    # Calculate custom R2
    MyR2 = 1 - (SSE / TSS)
    print("Custom R-squared:", MyR2)
    print("R-squared:", r_squared)
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)
    # Plot graphs
    scatter(Y_test, predictions, 'Actual Value', 'Predicted Value', True)
    time_series(Y_test, predictions)
    return MyR2

### NYT

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge.filter(regex='^nyt').columns

In [None]:
# Run Regression
alpha_values = [0.05, 0.10, 0.25, 0.50, 1.0, 2.0, 5.0, 10.0]
l1_ratio_values = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9]
grid = [(alpha, l1_ratio) for alpha in alpha_values for l1_ratio in l1_ratio_values]
for value in grid:
    print(f"Alpha {value[0]:^10} | l1_ratio{value[1]:^10}")
    r_squared = exec_en_ols(data=nyt_merge, beta_cols=nyt_beta_col, y_col='daily_pol', en_alpha=value[0], l1_ratio=value[1], split=0.5)

### WSJ

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
# Run Regression
alpha_values = [0.05, 0.10, 0.25, 0.50, 1.0, 2.0, 5.0, 10.0]
l1_ratio_values = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9]
grid = [(alpha, l1_ratio) for alpha in alpha_values for l1_ratio in l1_ratio_values]
for value in grid:
    print(f"Alpha {value[0]:^10} | l1_ratio{value[1]:^10}")
    r_squared = exec_en_ols(data=wsj_merge, beta_cols=wsj_beta_col, y_col='daily_pol', en_alpha=value[0], l1_ratio=value[1], split=0.5)

# Detrending Y-Value

In [None]:
nyt_merge_detrend = nyt_merge.copy(deep=True)

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge_detrend.filter(regex='^nyt').columns

In [None]:
# Detrend on a rolling 5 day basis
nyt_merge_detrend['daily_pol_5'] = nyt_merge_detrend['daily_pol'] - nyt_merge_detrend['daily_pol'].rolling(5).mean()
# Detrend on a rolling 60 day basis
nyt_merge_detrend['daily_pol_60'] = nyt_merge_detrend['daily_pol'] - nyt_merge_detrend['daily_pol'].rolling(60).mean()

In [None]:
nyt_merge_detrend = nyt_merge_detrend.dropna()

In [None]:
# Run Regression
# Candidate alphas: 0, then 0.5, 1.0, ... up to and including 30.5
grid = [0] + [0.5 * step for step in range(1, 62)]
value_collect = []
r2_collect = []
for value in grid:
    print('-'*60)
    print(value)
    # exec_l2_ols returns (custom R2, Y_test, predictions); only the R2 is
    # collected so that r2_collect holds plain numbers for the scatter plot
    r2 = exec_l2_ols(data=nyt_merge_detrend, beta_cols=nyt_beta_col, y_col='daily_pol_60', ridge_alpha=value, split=0.5)[0]
    value_collect.append(value)
    r2_collect.append(r2)

In [None]:
# Plot Distribution
scatter(value_collect, r2_collect, 'alpha', 'R2', False)

# Detrending X-Value and Y-Value

In [None]:
nyt_merge_detrend = nyt_merge.copy(deep=True)

In [None]:
# Get Beta Columns
nyt_beta_col = nyt_merge_detrend.filter(regex='^nyt').columns

In [None]:
# Detrend on a rolling 5 day basis
nyt_merge_detrend['daily_pol_5'] = nyt_merge_detrend['daily_pol'] - nyt_merge_detrend['daily_pol'].rolling(5).mean()
# Detrend on a rolling 60 day basis
nyt_merge_detrend['daily_pol_60'] = nyt_merge_detrend['daily_pol'] - nyt_merge_detrend['daily_pol'].rolling(60).mean()

In [None]:
# Subtract the 5-row rolling mean from every embedding column in one step
embedding_block = nyt_merge_detrend[nyt_beta_col]
nyt_merge_detrend[nyt_beta_col] = embedding_block - embedding_block.rolling(5).mean()

In [None]:
nyt_merge_detrend = nyt_merge_detrend.dropna()

In [None]:
# Run Regression
# Candidate alphas: 0, then 0.5, 1.0, ... up to and including 30.5
grid = [0] + [0.5 * step for step in range(1, 62)]
value_collect = []
r2_collect = []
for value in grid:
    print('-'*60)
    print(value)
    # exec_l2_ols returns (custom R2, Y_test, predictions); only the R2 is
    # collected so that r2_collect holds plain numbers for the scatter plot
    r2 = exec_l2_ols(data=nyt_merge_detrend, beta_cols=nyt_beta_col, y_col='daily_pol_5', ridge_alpha=value, split=0.5)[0]
    value_collect.append(value)
    r2_collect.append(r2)

In [None]:
# Plot Distribution
scatter(value_collect, r2_collect, 'alpha', 'R2', False)

# Standard Neural Network

In [None]:
def exec_standard_nn_test(data, beta_cols, y_col, epochs, batch_size, split):
    """Train a small dense network on the leading rows of `data` and evaluate it on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `beta_cols` and `y_col`.
    beta_cols : list-like
        Input feature column names.
    y_col : str
        Target column name.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size used during training.
    split : float
        Fraction of rows (from the top) used for training.

    Returns
    -------
    tuple
        (test targets, test predictions as returned by ``nn_model.predict``).

    Prints the custom (uncentred) R-squared, the standard R-squared, the MSE
    and the RMSE on the test rows, then shows the scatter and time-series plots.
    """
    # Setup LR Data
    X = data[beta_cols]
    Y = data[y_col]
    # The split point comes from the frame that was passed in, not from the
    # global nyt_merge, so frames of any length are split at the right row.
    cutoff = int(len(data) * split)
    X_train = X.iloc[:cutoff]
    X_test = X.iloc[cutoff:]
    Y_train = Y.iloc[:cutoff]
    Y_test = Y.iloc[cutoff:]

    #NN Model
    nn_model = Sequential([
        Dense(64, activation='relu', input_shape=(len(beta_cols),)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)
    ])

    nn_model.compile(optimizer=Adam(), loss='mean_squared_error')
    # Keras' positional order is fit(x, y, batch_size, epochs); passing the two
    # by keyword prevents them from being swapped.
    nn_model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    predictions = nn_model.predict(X_test)
    flat_predictions = predictions.flatten()

    # Calculate metrics
    mse = mean_squared_error(Y_test, predictions)
    rmse = sqrt(mse)

    # R-squared (centred) and custom R-squared (total sum of squares not centred)
    ss_residual = np.sum((Y_test - flat_predictions) ** 2)
    ss_total = np.sum((Y_test - np.mean(Y_test)) ** 2)
    r_squared = 1 - (ss_residual / ss_total)
    ss_total = np.sum((Y_test) ** 2)
    cr_squared = 1 - (ss_residual / ss_total)

    # Print out metrics
    print("Custom R-squared:", cr_squared)
    print("R-squared:", r_squared)
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)

    # Plot graphs
    scatter(Y_test.values, flat_predictions, 'Actual Value', 'Predicted Value', True)
    time_series(Y_test, predictions)
    return Y_test, predictions

## WSJ

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
# Execute the test function with the neural network
Y, predictions = exec_standard_nn_test(wsj_merge, wsj_beta_col, 'daily_pol', 10, 32, 0.5)

# L1, L2, Early Stopping, Dropout, BatchNormalization Neural Network

In [None]:
def exec_standard_nn_test(data, beta_cols, y_col, epochs, batch_size, split, val):
    """Train an L2-regularised dense network with dropout and evaluate it out of sample.

    NOTE: this definition reuses the name of an earlier function in this
    notebook and therefore replaces it once the cell is run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `beta_cols` and `y_col`.
    beta_cols : list-like
        Input feature column names.
    y_col : str
        Target column name.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size used during training.
    split : float
        Fraction of rows (from the top) used for training.
    val : float
        Currently unused; accepted so existing calls keep working.
        TODO: use it to carve a validation set out of the training rows and
        enable EarlyStopping(monitor='val_loss'), which needs validation data.

    Returns
    -------
    tuple
        (test targets, test predictions as returned by ``nn_model.predict``).

    Prints the custom (uncentred) R-squared, the standard R-squared, the MSE
    and the RMSE on the test rows, then shows the scatter and time-series plots.
    """
    # Setup LR Data
    X = data[beta_cols]
    Y = data[y_col]
    # The split point comes from the frame that was passed in, not from the
    # global nyt_merge, so frames of any length are split at the right row.
    cutoff = int(len(data) * split)
    X_train = X.iloc[:cutoff]
    X_test = X.iloc[cutoff:]
    Y_train = Y.iloc[:cutoff]
    Y_test = Y.iloc[cutoff:]

    #NN Model (the only one that is trained; an earlier variant that was
    # built and immediately overwritten has been removed)
    nn_model = Sequential([
        Dense(128, activation='relu', input_shape=(len(beta_cols),), kernel_regularizer=l1_l2(l1=0, l2=1e-2)),
        Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=0, l2=1e-2)),
        Dropout(0.5),
        Dense(1)
    ])

    nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    # No validation data is supplied, so training runs for all `epochs`
    nn_model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    predictions = nn_model.predict(X_test)
    flat_predictions = predictions.flatten()

    # Calculate metrics
    mse = mean_squared_error(Y_test, predictions)
    rmse = sqrt(mse)

    # R-squared (centred) and custom R-squared (total sum of squares not centred)
    ss_residual = np.sum((Y_test - flat_predictions) ** 2)
    ss_total = np.sum((Y_test - np.mean(Y_test)) ** 2)
    r_squared = 1 - (ss_residual / ss_total)
    ss_total = np.sum((Y_test) ** 2)
    cr_squared = 1 - (ss_residual / ss_total)

    # Print out metrics
    print("Custom R-squared:", cr_squared)
    print("R-squared:", r_squared)
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)

    # Plot graphs
    scatter(Y_test.values, flat_predictions, 'Actual Value', 'Predicted Value', True)
    time_series(Y_test, predictions)
    return Y_test, predictions

## WSJ

In [None]:
# Get Beta Columns
wsj_beta_col = wsj_merge.filter(regex='^wsj').columns

In [None]:
# Execute the test function with the neural network
Y, predictions = exec_standard_nn_test(wsj_merge, wsj_beta_col, 'daily_pol', 32, 32, 0.6, 0.8)