In [2]:
# TODO
# Find data set
# Split into in-sample and out of sample sets
# train the model on the in-sample set optimizing returns by adjusting the significance_level and the time to reverse_trade

In [3]:
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.stats import norm
import pandas_market_calendars as mcal
from scipy import stats

In [4]:
def bns_jump_test(returns, significance_level=0.01):
    n = len(returns)
    mu = returns.mean(axis=0)
    sigma = returns.std(axis=0)

    z_stat = np.sqrt(n) * (np.abs(mu) - 0.5 * sigma ** 2) / sigma

    critical_value = norm.ppf(1 - significance_level)
    
    return z_stat > critical_value, z_stat, np.argmax(returns)

In [5]:
def get_price_data(tickers, start_date, end_date, interval):
    stock_data = {}
    for ticker in tickers:
        try:
            stock_data[ticker] = yf.download(ticker, start=start_date, end=end_date, progress=False, interval=interval)
        except:
            print("Error: Could not download data for ticker " + ticker)
            return None
    return stock_data

In [6]:
def get_return_data(tickers, yesterday_data, today_data):
    return_data = {}
    for ticker in tickers:
        price_data = yesterday_data[ticker]['Close'].values
        np.append(price_data, today_data[ticker]['Close'].values[0])
        returns = np.diff(np.log(price_data))
        return_data[ticker] = returns
    return return_data

In [7]:
def get_BNS_test_data(tickers, returns, threshold):
    bns_test_data = {}
    for ticker in tickers:
        bns_test_data[ticker] = bns_jump_test(returns[ticker], threshold)
    return bns_test_data

In [8]:
def get_overnight_jumps(data, returns):
    overnight_jumps = []
    for key, value in data.items():
        is_jump, test_stat, jump_index = value
        if is_jump:
            if jump_index == len(returns[key]) - 1:
                print(f"{key} jumped at the end of the day with a test statistic of {test_stat}")
                overnight_jumps.append((key, test_stat, returns[key][len(returns[key])-1]))
    overnight_jumps.sort(key=lambda x: x[1], reverse=True)
    return overnight_jumps

In [9]:
def get_trades(overnight_jumps, today_data, portfolio, today_date, close_time):

    potential_trade_list = [None] * min(10, len(overnight_jumps))
    return_total = sum([abs(x[2]) for x in overnight_jumps])
    for i in range(min(10, len(overnight_jumps))):
        ticker, _, jump = overnight_jumps[i]
        trade_weight = abs(jump) / return_total
        potential_trade_list[i] = (ticker, trade_weight, jump)

    trade_list = [None] * len(potential_trade_list)
    for i in range(len(potential_trade_list)):
        ticker, trade_weight, jump = potential_trade_list[i]
        if jump < 0:
            # take a long postion
            sell_price = today_data[ticker]['Close'].values[0]
            buy_price = today_data[ticker]['Close'].values[close_time] 
            profit = (sell_price - buy_price) * trade_weight * portfolio / sell_price
            trade_list[i] = (ticker, profit)
        elif jump > 0:
            # take a short position
            buy_price = today_data[ticker]['Close'].values[0]
            sell_price = today_data[ticker]['Close'].values[close_time]
            profit = (sell_price - buy_price) * trade_weight * portfolio / sell_price
            trade_list[i] = (ticker, profit)
        else:
            print(f'Error: On {today_date}, {ticker} had a jump of 0.')
    return trade_list

In [10]:
# Define global variables

set_start_date = '2023-03-07'
set_start_date_following = '2023-03-08'
set_end_date = '2023-05-03'
set_end_date = '2023-05-04'
set_interval = "5m" 
# 7=hourly, 78=5m, 387=1m
daily_price_length = 78

# Initial Portfolio
portfolio = 10000

# Create a trading day calendar
nyse = mcal.get_calendar('NYSE')
early = nyse.schedule(start_date=set_start_date, end_date=set_end_date, tz='America/New_York')

# TODO dataset of sp500 on minute intervals for the last ten years
sp500_tickers = ['MMM','AOS','AMZN','ABBV','ACN','ATVI','ADM','ADBE','ADP','AAP','AES','AFL','A','APD','AKAM','ALK','ALB','ARE','ALGN']

In [11]:
# Train Model
training_results = {}
jump_thresholds = [0.001, 0.002, 0.005, 0.01, 0.015, 0.02]
trade_close_times = [60, 80, 120, 140, 180]
for threshold in jump_thresholds:
    for close_time in trade_close_times:

        curr_training_trades = {}

        for i in range(2, len(early)-1, 2):

            yesterday_date = early.iloc[i-1]['market_close'].date()
            today_date = early.iloc[i]['market_close'].date()
            tomorrow_date = early.iloc[i+1]['market_close'].date()

            yesterday_data = get_price_data(sp500_tickers, yesterday_date, today_date, set_interval)
            if yesterday_data == None:
                continue
            today_data = get_price_data(sp500_tickers, today_date, tomorrow_date, set_interval)
            if today_data == None:
                continue
            for ticker in sp500_tickers:
                if len(today_data[ticker]) != daily_price_length:
                    print(f"Error: {ticker} data is missing for {today_date}")
                    continue

            # Get the percent change between the log of the price data 
            curr_returns = get_return_data(sp500_tickers, yesterday_data, today_data)

            # Perform the BNS jump test
            curr_bns_test_data = get_BNS_test_data(sp500_tickers, curr_returns, threshold)

            # Aggregate the data at which an overnight jump took place
            curr_overnight_jumps = get_overnight_jumps(curr_bns_test_data, curr_returns)

            # Formulate trades from the overnight jumps
            trade_list = get_trades(curr_overnight_jumps, today_data, today_date, portfolio, close_time)

            total_profit = sum([x[1] for x in trade_list])
            curr_training_trades[today_date] = (trade_list, total_profit)

        total_profits = 0
        for key, value in curr_training_trades.items():
            if len(value) > 0:
                for trade in value:
                    total_profits += trade[1]
                    
        training_results[(threshold, close_time)] = total_profits

Error: MMM data is missing for {'MMM':                                  Open        High         Low       Close   
Datetime                                                                    
2023-03-09 09:30:00-05:00  107.660004  107.849998  107.330002  107.801399  \
2023-03-09 09:35:00-05:00  107.879997  107.934998  107.760002  107.849998   
2023-03-09 09:40:00-05:00  107.894997  108.120003  107.800003  107.879997   
2023-03-09 09:45:00-05:00  107.885002  108.239998  107.885002  108.190002   
2023-03-09 09:50:00-05:00  108.190002  108.209999  107.919998  107.919998   
...                               ...         ...         ...         ...   
2023-03-09 15:35:00-05:00  106.160004  106.230003  105.959999  106.206703   
2023-03-09 15:40:00-05:00  106.180000  106.250000  106.029999  106.050003   
2023-03-09 15:45:00-05:00  106.050003  106.050003  105.839996  105.910004   
2023-03-09 15:50:00-05:00  105.860001  106.059998  105.610001  105.690002   
2023-03-09 15:55:00-05:00  105.695000

  mu = returns.mean(axis=0)
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


ValueError: attempt to get argmax of an empty sequence

In [None]:
# Get best hyperparameters
best_hyperparameters = None
best_profit = - np.inf

for key, value in training_results.items():
    if value > best_profit:
        best_profit = value
        best_hyperparameters = key

In [9]:
# Validate Model
total_trades = {}
jump_threshold = best_hyperparameters[0]
trade_close_time = best_hyperparameters[1]

for i in range(1, len(early)-1, 2):
    
    yesterday_date = early.iloc[i-1]['market_close'].date()
    today_date = early.iloc[i]['market_close'].date()
    tomorrow_date = early.iloc[i+1]['market_close'].date()
    
    yesterday_data = get_price_data(sp500_tickers, yesterday_date, today_date, set_interval)
    if yesterday_data == None:
        continue
    today_data = get_price_data(sp500_tickers, today_date, tomorrow_date, set_interval)
    if today_data == None:
        continue
    for ticker in sp500_tickers:
        if len(today_data[ticker]) != daily_price_length:
            print(f"Error: {ticker} data is missing for {today_data}")
            continue

    # Get the percent change between the log of the price data 
    curr_returns = get_return_data(sp500_tickers, yesterday_data, today_data)

    # Perform the BNS jump test
    curr_bns_test_data = get_BNS_test_data(sp500_tickers, curr_returns, jump_threshold)

    # TODO implement stats function to get the overnight_jump stats ie how many were positive jumps vs negative jumps, largest jump size, mean jump size, std jump size

    # Aggregate the data at which an overnight jump took place
    curr_overnight_jumps = get_overnight_jumps(curr_bns_test_data, curr_returns)

    # Formulate trades from the overnight jumps
    trade_list = get_trades(curr_overnight_jumps, today_data, today_date, portfolio, close_time)

    total_profit = sum([x[1] for x in trade_list])
    total_trades[today_date] = (trade_list, total_profit)

On 2022-01-04, the total return was 0.
On 2022-01-05, the total return was 0.
ADBE jumped at the end of the day with a test statistic of 4.593757360089037
On 2022-01-06, the total return was 40.96712918590703.
On 2022-01-07, the total return was 0.
On 2022-01-10, the total return was 0.
On 2022-01-11, the total return was 0.
On 2022-01-12, the total return was 0.
On 2022-01-13, the total return was 0.
On 2022-01-14, the total return was 0.
On 2022-01-18, the total return was 0.
On 2022-01-19, the total return was 0.
On 2022-01-20, the total return was 0.
On 2022-01-21, the total return was 0.
ABBV jumped at the end of the day with a test statistic of 2.6212144262568446
On 2022-01-24, the total return was 233.83206455357725.
On 2022-01-25, the total return was 0.
On 2022-01-26, the total return was 0.
On 2022-01-27, the total return was 0.
On 2022-01-28, the total return was 0.
On 2022-01-31, the total return was 0.
On 2022-02-01, the total return was 0.
On 2022-02-02, the total return 

In [1]:
trade_df = pd.DataFrame(trade_list, columns=['Date', 'Trades', 'Profit'])
total_profit = sum(trade_df['Profit'].values)
trade_df.head(10)

NameError: name 'pd' is not defined

In [None]:
# Calculate the BH strategy return of the S&P500
start_data = get_price_data("SPY", set_start_date, set_start_date_following)
end_data = get_price_data("SPY", set_end_date, set_start_date_following)
buy_price = start_data['Open'].values[0]
sell_price = end_data['Close'].values[-1]
sp500_profit = (sell_price - buy_price) * portfolio / sell_price

In [20]:
print(f"The total profits were {total_profits}.")
print(f"The total return was {total_profits / portfolio}%.")
print(f"The total return of the S&P 500 was {sp500_profit / portfolio}") #TODO: create function to calc return of S&P 500
print(f"Our strategy beats the S&P 500 by {(total_profits / portfolio) - (sp500_profit / portfolio)}%.")

The total profits were -219.2117623892016.
The total return was -0.02192117623892016%.
The total return of the S&P 500 was -18.11%
Our strategy beats the S&P 500 by 18.08807882376108%.
