# Hedge Fund Group Project - Codes

In [9]:
import yfinance as yf

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.metrics import accuracy_score

import matplotlib as mpl
COLOR = 'white'
mpl.rcParams['text.color'] = COLOR
mpl.rcParams['axes.labelcolor'] = COLOR
mpl.rcParams['xtick.color'] = COLOR
mpl.rcParams['ytick.color'] = COLOR

In [10]:
# rank-percentile thresholds for the labels: a positive return ranked above up_tres
# is labelled "up", a negative return ranked below down_tres is labelled "down"
up_tres = 0.7
down_tres = 0.3

# size of the rolling window for each ticker's historical beta; the window is 7*β_historical_days rows
# NOTE(review): the factor 7 presumably stands for bars per trading day — confirm it still fits the 1-minute bars downloaded below
β_historical_days = 2

# fraction of the rows (in time order) used for training
train_ratio = 0.5

# fraction of the ranked universe that is held long / held short at each rebalance
long_percentile = 1/3
short_percentile = 1/3

# number of rows advanced between two rebalances
trading_freq = 1

# trade-off in the optimization objective: λ weights the predicted-probability term,
# (1-λ) weights the squared portfolio beta
λ = 0.7

# maximum weight of a single name within the long book or the short book
max_weight = 0.1

# total notional value used to turn weights into share counts and P&L into returns
notional = 1000000

In [11]:
# trading universe handed to yfinance; '^GSPC' is included because later cells use it
# as the market series for the relative-strength feature and for the beta estimate
# NOTE(review): the download progress line reports "59 of 61 completed", so some symbols
# presumably no longer return data — confirm and prune
stocks = yf.Tickers(['^GSPC','GOOGL', 'BLK','INCY', 'AOS',
                     'HAL', 'JBHT', 'HSIC', 'AAP', 'WRB', 'KLAC',
                     'CSCO', 'GL', 'REGN', 'QRVO', 'BIIB', 
                     'FRC', 'SNA', 'TROW', 'AMAT', 'TRV', 'CTSH',
                     'ADM', 'LKQ', 'HIG', 'CBRE', 'RHI', 'SWKS', 'EXPD',
                     'HBAN', 'RF', 'EOG', 'META', 'WY', 'CVX',  
                     'BBY', 'ZION', 'COP', 'INTC', 'PFE', 'XOM', 'AFL', 'EQT', 'PXD',
                     'SIVB', 'WBA', 'SBNY', 'MU', 'CF', 'MRNA', 'FANG',
                     'MRO', 'TSN', 'LEN', 'CTRA', 'DHI', 'MOS',
                     'PVH', 'NUE', 'PFG', 'PHM'])
#NVR

In [None]:
# Download 1-minute bars for roughly the last 30 days in week-long chunks.
today = dt.datetime.today().date()
start = today - dt.timedelta(30)
end = today - dt.timedelta(1)

# Build contiguous half-open windows [window_start, window_end).
# yfinance documents `end` as exclusive, so each window ends exactly where the next one
# starts; the previous 6-day end with a 7-day step skipped one day per chunk.
# NOTE(review): intraday requests are presumably capped at about 7 days each — confirm
# against the yfinance version in use.
date_list = []
while start < end:
    window_end = min(start + dt.timedelta(7), end)
    date_list.append((dt.datetime.strftime(start,'%Y-%m-%d'),
                      dt.datetime.strftime(window_end,'%Y-%m-%d')))
    start += dt.timedelta(7)

# Every request is bounded by its own end date (the old loop dropped `end` after the
# first chunk), and the pieces are concatenated once instead of growing df in a loop.
chunks = [stocks.history(start=chunk_start,end=chunk_end,interval='1m')
          for chunk_start,chunk_end in date_list]
df = pd.concat(chunks,axis=0)
df.head(5)

[**********************97%********************** ]  59 of 61 completed

In [None]:
# add dividends to the close, then discard the column groups that are not used as features
df['Close'] = df['Close'] + df['Dividends']
df.drop(columns=['Dividends','Stock Splits','Open'],level=0,inplace=True)

In [None]:
# remove rows whose values are exact repeats (original cleaning step, kept as is)
df = df.drop_duplicates()
# drop_duplicates compares row values, not timestamps: a timestamp returned by two
# download chunks with slightly different values would survive and leave a non-unique
# index, which breaks the later label-based row assignment and index alignment
df = df[~df.index.duplicated(keep='first')]
df = df.sort_index()

In [None]:
# forward-fill first so a gap is filled with the last known value; back-filling first
# copied future prices into earlier rows (look-ahead). bfill now only covers leading gaps.
df = df.ffill().bfill()
# columns that are still NaN have no data at all (e.g. a failed download) — drop them
df = df.dropna(axis=1)

In [None]:
def insert_df(df,small_df,lev_one_name):
    """Append `small_df` to `df` as a new top-level column group.

    Parameters
    ----------
    df : DataFrame with two-level columns.
    small_df : DataFrame with single-level columns; each column label becomes the
        second level of the new group.
    lev_one_name : label used as the first level for every appended column.

    Returns
    -------
    A new DataFrame: `df` with the relabelled columns concatenated on the right.
    `small_df` itself is left untouched (the relabelling is done on a copy).
    """
    labelled = small_df.copy()
    labelled.columns = pd.MultiIndex.from_tuples([(lev_one_name,col) for col in small_df.columns])
    return pd.concat((df,labelled),axis=1)

## Calculate the stochastic oscillator

In [None]:
# position of the close inside the bar's low-high range, capped at 1
bar_range = df['High'] - df['Low']
sto_Osci = ((df['Close'] - df['Low'])/bar_range).clip(upper=1)
df = insert_df(df,sto_Osci,'stochastic_oscillator')

## Calculate MA

In [None]:
# rolling means of the close over 7*n rows, keyed 'MA_n'
MA = {f'MA_{n}': df['Close'].rolling(window=7*n).mean() for n in (3,5,20,50,200)}

# ratio of the shortest average to each longer one becomes a feature group
for slow in (5,20,50,200):
    ratio_name = f'MA3/{slow}'
    MA[ratio_name] = MA['MA_3']/MA[f'MA_{slow}']
    df = insert_df(df,MA[ratio_name],ratio_name)

# MA_20 is kept in df because the Bollinger-band cell reads it
df = insert_df(df,MA['MA_20'],'MA_20')

[**********************97%********************** ]  59 of 61 completed

## Calculate Bollinger Bands indicators (20 days, 2 standard deviations)

In [None]:
# The band width must use the same window as its centre line: MA_20 is a rolling mean
# over 7*20 rows, so the rolling std uses 7*20 rows too (it previously used 20 rows).
bb_window = 7*20
Mstd = df['Close'].rolling(window=bb_window).std()

# relative distance of the close below the lower band / above the upper band
over_sold = (df['MA_20'] - 2*Mstd - df['Close'])/df['Close']
over_bought = (df['Close'] - df['MA_20'] - 2*Mstd)/df['Close']
df = insert_df(df,over_sold,'BB_over_sold')
df = insert_df(df,over_bought,'BB_over_bought')

# MA_20 was only needed to build the bands
df.drop('MA_20',axis=1,level=0,inplace=True)

## Calculate RSI (relative strength of each ticker versus ^GSPC)

In [None]:
# gross return of every ticker divided by the gross return of ^GSPC, capped at 2
HourReturns = df['Close'].pct_change()
gross_return = 1 + HourReturns
RSI = gross_return.div(gross_return['^GSPC'],axis=0).clip(upper=2)
df = insert_df(df,RSI,'RSI')

## Calculate labels

In [None]:
# Three-class label per ticker and row: 2 = up, 1 = stable, 0 = down.
log_return = np.log(df['Close'].pct_change() + 1)

# Both masks are computed from the untouched returns. The previous version overwrote
# the returns in place and then tested `!= 0`, so a return of exactly 0 (an unchanged
# price) was never relabelled and ended up as class 0 ("down") instead of 1 ("stable").
rank_pct = log_return.rank(pct=True)
is_up = (rank_pct > up_tres) & (log_return > 0)
is_down = (rank_pct < down_tres) & (log_return < 0)

# everything starts as "stable"; rows with NaN returns fail both masks and stay stable
up_or_down = pd.DataFrame(1,index=log_return.index,columns=log_return.columns)
up_or_down = up_or_down.mask(is_up,2).mask(is_down,0)

df = insert_df(df,up_or_down,'up_or_down')
# the label of a row is the class of the NEXT row's return (prediction target);
# the last row becomes NaN and is removed by the following dropna
df['up_or_down'] = df['up_or_down'].shift(-1)

In [None]:
# back-fill leading gaps, then drop any row that still has a missing value
df = df.bfill().dropna(axis=0)

In [None]:
# cast every label column back to int (the shift above turned them into floats);
# .unique() visits each ticker once instead of once per feature group
for stock in df.columns.get_level_values(1).unique():
    df['up_or_down',stock] = df['up_or_down',stock].astype(int)

In [None]:
# snapshot of the raw close prices; an explicit copy so the later in-place overwrite of
# df['Close'] with percentage returns can never leak into these prices
dfClose = df['Close'].copy()

In [None]:
# express high/low relative to the close
df['High'] = df['High']/df['Close']
df['Low'] = df['Low']/df['Close']

# percentage change of the traded volume. This was computed from Close (copy-paste slip),
# which made the Volume feature an exact duplicate of the Close feature.
# A change from zero volume is inf and 0 -> 0 is NaN; both are mapped to 0 so the
# feature matrix stays finite and no extra rows are lost in the dropna below.
volume_change = df['Volume'].pct_change().replace([np.inf,-np.inf],np.nan).fillna(0)
df['Volume'] = volume_change*100

# percentage return of the close
df['Close'] = df['Close'].pct_change()*100

# reorganise the columns as (ticker, feature) and clean the remaining gaps
df.columns = df.columns.swaplevel(0, 1)
df.sort_index(axis=1, level=0, inplace=True)
df.bfill(inplace=True)
df.dropna(inplace=True)

In [None]:
# Chronological split; one row is left out between the two parts, as before.
# Explicit copies: the per-stock column writes below must not act on slices of df.
split_row = int(train_ratio*df.shape[0])
train_df = df.iloc[:split_row].copy()
test_df = df.iloc[split_row+1:].copy()

# One classifier per ticker, trained on that ticker's own feature columns.
for stock in df.columns.get_level_values(0).unique():
    # `num_classes` was removed: it is not an XGBoost parameter (the native name is
    # `num_class`, and XGBClassifier derives it from the labels).
    xgbst = xgb.XGBClassifier(n_estimators=600,
                              objective='multi:softprob',
                              early_stopping_rounds=50, # stop after 50 rounds without improvement on the validation set
                              max_depth=3,
                              eta=0.01,
                              n_jobs=4,
                              verbosity=0)

    train_df[stock,'up_or_down'] = train_df[stock,'up_or_down'].astype(int)
    X_train, X_val, y_train, y_val = train_test_split(train_df[stock].drop('up_or_down',axis=1),
                                                      train_df[stock]['up_or_down'],
                                                      test_size=2/7, random_state=1998)
    print(stock)
    try:
        xgbst.fit(X=X_train,y=y_train,eval_set=[(X_val,y_val)])
    except xgb.core.XGBoostError:
        # Library-level failure: drop the ticker from the test set and move on.
        # This handler must precede `except ValueError` because XGBoostError derives
        # from ValueError; the name is qualified because it was never imported.
        test_df = test_df.drop(columns=stock,level=0)
        continue
    except ValueError:
        # presumably raised when a label class is missing from y_train — TODO confirm;
        # forcing one "stable" label is the original workaround, then fit again
        y_train.iloc[0] = 1
        xgbst.fit(X=X_train,y=y_train,eval_set=[(X_val,y_val)])

    # predict on the untouched test features before the result columns are added
    X_test = test_df[stock].drop('up_or_down',axis=1)
    pred_prob = xgbst.predict_proba(X_test)
    pred = xgbst.predict(X_test)
    test_df[stock,'down_prob'] = pred_prob[:,0]
    test_df[stock,'stable_prob'] = pred_prob[:,1]
    test_df[stock,'up_prob'] = pred_prob[:,2]
    test_df[stock,'pred'] = pred
    

In [None]:
# predicted up-probability for CVX (left axis) against the next row's Close value (right axis)
window = slice(50,70)
cvx = test_df['CVX']
fig,ax = plt.subplots(figsize=(12,7))
ax.plot(cvx['up_prob'].iloc[window])
ax2 = ax.twinx()
ax.set_ylim([0,0.3])
ax2.plot(cvx['Close'].shift(-1).iloc[window],color='Salmon')

In [None]:
test_df.head(3)

In [None]:
# put the feature name on the outer column level and sort the columns
test_df = test_df.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)

In [None]:
test_df.head(5)

# Calculate $\beta$

In [None]:
# Rolling beta of every ticker against ^GSPC:
#   beta = (E[r*m] - E[r]E[m]) / (E[m^2] - E[m]^2)   over the last 7*β_historical_days rows,
# where r is the ticker's 'Close' column (percentage returns at this point) and m is ^GSPC's.
asset_ret = test_df['Close']
market_ret = asset_ret['^GSPC']
beta_window = 7*β_historical_days

mean_asset = asset_ret.rolling(window=beta_window).mean()
mean_market = market_ret.rolling(window=beta_window).mean()
mean_cross = asset_ret.mul(market_ret,axis=0).rolling(window=beta_window).mean()
mean_market_sq = (market_ret**2).rolling(window=beta_window).mean()

covariance = mean_cross - mean_asset.mul(mean_market,axis=0)
market_variance = mean_market_sq - mean_market**2

# A flat market inside the window makes the variance zero and the ratio ±inf. Those
# values are turned into NaN so the later dropna removes the row instead of feeding an
# infinite beta to the optimizer.
β = covariance.div(market_variance,axis=0).replace([np.inf,-np.inf],np.nan)

test_df = insert_df(test_df,β,'beta')

In [None]:
test_df

In [None]:
import cvxpy as cp

In [None]:
# rows without a complete feature/beta set cannot be traded
test_df = test_df.dropna(axis=0)

# loop state for the rebalancing cell
i = 0
T = len(test_df.index)

# size of the long and short books
# NOTE(review): the universe counted here still contains '^GSPC' — confirm the index itself is meant to be tradable
n_stocks = len(test_df['Close'].columns)
n_longs = int(n_stocks*long_percentile)
n_shorts = int(n_stocks*short_percentile)

# one row of target weights per timestamp; created as float so the later arithmetic
# (beta, positions, P&L) does not run on an object-dtype frame
trading_weights = pd.DataFrame(columns=test_df['Close'].columns,
                               index=test_df.index,
                               dtype=float)

In [None]:
test_df.head(5)

In [None]:
# Rebalance every `trading_freq` rows: pick the most likely risers/fallers, then solve a
# small convex problem for the weights of the long and the short book.
for i in range(0, T - 1, trading_freq):
    ind = test_df.index[i]
    up_p = test_df['up_prob'].iloc[i].astype(float)
    down_p = test_df['down_prob'].iloc[i].astype(float)
    beta = test_df['beta'].iloc[i]

    # nlargest returns exactly the requested number of names; the old rank-threshold
    # filter could return more or fewer on ties and break the array shapes below
    longs = up_p.nlargest(n_longs).index
    shorts = down_p.nlargest(n_shorts).index
    n_l, n_s = len(longs), len(shorts)

    # objective coefficients: predicted probability of the move we bet on
    c = np.concatenate([up_p[longs].to_numpy(),
                        down_p[shorts].to_numpy()])

    # signed betas (shorts enter negatively) and the selectors of the two books
    β_row = np.concatenate([beta[longs].to_numpy(),
                           -beta[shorts].to_numpy()])
    long_row = np.concatenate([np.ones(n_l), np.zeros(n_s)])
    short_row = np.concatenate([np.zeros(n_l), np.ones(n_s)])

    # maximise λ * probability-weighted exposure, penalise (1-λ) * squared portfolio beta;
    # each book sums to 1 and every weight lies in [0, max_weight]
    x = cp.Variable((n_l+n_s,1))
    objective = cp.Minimize(-(λ*c)@x + (1-λ)*cp.sum_squares(β_row@x))
    constraints = [long_row@x==1,short_row@x==1,0 <= x, x <= max_weight]
    prob = cp.Problem(objective, constraints)
    prob.solve(solver=cp.ECOS)

    # no solution (infeasible or solver failure): leave this row's weights as NaN
    if x.value is None:
        continue

    # Split the solution by POSITION, then net the two books by label. A ticker can be
    # in both lists; the old label-based sign flip and duplicate merge mishandled that.
    res = x.value.flatten()
    long_weights = pd.Series(res[:n_l],index=longs)
    short_weights = pd.Series(res[n_l:],index=shorts)
    opt_weights = long_weights.sub(short_weights,fill_value=0)

    trading_weights.loc[ind] = opt_weights

In [None]:
trading_weights.head(6)

In [None]:
trading_weights.sum(axis=1).mean()

# Calculate portfolio $\beta$

In [None]:
# heatmap of a small slice of the weights (rows 100-129, columns 1-9), tickers on the y axis
sns.set(rc={'figure.figsize':(30,10)})
trading_weights_for_plot = trading_weights.fillna(0).iloc[100:130,1:10]
sns.heatmap(trading_weights_for_plot.transpose())

In [None]:
trading_weights

In [None]:
# weighted sum of the tickers' betas at every timestamp
port_beta = test_df['beta'].mul(trading_weights).sum(axis=1)

In [None]:
port_beta.mean()

In [None]:
trading_weights.head(10)

In [None]:
test_df.head(10)

# Calculate the number of shares we hold at each 1-minute bar

In [None]:
# 'Close' in test_df was overwritten with percentage returns earlier; put the saved
# price snapshot (dfClose, gaps filled forward then backward) back in its place
test_df['Close'] = dfClose.ffill().bfill()

In [None]:
# dollars allocated per ticker divided by its price, truncated to whole shares
dollar_exposure = trading_weights * notional
positions = (dollar_exposure/test_df['Close']).fillna(0).astype(int)

In [None]:
test_df['Close']

In [None]:
# cumulative growth of 1 unit: each row's weights earn the next row's price change
next_row_return = test_df['Close'].pct_change().shift(-1)
for_plot = (1 + (next_row_return * trading_weights).sum(axis=1)).cumprod()

In [None]:
for_plot.name = 'cumprod'

In [None]:
for_plot = for_plot.reset_index()

In [None]:
for_plot

In [None]:
# cumulative gross return of the weight series (`fix` was a typo for `fig`)
fig,ax = plt.subplots(figsize=(12,7))
ax.plot(for_plot['cumprod'])
ax.set_xlabel('row')
ax.set_ylabel('cumulative gross return')
ax.set_title('Cumulative return of the long/short weights');

In [None]:
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# shallow decision tree on one ticker's training rows, used for the illustration below
ticker = 'PFE'
ticker_rows = train_df[ticker]
X = ticker_rows.drop(columns='up_or_down')
y = ticker_rows['up_or_down']

tree_class1 = DecisionTreeClassifier(random_state=1998,max_depth=3)
tree_class1.fit(X,y)

In [None]:
# switch the text colours to black for this figure
COLOR = 'black'
for rc_key in ('text.color','axes.labelcolor','xtick.color','ytick.color'):
    mpl.rcParams[rc_key] = COLOR

plt.figure(figsize=(18,11))
# feature_names is passed as a plain list: some scikit-learn releases validate this
# argument and reject a pandas Index
treeplot = tree.plot_tree(tree_class1,
                          feature_names=list(X.columns),
                          class_names=['sell','hold','buy'],
                          fontsize=12)

In [None]:
test_df.High.CVX

In [None]:
# working copies of the share counts and the prices for the P&L calculation
pos_df, clo_df = positions.copy(), test_df['Close'].copy()

In [None]:
pos_df

In [None]:
clo_df.head(5)

In [None]:
commission_per_share = 0.001

# one row per timestamp, starting with the portfolio beta
port_df = pd.DataFrame(index=positions.index)
port_df['beta'] = port_beta

# value of the shares held at each row, priced at that row
held_value = pos_df * clo_df
port_df['now_val'] = held_value.sum(axis=1)

# value of the same shares priced at the following row (carry forward, price, realign)
carried_value = pos_df.shift(1) * clo_df
port_df['next_val'] = carried_value.sum(axis=1).shift(-1)

# gross P&L and return from one row to the next
port_df['minu_pnl'] = port_df['next_val'].sub(port_df['now_val'])
port_df['minu_ret'] = port_df['minu_pnl'].div(notional)

# commission: a fixed fee per share on the absolute change in share count
change_pos = pos_df.shift(-1) - pos_df
port_df['commission'] = change_pos.abs().mul(commission_per_share).sum(axis=1)

# net figures and running totals
port_df['minu_net_pnl'] = port_df['minu_pnl'].sub(port_df['commission'])
port_df['minu_net_ret'] = port_df['minu_net_pnl'].div(notional)
port_df['cum_commision'] = port_df['commission'].cumsum()
port_df['cum_pnl'] = port_df['minu_net_pnl'].cumsum()


In [None]:
# cumulative gross P&L and cumulative net P&L against the row number
pnl_by_row = port_df.reset_index()
pnl_by_row['minu_pnl'].cumsum().plot()
pnl_by_row['cum_pnl'].plot()

In [None]:
port_df.reset_index().minu_pnl.iloc[925:935]

In [None]:
((pos_df.shift(1) * clo_df).shift(-1)-(pos_df * clo_df)).iloc[929]

In [None]:
port_df['cum_pnl']

In [None]:
# performance columns and share counts side by side, with gaps forward-filled
df_v3 = pd.concat([port_df,pos_df],axis=1).ffill()

In [None]:
df_v3.to_csv('performance_hf_v4.csv')

In [None]:
df_v3_daily = df_v3.copy()

In [None]:
# Keep only the last row of each calendar day: a row is kept when the following row
# belongs to a different date, and the final row is always kept.
# A single boolean mask replaces the row-by-row in-place drop, which rebuilt the frame
# on every deletion.
row_dates = df_v3_daily.index.date
is_day_end = np.ones(len(row_dates),dtype=bool)
is_day_end[:-1] = row_dates[:-1] != row_dates[1:]
df_v3_daily = df_v3_daily[is_day_end]

In [None]:
df_v3_daily.to_csv('performance_hf_v4_daily.csv')

In [None]:
df_v3.head(3)

In [None]:
# portfolio beta over time, with every 200th timestamp shown as a tick label
x_date = [stamp.strftime("%m/%d/%Y %H:%M:%S") for stamp in df_v3.index]
shown_ticks = x_date[::200]

fig,ax = plt.subplots(figsize=(12,7))
ax.plot(x_date,df_v3['beta'],label='beta change')
ax.set_xticks(shown_ticks)
ax.set_xticklabels(shown_ticks, rotation=45)

ax.legend()

In [None]:
import yfinance as yf

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.metrics import accuracy_score

import matplotlib as mpl
COLOR = 'white'
mpl.rcParams['text.color'] = COLOR
mpl.rcParams['axes.labelcolor'] = COLOR
mpl.rcParams['xtick.color'] = COLOR
mpl.rcParams['ytick.color'] = COLOR
# NOTE(review): this reloads 'performance_hf_v3.csv', while the cells above write
# 'performance_hf_v4.csv' — confirm which file is intended; a fresh run needs the v3 file to exist.
# No index_col is given, so the frame gets a default integer index.
df_v3 = pd.read_csv('performance_hf_v3.csv')

In [None]:
# histogram of the portfolio beta with its mean and a ±1.96 standard-deviation band
bgd = '0.28'

β_mean = df_v3['beta'].mean()
β_std = df_v3['beta'].std()
band_low = β_mean-1.96*β_std
band_high = β_mean+1.96*β_std

fig,ax = plt.subplots(figsize=(12,7))
ax.hist(df_v3['beta'],bins=30,color='0.75')
max_ylim = ax.get_ylim()[1]

# dashed line at the mean and a translucent band between the two bounds
ax.axvline(β_mean,color='w',linestyle='dashed',linewidth=1)
ax.fill_betweenx([0,max_ylim*0.9489],band_low,band_high,color='w',alpha=0.2)

ax.set_xlim(left=-1,right=0.5)

# numeric annotations below the axis: (x position, value shown)
annotations = [(β_mean*1.27, df_v3['beta'].mean()),
               (band_low*1.07, band_low),
               (band_high*0.93, band_high)]
for x_pos,shown_value in annotations:
    ax.text(x_pos,-50,'{:.2f}'.format(shown_value),color='w')

ax.set_xlabel(r'Equity Portfolio $\beta$')
ax.set_ylabel('Frequency')
ax.set_title(r'Frequency of $\beta$ in Our Dynamic Portfolio')
ax.title.set_size(20)

# dark background for both the axes and the figure
ax.set_facecolor(bgd)
fig.set_facecolor(bgd)


In [None]:
df_v3['beta'].std()

In [None]:
np.array([type(x_date[i]) == str for i in range(len(x_date))]).min()

In [None]:
test_df


In [None]:
# swap the two column levels again and sort the columns, then show the frame
test_df = test_df.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)
test_df

In [None]:
test_df.AAP