# Unsupervised Learning Trading Strategy

* Download/Load SP500 stocks prices data.
* Calculate different features and indicators on each stock.
* Aggregate on monthly level and filter top 150 most liquid stocks.
* Calculate Monthly Returns for different time-horizons.
* Download Fama-French Factors and Calculate Rolling Factor Betas.
* For each month fit a K-Means Clustering Algorithm to group similar assets based on their features.
* For each month select assets based on the cluster and form a portfolio based on Efficient Frontier max sharpe ratio optimization.
* Visualize Portfolio returns and compare to SP500 returns.

# All Packages Needed:
* pandas, numpy, matplotlib, statsmodels, pandas_datareader, datetime, yfinance, sklearn, PyPortfolioOpt

## 1. Download/Load SP500 stocks prices data.

In [17]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-')

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2024-09-24'

start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date).stack()

df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

[*********************100%***********************]  503 of 503 completed


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-09-26 00:00:00+00:00,A,43.734447,46.570000,46.770000,46.160000,46.459999,2639100.0
2016-09-26 00:00:00+00:00,AAPL,26.029972,28.219999,28.347500,27.887501,27.910000,119477600.0
2016-09-26 00:00:00+00:00,ABBV,45.387066,64.070000,64.879997,64.019997,64.519997,5567700.0
2016-09-26 00:00:00+00:00,ABT,36.004753,41.680000,42.020000,41.610001,42.009998,6378200.0
2016-09-26 00:00:00+00:00,ACGL,26.486668,26.486668,26.653334,26.463333,26.653334,1023600.0
...,...,...,...,...,...,...,...
2024-09-23 00:00:00+00:00,XYL,134.509995,134.509995,135.600006,133.990005,135.270004,739200.0
2024-09-23 00:00:00+00:00,YUM,130.369995,130.369995,131.169998,129.679993,129.830002,2140700.0
2024-09-23 00:00:00+00:00,ZBH,107.570000,107.570000,107.690002,106.309998,107.279999,1292900.0
2024-09-23 00:00:00+00:00,ZBRA,362.390015,362.390015,367.890015,359.299988,367.339996,342600.0


## 2. Calculate features and technical indicators for each stock.

* Garman-Klass Volatility
* RSI
* Bollinger Bands
* ATR
* MACD
* Dollar Volume

\begin{equation}
\text{Garman-Klass Volatility} = \frac{(\ln(\text{High}) - \ln(\text{Low}))^2}{2} - (2\ln(2) - 1)(\ln(\text{Adj Close}) - \ln(\text{Open}))^2
\end{equation}

In [18]:
df['garman_klass_vol'] = ((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*((np.log(df['adj close'])-np.log(df['open']))**2)

df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

df['bb_low'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,0])
                                                          
df['bb_mid'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,1])
                                                          
df['bb_high'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,2])

def compute_atr(stock_data):
    atr = pandas_ta.atr(high=stock_data['high'],
                        low=stock_data['low'],
                        close=stock_data['close'],
                        length=14)
    return atr.sub(atr.mean()).div(atr.std())

df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

def compute_macd(close):
    macd = pandas_ta.macd(close=close, length=20).iloc[:,0]
    return macd.sub(macd.mean()).div(macd.std())

df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)

df['dollar_volume'] = (df['adj close']*df['volume'])/1e6

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_high,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-09-26 00:00:00+00:00,A,43.734447,46.570000,46.770000,46.160000,46.459999,2639100.0,-0.001326,,,,,,,115.419580
2016-09-26 00:00:00+00:00,AAPL,26.029972,28.219999,28.347500,27.887501,27.910000,119477600.0,-0.001745,,,,,,,3109.998592
2016-09-26 00:00:00+00:00,ABBV,45.387066,64.070000,64.879997,64.019997,64.519997,5567700.0,-0.047706,,,,,,,252.701567
2016-09-26 00:00:00+00:00,ABT,36.004753,41.680000,42.020000,41.610001,42.009998,6378200.0,-0.009144,,,,,,,229.645516
2016-09-26 00:00:00+00:00,ACGL,26.486668,26.486668,26.653334,26.463333,26.653334,1023600.0,0.000010,,,,,,,27.111753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-23 00:00:00+00:00,XYL,134.509995,134.509995,135.600006,133.990005,135.270004,739200.0,0.000059,52.857154,4.848016,4.893654,4.939292,0.930221,-0.006514,99.429788
2024-09-23 00:00:00+00:00,YUM,130.369995,130.369995,131.169998,129.679993,129.830002,2140700.0,0.000059,42.163815,4.875445,4.900356,4.925267,0.340858,-0.828775,279.083049
2024-09-23 00:00:00+00:00,ZBH,107.570000,107.570000,107.690002,106.309998,107.279999,1292900.0,0.000080,46.261499,4.620745,4.699861,4.778977,-0.670485,-0.486367,139.077253
2024-09-23 00:00:00+00:00,ZBRA,362.390015,362.390015,367.890015,359.299988,367.339996,342600.0,0.000208,62.653151,5.767259,5.838859,5.910458,0.268413,0.777368,124.154819


## 3. Aggregate to monthly level and filter top 150 most liquid stocks for each month.

* To reduce training time and experiment with features and strategies, we convert the business-daily data to month-end frequency.

In [19]:
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', 'volume', 'open',
                                                          'high', 'low', 'close']]

data = (pd.concat([df.unstack('ticker')['dollar_volume'].resample('M').mean().stack('ticker').to_frame('dollar_volume'),
                   df.unstack()[last_cols].resample('M').last().stack('ticker')],
                  axis=1)).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,dollar_volume,adj close,atr,bb_high,bb_low,bb_mid,garman_klass_vol,macd,rsi
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-10-31 00:00:00+00:00,A,75.765039,41.018745,-1.373480,3.839099,3.718486,3.778792,-0.001089,-0.809706,32.795983
2016-10-31 00:00:00+00:00,AAPL,3494.595147,26.182169,-1.302942,3.350880,3.293123,3.322002,-0.002541,-0.275811,43.456213
2016-10-31 00:00:00+00:00,ABBV,274.543306,39.878788,-1.003486,3.852823,3.744517,3.798670,-0.049190,-0.864856,22.957578
2016-10-31 00:00:00+00:00,ABT,317.927984,34.112484,-1.252474,3.650426,3.549492,3.599959,-0.008074,-0.839670,36.011789
2016-10-31 00:00:00+00:00,ACGL,29.912385,25.990000,-1.086603,3.322517,3.278161,3.300339,0.000021,-0.508293,46.128721
...,...,...,...,...,...,...,...,...,...,...
2024-09-30 00:00:00+00:00,XYL,168.773442,134.509995,0.930221,4.939292,4.848016,4.893654,0.000059,-0.006514,52.857154
2024-09-30 00:00:00+00:00,YUM,277.956378,130.369995,0.340858,4.925267,4.875445,4.900356,0.000059,-0.828775,42.163815
2024-09-30 00:00:00+00:00,ZBH,208.291736,107.570000,-0.670485,4.778977,4.620745,4.699861,0.000080,-0.486367,46.261499
2024-09-30 00:00:00+00:00,ZBRA,132.632235,362.390015,0.268413,5.910458,5.767259,5.838859,0.000208,0.777368,62.653151


In [53]:
new_data = data.copy()
# If you need ticker as a column, reset the index
new_data_reset = new_data.reset_index()
print(new_data_reset.head())



Empty DataFrame
Columns: [index, date, atr, bb_high, bb_low, bb_mid, garman_klass_vol, macd, rsi, return_1m, return_2m, return_3m, return_6m, return_9m, return_12m, Mkt-RF, SMB, HML, RMW, CMA]
Index: []


* Calculate 5-year rolling average of dollar volume for each stocks before filtering.

In [20]:
data['dollar_volume'] = (data.loc[:, 'dollar_volume'].unstack('ticker').rolling(5*12, min_periods=12).mean().stack())

data['dollar_vol_rank'] = (data.groupby('date')['dollar_volume'].rank(ascending=False))

data = data[data['dollar_vol_rank']<150].drop(['dollar_volume', 'dollar_vol_rank'], axis=1)

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,atr,bb_high,bb_low,bb_mid,garman_klass_vol,macd,rsi
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-09-30 00:00:00+00:00,AAPL,36.166752,-1.161724,3.684698,3.589136,3.636917,-0.001175,-0.458993,45.768657
2017-09-30 00:00:00+00:00,ABBV,65.390625,-0.480080,4.254232,4.050517,4.152374,-0.035700,1.020006,70.568704
2017-09-30 00:00:00+00:00,ABT,47.232559,-1.192606,3.877438,3.825775,3.851606,-0.006208,0.324536,63.665039
2017-09-30 00:00:00+00:00,ACN,121.064758,-1.130454,4.841457,4.770444,4.805951,-0.005643,0.160785,56.250187
2017-09-30 00:00:00+00:00,ADBE,149.179993,-1.408502,5.085897,4.977719,5.031808,0.000055,-0.295412,47.932457
...,...,...,...,...,...,...,...,...,...
2024-09-30 00:00:00+00:00,VZ,44.259998,-0.154778,3.841348,3.722256,3.781802,0.000084,1.853100,62.516065
2024-09-30 00:00:00+00:00,WFC,55.110001,1.437881,4.100455,3.966194,4.033324,0.000258,-0.421051,48.577460
2024-09-30 00:00:00+00:00,WMT,80.330002,1.931912,4.407265,4.332253,4.369759,0.000051,2.922900,69.038827
2024-09-30 00:00:00+00:00,XOM,117.360001,0.903737,4.795407,4.704741,4.750074,0.000145,-0.247735,55.129726


## 4. Calculate Monthly Returns for different time horizons as features.

* To capture time series dynamics that reflect, for example, momentum patterns, we compute historical returns using the method .pct_change(lag), that is, returns over various monthly periods as identified by lags.

In [21]:
def calculate_returns(df):

    outlier_cutoff = 0.005

    lags = [1, 2, 3, 6, 9, 12]

    for lag in lags:

        df[f'return_{lag}m'] = (df['adj close']
                              .pct_change(lag)
                              .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                                     upper=x.quantile(1-outlier_cutoff)))
                              .add(1)
                              .pow(1/lag)
                              .sub(1))
    return df
    
    
data = data.groupby(level=1, group_keys=False).apply(calculate_returns).dropna()

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,atr,bb_high,bb_low,bb_mid,garman_klass_vol,macd,rsi,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-09-30 00:00:00+00:00,AAPL,53.774536,-0.893718,4.018120,3.958415,3.988268,-0.000718,-0.053507,61.186162,-0.008303,0.091080,0.069629,0.051986,0.033843,0.033607
2018-09-30 00:00:00+00:00,ABBV,72.047684,-0.713339,4.312119,4.257714,4.284916,-0.027228,-0.480625,49.718911,-0.014586,0.012660,0.010312,0.003293,0.000594,0.008112
2018-09-30 00:00:00+00:00,ABT,66.177361,-1.105082,4.211202,4.073479,4.142340,-0.003433,1.151629,79.127152,0.097547,0.057978,0.065081,0.035906,0.029900,0.028503
2018-09-30 00:00:00+00:00,ACN,155.388916,-1.090293,5.077972,5.037983,5.057977,-0.003063,0.130788,54.490806,0.006684,0.033549,0.013291,0.018852,0.012830,0.021018
2018-09-30 00:00:00+00:00,ADBE,269.950012,-0.818217,5.619120,5.555771,5.587446,0.000100,0.115332,56.971111,0.024439,0.050370,0.034532,0.037795,0.049180,0.050665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-30 00:00:00+00:00,VRTX,460.000000,1.662944,6.210427,6.132821,6.171624,0.000078,-1.003082,41.206536,-0.072375,-0.036702,-0.006240,0.016081,0.013725,0.023588
2024-09-30 00:00:00+00:00,VZ,44.259998,-0.154778,3.841348,3.722256,3.781802,0.000084,1.853100,62.516065,0.059359,0.045132,0.029392,0.014379,0.023544,0.028640
2024-09-30 00:00:00+00:00,WFC,55.110001,1.437881,4.100455,3.966194,4.033324,0.000258,-0.421051,48.577460,-0.057465,-0.032627,-0.022146,-0.006161,0.014930,0.027744
2024-09-30 00:00:00+00:00,WMT,80.330002,1.931912,4.407265,4.332253,4.369759,0.000051,2.922900,69.038827,0.040140,0.083349,0.059629,0.049784,0.048409,0.034208


## 5. Download Fama-French Factors and Calculate Rolling Factor Betas.

* We will introduce the Fama—French data to estimate the exposure of assets to common risk factors using linear regression.

* The five Fama—French factors, namely market risk, size, value, operating profitability, and investment have been shown empirically to explain asset returns and are commonly used to assess the risk/return profile of portfolios. Hence, it is natural to include past factor exposures as financial features in models.

* We can access the historical factor returns using the pandas-datareader and estimate historical exposures using the RollingOLS rolling linear regression.

In [22]:
import pandas_datareader.data as web

# Retrieve factor data and process it
factor_data = web.DataReader('F-F_Research_Data_5_Factors_2x3',
                             'famafrench',
                             start='2020')[0].drop('RF', axis=1)

factor_data.index = factor_data.index.to_timestamp()

# Resample and adjust factor data
factor_data = factor_data.resample('M').last().div(100)

factor_data.index.name = 'date'

# Handle the index of data['return_1m'] if it's a MultiIndex
if isinstance(data['return_1m'].index, pd.MultiIndex):
    # If one of the levels is a datetime index, localize it
    if isinstance(data['return_1m'].index.get_level_values(0), pd.DatetimeIndex):
        data['return_1m'].index = data['return_1m'].index.set_levels(
            [data['return_1m'].index.levels[0].tz_localize(None), data['return_1m'].index.levels[1]],
            level=[0, 1]
        )
else:
    # Remove timezone information from index if it's a DatetimeIndex
    data['return_1m'].index = data['return_1m'].index.tz_localize(None)

# Join factor data with returns data and sort by index
factor_data = factor_data.join(data['return_1m']).sort_index()

# Output the final joined DataFrame
factor_data


data_factored=factor_data.copy()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-31,AAPL,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,0.054010
2020-01-31,ABBV,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,-0.072570
2020-01-31,ABT,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,0.007485
2020-01-31,ACN,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,-0.021687
2020-01-31,ADBE,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,0.064674
...,...,...,...,...,...,...,...
2024-07-31,VRTX,0.0124,0.0833,0.0570,0.0017,0.0043,0.057604
2024-07-31,VZ,0.0124,0.0833,0.0570,0.0017,0.0043,-0.001379
2024-07-31,WFC,0.0124,0.0833,0.0570,0.0017,0.0043,-0.000842
2024-07-31,WMT,0.0124,0.0833,0.0570,0.0017,0.0043,0.013735


In [49]:
data_factored=data.copy()
print(data_factored.columns)

Index(['date', 'atr', 'bb_high', 'bb_low', 'bb_mid', 'garman_klass_vol',
       'macd', 'rsi', 'return_1m', 'return_2m', 'return_3m', 'return_6m',
       'return_9m', 'return_12m', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA'],
      dtype='object')


* Filter out stocks with less than 10 months of data.

In [23]:
observations = factor_data.groupby(level=1).size()

valid_stocks = observations[observations >= 10]

factor_data = factor_data[factor_data.index.get_level_values('ticker').isin(valid_stocks.index)]

factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-31,AAPL,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,0.054010
2020-01-31,ABBV,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,-0.072570
2020-01-31,ABT,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,0.007485
2020-01-31,ACN,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,-0.021687
2020-01-31,ADBE,-0.0011,-0.0440,-0.0625,-0.0120,-0.0230,0.064674
...,...,...,...,...,...,...,...
2024-07-31,VRTX,0.0124,0.0833,0.0570,0.0017,0.0043,0.057604
2024-07-31,VZ,0.0124,0.0833,0.0570,0.0017,0.0043,-0.001379
2024-07-31,WFC,0.0124,0.0833,0.0570,0.0017,0.0043,-0.000842
2024-07-31,WMT,0.0124,0.0833,0.0570,0.0017,0.0043,0.013735


* Calculate Rolling Factor Betas.

In [24]:
betas = (factor_data.groupby(level=1,
                            group_keys=False)
         .apply(lambda x: RollingOLS(endog=x['return_1m'], 
                                     exog=sm.add_constant(x.drop('return_1m', axis=1)),
                                     window=min(24, x.shape[0]),
                                     min_nobs=len(x.columns)+1)
         .fit(params_only=True)
         .params
         .drop('const', axis=1)))

betas 

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-31,AAPL,,,,,
2020-01-31,ABBV,,,,,
2020-01-31,ABT,,,,,
2020-01-31,ACN,,,,,
2020-01-31,ADBE,,,,,
...,...,...,...,...,...,...
2024-07-31,VRTX,0.569025,0.648189,-0.392910,0.648667,0.140270
2024-07-31,VZ,0.689617,-0.844189,0.631207,0.297424,-0.472912
2024-07-31,WFC,1.111663,-0.130669,1.182282,-0.816394,-0.613398
2024-07-31,WMT,0.476749,0.123753,-0.164167,0.869365,0.094606


* Join the rolling factors data to the main features dataframe.

In [25]:
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

data = (data.join(betas.groupby('ticker').shift()))

data.loc[:, factors] = data.groupby('ticker', group_keys=False)[factors].apply(lambda x: x.fillna(x.mean()))

data = data.drop('adj close', axis=1)

data = data.dropna()

data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 0 entries
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   atr               0 non-null      float64
 1   bb_high           0 non-null      float64
 2   bb_low            0 non-null      float64
 3   bb_mid            0 non-null      float64
 4   garman_klass_vol  0 non-null      float64
 5   macd              0 non-null      float64
 6   rsi               0 non-null      float64
 7   return_1m         0 non-null      float64
 8   return_2m         0 non-null      float64
 9   return_3m         0 non-null      float64
 10  return_6m         0 non-null      float64
 11  return_9m         0 non-null      float64
 12  return_12m        0 non-null      float64
 13  Mkt-RF            0 non-null      float64
 14  SMB               0 non-null      float64
 15  HML               0 non-null      float64
 16  RMW               0 non-null      float64
 17  CMA      

### At this point we have to decide on what ML model and approach to use for predictions etc.


## 6. For each month fit a K-Means Clustering Algorithm to group similar assets based on their features.

### K-Means Clustering
* You may want to initialize predefined centroids for each cluster based on your research.

* For visualization purpose of this tutorial we will initially rely on the ‘k-means++’ initialization.

* Then we will pre-define our centroids for each cluster.

In [26]:
df.columns

Index(['adj close', 'close', 'high', 'low', 'open', 'volume',
       'garman_klass_vol', 'rsi', 'bb_low', 'bb_mid', 'bb_high', 'atr', 'macd',
       'dollar_volume'],
      dtype='object', name='Price')

In [54]:
from sklearn.cluster import KMeans

# Check if the 'cluster' column exists before dropping it
if 'cluster' in data.columns:
    data = data.drop('cluster', axis=1)

def get_clusters(df):
    # Fit KMeans and assign clusters to a new column 'cluster'
    df['cluster'] = KMeans(n_clusters=4,
                           random_state=0,
                           init=initial_centroids).fit(df).labels_
    return df

# Apply KMeans clustering to your data after dropping NaN values and grouping by 'date'
data = data.dropna().groupby('date', group_keys=False).apply(get_clusters)


In [55]:
def plot_clusters(data):

    cluster_0 = data[data['cluster'] == 0]
    cluster_1 = data[data['cluster'] == 1]
    cluster_2 = data[data['cluster'] == 2]
    cluster_3 = data[data['cluster'] == 3]

    plt.scatter(cluster_0.iloc[:, 0], cluster_0.iloc[:, 6], color='red', label='cluster 0')
    plt.scatter(cluster_1.iloc[:, 0], cluster_1.iloc[:, 6], color='green', label='cluster 1')
    plt.scatter(cluster_2.iloc[:, 0], cluster_2.iloc[:, 6], color='blue', label='cluster 2')
    plt.scatter(cluster_3.iloc[:, 0], cluster_3.iloc[:, 6], color='black', label='cluster 3')
    
    plt.legend()
    plt.show()
    return



In [56]:
plt.style.use('ggplot')

for i in data.index.get_level_values('date').unique().tolist():
    
    g = data.xs(i, level=0)
    
    plt.title(f'Date {i}')
    
    plot_clusters(g)


### Apply pre-defined centroids.

In [57]:
initial_centroids = np.zeros((4, data.shape[1]))

target_rsi_values = [30, 45, 55, 70]

initial_centroids[:, 6] = target_rsi_values

initial_centroids


array([[ 0.,  0.,  0.,  0.,  0.,  0., 30.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0., 45.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0., 55.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0., 70.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.]])

## 7. For each month select assets based on the cluster and form a portfolio based on Efficient Frontier max sharpe ratio optimization

* First we will filter only stocks corresponding to the cluster we choose based on our hypothesis.

* Momentum is persistent and my idea would be that stocks clustered around RSI 70 centroid should continue to outperform in the following month - thus I would select stocks corresponding to cluster 3.


In [58]:
filtered_df = data[data['cluster'] == 3].copy()

filtered_df = filtered_df.reset_index(level=1)

filtered_df.index = filtered_df.index + pd.DateOffset(1)

filtered_df = filtered_df.reset_index().set_index(['date', 'ticker'])

dates = filtered_df.index.get_level_values('date').unique().tolist()

fixed_dates = {}

for d in dates:
    
    fixed_dates[d.strftime('%Y-%m-%d')] = filtered_df.xs(d, level=0).index.tolist()
    
fixed_dates


KeyError: 'cluster'

### Define portfolio optimization function

* We will define a function which optimizes portfolio weights using PyPortfolioOpt package and EfficientFrontier optimizer to maximize the sharpe ratio.

* To optimize the weights of a given portfolio we would need to supply last 1 year prices to the function.

* Apply signle stock weight bounds constraint for diversification (minimum half of equaly weight and maximum 10% of portfolio).

In [59]:
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns

def optimize_weights(prices, lower_bound=0):
    
    returns = expected_returns.mean_historical_return(prices=prices,
                                                      frequency=252)
    
    cov = risk_models.sample_cov(prices=prices,
                                 frequency=252)
    
    ef = EfficientFrontier(expected_returns=returns,
                           cov_matrix=cov,
                           weight_bounds=(lower_bound, .1),
                           solver='SCS')
    
    weights = ef.max_sharpe()
    
    return ef.clean_weights()



* Download Fresh Daily Prices Data only for short listed stocks.

In [60]:
print(data.columns)

# Check the current index names
print(data.index.names)


Index(['date', 'atr', 'bb_high', 'bb_low', 'bb_mid', 'garman_klass_vol',
       'macd', 'rsi', 'return_1m', 'return_2m', 'return_3m', 'return_6m',
       'return_9m', 'return_12m', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA'],
      dtype='object')
['date']


In [63]:


# Check if 'date' is already a column in the DataFrame
if 'date' in data.columns:
    # If it exists, you can drop it or rename it
    data = data.drop(columns=['date'], errors='ignore')

# Reset the index without adding the current index as a column
data.reset_index(inplace=True, drop=False)  # If you want to keep the index, set drop=False

# Now set 'date' and 'ticker' as the new index
data.set_index(['date', 'ticker'], inplace=True)


# Proceed with downloading stock data
stocks = data.index.get_level_values('ticker').unique().tolist()

new_df = yf.download(tickers=stocks,
                     start=data.index.get_level_values('date').unique()[0] - pd.DateOffset(months=12),
                     end=data.index.get_level_values('date').unique()[-1])

print(new_df)


KeyError: "None of ['ticker'] are in the columns"

* Calculate daily returns for each stock which could land up in our portfolio.

* Then loop over each month start, select the stocks for the month and calculate their weights for the next month.

* If the maximum sharpe ratio optimization fails for a given month, apply equally-weighted weights.

* Calculated each day portfolio return.

In [64]:
returns_dataframe = np.log(new_df['Adj Close']).diff()

portfolio_df = pd.DataFrame()

for start_date in fixed_dates.keys():
    
    try:

        end_date = (pd.to_datetime(start_date)+pd.offsets.MonthEnd(0)).strftime('%Y-%m-%d')

        cols = fixed_dates[start_date]

        optimization_start_date = (pd.to_datetime(start_date)-pd.DateOffset(months=12)).strftime('%Y-%m-%d')

        optimization_end_date = (pd.to_datetime(start_date)-pd.DateOffset(days=1)).strftime('%Y-%m-%d')
        
        optimization_df = new_df[optimization_start_date:optimization_end_date]['Adj Close'][cols]
        
        success = False
        try:
            weights = optimize_weights(prices=optimization_df,
                                   lower_bound=round(1/(len(optimization_df.columns)*2),3))

            weights = pd.DataFrame(weights, index=pd.Series(0))
            
            success = True
        except:
            print(f'Max Sharpe Optimization failed for {start_date}, Continuing with Equal-Weights')
        
        if success==False:
            weights = pd.DataFrame([1/len(optimization_df.columns) for i in range(len(optimization_df.columns))],
                                     index=optimization_df.columns.tolist(),
                                     columns=pd.Series(0)).T
        
        temp_df = returns_dataframe[start_date:end_date]

        temp_df = temp_df.stack().to_frame('return').reset_index(level=0)\
                   .merge(weights.stack().to_frame('weight').reset_index(level=0, drop=True),
                          left_index=True,
                          right_index=True)\
                   .reset_index().set_index(['Date', 'index']).unstack().stack()

        temp_df.index.names = ['date', 'ticker']

        temp_df['weighted_return'] = temp_df['return']*temp_df['weight']

        temp_df = temp_df.groupby(level=0)['weighted_return'].sum().to_frame('Strategy Return')

        portfolio_df = pd.concat([portfolio_df, temp_df], axis=0)
    
    except Exception as e:
        print(e)

portfolio_df = portfolio_df.drop_duplicates()

portfolio_df

NameError: name 'new_df' is not defined

 ## 8. Visualize Portfolio returns and compare to SP500 returns.

In [None]:
spy = yf.download(tickers='SPY',
                  start='2015-01-01',
                  end=dt.date.today())

spy_ret = np.log(spy[['Adj Close']]).diff().dropna().rename({'Adj Close':'SPY Buy&Hold'}, axis=1)

portfolio_df = portfolio_df.merge(spy_ret,
                                  left_index=True,
                                  right_index=True)

portfolio_df

In [None]:
import matplotlib.ticker as mtick

plt.style.use('ggplot')

portfolio_cumulative_return = np.exp(np.log1p(portfolio_df).cumsum())-1

portfolio_cumulative_return[:'2023-09-29'].plot(figsize=(16,6))

plt.title('Unsupervised Learning Trading Strategy Returns Over Time')

plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1))

plt.ylabel('Return')

plt.show()


# Twitter Sentiment Investing Strategy

## 1. Load Twitter Sentiment Data

* Load the twitter sentiment dataset, set the index, calculat engagement ratio and filter out stocks with no significant twitter activity.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import yfinance as yf
import os
plt.style.use('ggplot')

data_folder = '/Users/jacobcherian/projects/aies ml algorithm trading/algo_trading/'

sentiment_df = pd.read_csv(os.path.join(data_folder, 'sentiment_data.csv'))

sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])

sentiment_df = sentiment_df.set_index(['date', 'symbol'])

sentiment_df['engagement_ratio'] = sentiment_df['twitterComments']/sentiment_df['twitterLikes']

sentiment_df = sentiment_df[(sentiment_df['twitterLikes']>20)&(sentiment_df['twitterComments']>10)]

sentiment_df

## 2. Aggregate Monthly and calculate average sentiment for the month

* Aggregate on a monthly level and calculate average monthly metric, for the one we choose.

In [None]:
aggragated_df = (sentiment_df.reset_index('symbol').groupby([pd.Grouper(freq='M'), 'symbol'])
                    [['engagement_ratio']].mean())

aggragated_df['rank'] = (aggragated_df.groupby(level=0)['engagement_ratio']
                         .transform(lambda x: x.rank(ascending=False)))

aggragated_df

## 3. Select Top 5 Stocks based on their cross-sectional ranking for each month

* Select top 5 stocks by rank for each month and fix the date to start at beginning of next month.

In [None]:
filtered_df = aggragated_df[aggragated_df['rank']<6].copy()

filtered_df = filtered_df.reset_index(level=1)

filtered_df.index = filtered_df.index+pd.DateOffset(1)

filtered_df = filtered_df.reset_index().set_index(['date', 'symbol'])

filtered_df.head(20)

## 4. Extract the stocks to form portfolios with at the start of each new month

* Create a dictionary containing start of month and corresponded selected stocks.

In [None]:
dates = filtered_df.index.get_level_values('date').unique().tolist()

fixed_dates = {}

for d in dates:
    
    fixed_dates[d.strftime('%Y-%m-%d')] = filtered_df.xs(d, level=0).index.tolist()
    
fixed_dates

## 5. Download fresh stock prices for only selected/shortlisted stocks

In [None]:
stocks_list = sentiment_df.index.get_level_values('symbol').unique().tolist()

prices_df = yf.download(tickers=stocks_list,
                        start='2021-01-01',
                        end='2023-03-01')

## 6. Calculate Portfolio Returns with monthly rebalancing


In [None]:
returns_df = np.log(prices_df['Adj Close']).diff().dropna()

portfolio_df = pd.DataFrame()

for start_date in fixed_dates.keys():
    
    end_date = (pd.to_datetime(start_date)+pd.offsets.MonthEnd()).strftime('%Y-%m-%d')
    
    cols = fixed_dates[start_date]
    
    temp_df = returns_df[start_date:end_date][cols].mean(axis=1).to_frame('portfolio_return')
    
    portfolio_df = pd.concat([portfolio_df, temp_df], axis=0)
    
portfolio_df

## 7. Download NASDAQ/QQQ prices and calculate returns to compare to our strategy

In [None]:
qqq_df = yf.download(tickers='QQQ',
                     start='2021-01-01',
                     end='2023-03-01')

qqq_ret = np.log(qqq_df['Adj Close']).diff().to_frame('nasdaq_return')

portfolio_df = portfolio_df.merge(qqq_ret,
                                  left_index=True,
                                  right_index=True)

portfolio_df

In [None]:
portfolios_cumulative_return = np.exp(np.log1p(portfolio_df).cumsum()).sub(1)

portfolios_cumulative_return.plot(figsize=(16,6))

plt.title('Twitter Engagement Ratio Strategy Return Over Time')

plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1))

plt.ylabel('Return')

plt.show()

# 3 Intraday Strategy Using GARCH Model


* Using simulated daily data and intraday 5-min data.
* Load Daily and 5-minute data.
* Define function to fit GARCH model on the daily data and predict 1-day ahead volatility in a rolling window.
* Calculate prediction premium and form a daily signal from it.
* Merge with intraday data and calculate intraday indicators to form the intraday signal.
* Generate the position entry and hold until the end of the day.
* Calculate final strategy returns.

## 1. Load Simulated Daily and Simulated 5-minute data.

* We are loading both datasets, set the indexes and calculate daily log returns.

In [None]:
import matplotlib.pyplot as plt
from arch import arch_model
import pandas_ta
import pandas as pd
import numpy as np
import os

data_folder = '/Users/jacobcherian/projects/aies ml algorithm trading/algo_trading'

daily_df = pd.read_csv(os.path.join(data_folder, 'simulated_daily_data.csv'))

daily_df = daily_df.drop('Unnamed: 7', axis=1)

daily_df['Date'] = pd.to_datetime(daily_df['Date'])

daily_df = daily_df.set_index('Date')


intraday_5min_df = pd.read_csv(os.path.join(data_folder, 'simulated_5min_data.csv'))

intraday_5min_df = intraday_5min_df.drop('Unnamed: 6', axis=1)

intraday_5min_df['datetime'] = pd.to_datetime(intraday_5min_df['datetime'])

intraday_5min_df = intraday_5min_df.set_index('datetime')

intraday_5min_df['date'] = pd.to_datetime(intraday_5min_df.index.date)

intraday_5min_df

## 2. Define function to fit GARCH model and predict 1-day ahead volatility in a rolling window.

* We are first calculating the 6-month rolling variance and then we are creating a function in a 6-month rolling window to fit a garch model and predict the next day variance.

In [None]:
daily_df['log_ret'] = np.log(daily_df['Adj Close']).diff()

daily_df['variance'] = daily_df['log_ret'].rolling(180).var()

daily_df = daily_df['2020':]

def predict_volatility(x):
    
    best_model = arch_model(y=x,
                            p=1,
                            q=3).fit(update_freq=5,
                                     disp='off')
    
    variance_forecast = best_model.forecast(horizon=1).variance.iloc[-1,0]

    print(x.index[-1])
    
    return variance_forecast

daily_df['predictions'] = daily_df['log_ret'].rolling(180).apply(lambda x: predict_volatility(x))

daily_df = daily_df.dropna()

daily_df

## 3. Calculate prediction premium and form a daily signal from it.

* We are calculating the prediction premium. And calculate its 6-month rolling standard deviation.

* From this we are creating our daily signal.

In [None]:
daily_df['prediction_premium'] = (daily_df['predictions']-daily_df['variance'])/daily_df['variance']

daily_df['premium_std'] = daily_df['prediction_premium'].rolling(180).std()

daily_df['signal_daily'] = daily_df.apply(lambda x: 1 if (x['prediction_premium']>x['premium_std'])
                                         else (-1 if (x['prediction_premium']<x['premium_std']*-1) else np.nan),
                                         axis=1)

daily_df['signal_daily'] = daily_df['signal_daily'].shift()

daily_df

In [None]:
plt.style.use('ggplot')

daily_df['signal_daily'].plot(kind='hist')

plt.show()

## 4. Merge with intraday data and calculate intraday indicators to form the intraday signal.

* Calculate all intraday indicators and intraday signal.

In [None]:
final_df = intraday_5min_df.reset_index()\
                            .merge(daily_df[['signal_daily']].reset_index(),
                                   left_on='date',
                                   right_on='Date')\
                            .drop(['date','Date'], axis=1)\
                            .set_index('datetime')

final_df['rsi'] = pandas_ta.rsi(close=final_df['close'],
                                length=20)

final_df['lband'] = pandas_ta.bbands(close=final_df['close'],
                                     length=20).iloc[:,0]

final_df['uband'] = pandas_ta.bbands(close=final_df['close'],
                                     length=20).iloc[:,2]

final_df['signal_intraday'] = final_df.apply(lambda x: 1 if (x['rsi']>70)&
                                                            (x['close']>x['uband'])
                                             else (-1 if (x['rsi']<30)&
                                                         (x['close']<x['lband']) else np.nan),
                                             axis=1)

final_df['return'] = np.log(final_df['close']).diff()

final_df

## 5. Generate the position entry and hold until the end of the day.

In [None]:
final_df['return_sign'] = final_df.apply(lambda x: -1 if (x['signal_daily']==1)&(x['signal_intraday']==1)
                                        else (1 if (x['signal_daily']==-1)&(x['signal_intraday']==-1) else np.nan),
                                        axis=1)

final_df['return_sign'] = final_df.groupby(pd.Grouper(freq='D'))['return_sign']\
                                  .transform(lambda x: x.ffill())

final_df['forward_return'] = final_df['return'].shift(-1)

final_df['strategy_return'] = final_df['forward_return']*final_df['return_sign']

daily_return_df = final_df.groupby(pd.Grouper(freq='D'))['strategy_return'].sum()

## 6. Calculate final strategy returns.

In [None]:
import matplotlib.ticker as mtick

strategy_cumulative_return = np.exp(np.log1p(daily_return_df).cumsum()).sub(1)

strategy_cumulative_return.plot(figsize=(16,6))

plt.title('Intraday Strategy Returns')

plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1))

plt.ylabel('Return')

plt.show()
                                                                            