## Goal: to add Technical features to the choosen stocks  based on Clustering Analysis

In [53]:
import pickle
import ta
import pandas as pd
import os
import numpy as np


In [54]:
def add_features(df):

    windows = [6,18,24,30,50,100,200]

    all_tickers = []

    for ticker, df0 in df.groupby('ticker'):
        df0 = df0.copy()

        for w in windows:
            if len(df0) >= w:
                # RSI
                df0['RSI_' + str(w)] = ta.momentum.RSIIndicator(df0['Close'], window=w, fillna=True).rsi()


                # MACD
                for w2 in windows:
                    if w > w2:
                        # Will utilize macd_diff because that is more normalized
                        df0['MACD_f'+str(w2)+'_s'+str(w)] = ta.trend.MACD(df0['Close'], window_slow=w2, window_fast=w, fillna=True).macd_diff()

                # Bollinger Bands
                ## Stdev default=2, but can change it if desired
                # Currently returning high/low band indicators, but can add actual values if desired.
                bbands = ta.volatility.BollingerBands(df0['Close'], window=w, fillna=True)
                df0['BBands_' + str(w) + '_h_ind'] = bbands.bollinger_hband_indicator()
                df0['BBands_' + str(w) + '_l_ind'] = bbands.bollinger_lband_indicator()
                #actual values
                df0['BBands_' + str(w) + 'hband'] = bbands.bollinger_hband()
                df0['BBands_' + str(w) + 'lband'] = bbands.bollinger_lband()

                # Average True Range (ATR)
                df0['ATR_' + str(w)] = ta.volatility.AverageTrueRange(high=df0['High'],low=df0['Low'],close=df0['Close'], window=w, fillna=True).average_true_range()
                
                # Donchian Channel (DONCHIAN)
                d_channel = ta.volatility.DonchianChannel(high=df0['High'],low=df0['Low'],close=df0['Close'], window=w, fillna=True)
                df0['DONCHAIN_' + str(w) + 'hband'] = d_channel.donchian_channel_hband()
                df0['DONCHAIN_' + str(w) + 'lband'] = d_channel.donchian_channel_lband()
                
                # Keltner Channel (KELTNER)
                # Using SMA as centerline
                k_channel = ta.volatility.KeltnerChannel(high=df0['High'],low=df0['Low'],close=df0['Close'], window=w, 
                                                      original_version= True, fillna=True)
                df0['KELTNER_' + str(w) + '_h_ind'] = k_channel.keltner_channel_hband_indicator()
                df0['KELTNER_' + str(w) + '_l_ind'] = k_channel.keltner_channel_lband_indicator()
                
                # Stochastic Oscillator (SR/STOCH)
                df0['STOCH_' + str(w)] = ta.momentum.StochasticOscillator(high=df0['High'],low=df0['Low'],close=df0['Close'], window=w, fillna=True).stoch()

                # Chaikin Money Flow Indicator (CMF)
                df0['CMF_' + str(w)] = ta.volume.ChaikinMoneyFlowIndicator(high=df0['High'],low=df0['Low'],close=df0['Close'],volume=df0['Volume'], window=w, fillna=True).chaikin_money_flow()

                # Ichimoku Indicator (ICHI)
                for w2 in windows:
                    for w3 in windows:
                        if (w > w2) & (w2 > w3):
                            ichimoku = ta.trend.IchimokuIndicator(high=df0['High'],low=df0['Low'],window1=w3, window2=w2, window3=w, fillna=True)
                            df0['ICHI_conv_' + str(w3)+'_'+str(w2)+'_'+str(w)] = ichimoku.ichimoku_conversion_line()
                            df0['ICHI_base_' + str(w3)+'_'+str(w2)+'_'+str(w)] = ichimoku.ichimoku_base_line()
                            df0['ICHI_diff_' + str(w3)+'_'+str(w2)+'_'+str(w)] = df0['ICHI_conv_' + str(w3)+'_'+str(w2)+'_'+str(w)] - df0['ICHI_base_' + str(w3)+'_'+str(w2)+'_'+str(w)]


                # SMA
                df0['SMA_' + str(w)] = ta.trend.SMAIndicator(df0['Close'], window=w, fillna=True).sma_indicator()

                # SMA Crossover
                for w2 in windows:
                    if w > w2:
                        sma_s = ta.trend.SMAIndicator(df0['Close'], window=w, fillna=True).sma_indicator()
                        sma_f = ta.trend.SMAIndicator(df0['Close'], window=w2, fillna=True).sma_indicator()
                        df0['SMA_cross_f' + str(w2) + '_s' + str(w)] = sma_f - sma_s

                # EMA
                df0['EMA_' + str(w)] = ta.trend.EMAIndicator(df0['Close'], window=w, fillna=True).ema_indicator()

                # EMA Crossover
                for w2 in windows:
                    if w > w2:
                        ema_s = ta.trend.EMAIndicator(df0['Close'], window=w, fillna=True).ema_indicator()
                        ema_f = ta.trend.EMAIndicator(df0['Close'], window=w2, fillna=True).ema_indicator()
                        df0['SMA_cross_f' + str(w2) + '_s' + str(w)] = ema_f - ema_s


            ## WINDOW NOT REQUIRED
            # On Balance Volume Indicator (OBV)
            df0['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=df0['Close'],volume=df0['Volume'], fillna=True).on_balance_volume()

            # Volume-Price Trend (VPT)
            df0['VPT'] = ta.volume.VolumePriceTrendIndicator(close=df0['Close'],volume=df0['Volume'], fillna=True).volume_price_trend()

            # Accumulation/Distribution Index Indicator (ADI)
            df0['ADI'] = ta.volume.AccDistIndexIndicator(high=df0['High'],low=df0['Low'],close=df0['Close'],volume=df0['Volume'], fillna=True).acc_dist_index()

        # Getting daily returns (pct and log) for 1,2,3 days
        return_days = [1,2,3,7,14]
        for day in return_days:
            df0[f'{day}_day_return'] = (df0['Close'] / df0['Close'].shift(day)) - 1
            df0[f'{day}_day_log_return'] = (np.log(df0['Close']) - np.log(df0['Close'].shift(day)) )* 100
        for day in return_days:
            df0[f'{day}_day_return'].fillna(0, inplace=True)
            df0[f'{day}_day_log_return'].fillna(0, inplace=True)

        all_tickers.append(df0)

    final = pd.concat(all_tickers)
    final = final.sort_values(by=['reportperiod','ticker'])

    return final



In [55]:
# ALL RAW DATA
print('Reading Raw Data...')
df = pd.read_pickle('../data/raw_data.pkl')

Reading Raw Data...


In [56]:
df.head()

Unnamed: 0,ticker,dimension,calendardate,datekey,reportperiod,lastupdated,accoci,assets,assetsavg,assetsc,...,taxexp,taxliabilities,tbvps,workingcapital,Open,High,Low,Close,Adj Close,Volume
57078,NKE,MRY,2011-12-31,2011-05-31,2011-05-31,2022-01-06,95000000.0,14998000000.0,14497750000.0,11297000000.0,...,690000000.0,1038000000.0,7.543,7339000000.0,21.209999,21.237499,20.955,21.112499,18.571474,8879200.0
57077,NKE,MRY,2011-12-31,2011-05-31,2011-06-01,2022-01-06,95000000.0,14998000000.0,14497750000.0,11297000000.0,...,690000000.0,1038000000.0,7.543,7339000000.0,21.084999,21.18,20.52,20.5375,18.065693,10124800.0
57076,NKE,MRY,2011-12-31,2011-05-31,2011-06-02,2022-01-06,95000000.0,14998000000.0,14497750000.0,11297000000.0,...,690000000.0,1038000000.0,7.543,7339000000.0,20.467501,20.647499,20.264999,20.2875,17.91338,11829200.0
57075,NKE,MRY,2011-12-31,2011-05-31,2011-06-03,2022-01-06,95000000.0,14998000000.0,14497750000.0,11297000000.0,...,690000000.0,1038000000.0,7.543,7339000000.0,20.127501,20.3375,20.055,20.09,17.738981,11259200.0
57074,NKE,MRY,2011-12-31,2011-05-31,2011-06-06,2022-01-06,95000000.0,14998000000.0,14497750000.0,11297000000.0,...,690000000.0,1038000000.0,7.543,7339000000.0,20.092501,20.237499,19.934999,19.942499,17.608746,9092800.0


In [57]:
df.ticker.unique() 

array(['NKE', nan, 'MSFT', 'PG', 'CSCO', 'AAPL', 'V', 'DIS', 'AXP', 'BA',
       'CAT', 'CVX', 'DD', 'GE', 'GS', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO',
       'MCD', 'MMM', 'MRK', 'PFE', 'TRV', 'TSLA', 'UNH', 'VZ', 'XOM',
       'HD', 'WMT'], dtype=object)

In [58]:
 # Tickers chosen from clustering/sector selections
tickers = ['VZ', 'TSLA', 'INTC', 'CAT', 'JNJ', 'PFE', 'AAPL', 'MSFT']
df = df[df['ticker'].isin(tickers)]

In [59]:
 # Add technical features
print('Adding Technical Features...')
df = add_features(df)

Adding Technical Features...


  df0['STOCH_' + str(w)] = ta.momentum.StochasticOscillator(high=df0['High'],low=df0['Low'],close=df0['Close'], window=w, fillna=True).stoch()
  df0['CMF_' + str(w)] = ta.volume.ChaikinMoneyFlowIndicator(high=df0['High'],low=df0['Low'],close=df0['Close'],volume=df0['Volume'], window=w, fillna=True).chaikin_money_flow()
  df0['ICHI_conv_' + str(w3)+'_'+str(w2)+'_'+str(w)] = ichimoku.ichimoku_conversion_line()
  df0['ICHI_base_' + str(w3)+'_'+str(w2)+'_'+str(w)] = ichimoku.ichimoku_base_line()
  df0['ICHI_diff_' + str(w3)+'_'+str(w2)+'_'+str(w)] = df0['ICHI_conv_' + str(w3)+'_'+str(w2)+'_'+str(w)] - df0['ICHI_base_' + str(w3)+'_'+str(w2)+'_'+str(w)]
  df0['SMA_' + str(w)] = ta.trend.SMAIndicator(df0['Close'], window=w, fillna=True).sma_indicator()
  df0['SMA_cross_f' + str(w2) + '_s' + str(w)] = sma_f - sma_s
  df0['EMA_' + str(w)] = ta.trend.EMAIndicator(df0['Close'], window=w, fillna=True).ema_indicator()
  df0['RSI_' + str(w)] = ta.momentum.RSIIndicator(df0['Close'], window=w, fillna=

In [60]:
df.iloc[:,113:].head()

Unnamed: 0,Low,Close,Adj Close,Volume,RSI_6,BBands_6_h_ind,BBands_6_l_ind,BBands_6hband,BBands_6lband,ATR_6,...,1_day_return,1_day_log_return,2_day_return,2_day_log_return,3_day_return,3_day_log_return,7_day_return,7_day_log_return,14_day_return,14_day_log_return
54356,25.66,26.0,20.713482,52535400.0,100.0,0.0,0.0,26.0,26.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54355,25.84,26.02,20.729418,52906200.0,100.0,0.0,0.0,26.030001,25.99,0.0,...,0.000769,0.076895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54354,25.9,26.030001,20.737379,37805300.0,100.0,0.0,0.0,26.041612,25.991722,0.0,...,0.000384,0.038425,0.001154,0.115321,0.0,0.0,0.0,0.0,0.0,0.0
54353,25.959999,26.33,20.976383,48744200.0,100.0,0.0,0.0,26.367213,25.822787,0.0,...,0.011525,1.145923,0.011914,1.184348,0.012692,1.261243,0.0,0.0,0.0,0.0
54352,26.360001,26.77,21.326921,51946500.0,100.0,0.0,0.0,26.822352,25.637649,0.0,...,0.016711,1.65729,0.028429,2.803213,0.028824,2.841638,0.0,0.0,0.0,0.0


In [61]:
for ticker in df['ticker'].unique():
    df[df['ticker']==ticker].to_csv(f'../data/ticker_data/{ticker}_full_data.csv')
    print(f'{ticker} added to data')
print('Done')

MSFT added to data
AAPL added to data
CAT added to data
INTC added to data
JNJ added to data
PFE added to data
TSLA added to data
VZ added to data
Done
