In [1]:
import os
from os.path import isfile, join
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics
from statistics import mean, mode, median, stdev
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler

In [2]:
# API credentials. In the visible notebook they are only referenced by the
# disabled download cell, so a missing environment variable must not stop the
# processing cells from running: each key is None when its variable is unset.
tiingo_api_key = os.environ.get('TIINGO_API_KEY')
iex_api_key = os.environ.get('IEX_API_KEY')

In [3]:
# File containing all tickers listed by NASDAQ-100 (read one ticker per line)
tickers_file = 'ndxt_tickers.txt'
data_dir = 'data/'
raw_data_dir = data_dir + 'raw/'
processed_data_dir = data_dir + 'processed/'
final_data_dir = data_dir + 'final/'
# Label horizons, counted in rows; one output folder is created per horizon
time_range = [1, 5, 10, 20, 90, 270]

# exist_ok=True makes the creation idempotent and removes the
# check-then-create gap of the os.path.exists() pattern.
for directory in [raw_data_dir, processed_data_dir, final_data_dir]:
    os.makedirs(directory, exist_ok=True)
for t in time_range:
    os.makedirs(final_data_dir + str(t) + '/', exist_ok=True)

In [4]:
# Read all the tickers to be used in the data (one ticker per line).
# strip() also removes '\r' and stray spaces, and blank lines are skipped so
# they never become an empty ticker (which would later be opened as '.csv').
with open(data_dir + tickers_file) as f:
    ndxt_tickers = [line.strip() for line in f if line.strip()]

In [5]:
# Code for downloading data and saving it, use only when necessary.
# The whole cell body is wrapped in a triple-quoted string, so nothing in it is
# executed; the trailing semicolon keeps the notebook from echoing the string
# as the cell output. Remove the surrounding quotes to run the download.
# NOTE(review): the disabled code swallows every download error with a bare
# `except:` and only records the ticker in error_tickers; consider narrowing
# the exception type before re-enabling it.
'''
raw_stock_data_tiingo = []
raw_stock_data_iex = []
error_tickers = []

for ticker in sorted(ndxt_tickers):
    try:
        raw_stock_data_tiingo.append(pdr.get_data_tiingo(ticker, api_key= tiingo_api_key))
    except:
        error_tickers.append(ticker)

raw_index_data_yahoo = yf.download('^NDXT', period='5y')
# Save each stock data in a CSV file
for t in raw_stock_data_tiingo:
    t.to_csv(raw_data_dir + t.index.values[0][0] + '.csv')
raw_index_data_yahoo.to_csv(raw_data_dir + '^NDXT.csv')
''';

In [6]:
# Load the previously downloaded CSVs: one frame for the index, one per ticker
raw_index_data_filename = '^NDXT.csv'
raw_index_df = pd.read_csv(raw_data_dir + raw_index_data_filename)

raw_stock_data_filenames = [symbol + '.csv' for symbol in ndxt_tickers]
raw_stock_data = [
    pd.read_csv(raw_data_dir + csv_name)
    for csv_name in raw_stock_data_filenames
]

In [7]:
# Check for incomplete(dates) stocks and remove them.
# The expected record count is the most common length among all frames
# (the index frame included).
stock_record_count = mode([len(data) for data in raw_stock_data] + [len(raw_index_df)])

# Build a filtered list instead of popping while iterating: pop(i) inside an
# enumerate() loop shifts the remaining items, so the element that follows
# each removed one would never be checked.
complete_stock_data = []
for t in raw_stock_data:
    if len(t) == stock_record_count:
        complete_stock_data.append(t)
    else:
        print('Element removed.')
raw_stock_data = complete_stock_data

In [8]:
# Report every stock frame that contains at least one null cell
for frame in raw_stock_data:
    has_missing = frame.isnull().any().any()
    if has_missing:
        print('Missing data.')

In [9]:
# Check that all stock data have the same dates.
# Equality is transitive, so comparing every frame against the first one is
# enough: a single linear pass instead of the all-pairs scan.
# np.array_equal also returns False (instead of raising) when two frames have
# a different number of rows, so a mismatch always ends in the assert below.
equal = True
if raw_stock_data:
    reference_dates = raw_stock_data[0]['date'].to_numpy()
    equal = all(
        np.array_equal(frame['date'].to_numpy(), reference_dates)
        for frame in raw_stock_data[1:]
    )
print('Data has equal dates in all rows: ' + str(equal))
assert equal

Data has equal dates in all rows: True


In [10]:
def _leading_token(stamp):
    """Return the first whitespace-separated token of ``stamp``."""
    return stamp.split()[0]


# Reformat date in stocks dataframes: keep only the first token of each value
# (presumably this drops a time-of-day suffix; verify against the raw CSVs)
for stock_frame in raw_stock_data:
    stock_frame['date'] = stock_frame['date'].map(_leading_token)

# Reformat date in index dataframe (disabled, kept for reference)
#mon_to_num = {'Jan':'01', 'Feb':'02', 'Mar':'03', 'Apr':'04', 'May':'05', 'Jun':'06', 'Jul':'07', 'Aug':'08', 'Sep':'09', 'Oct':'10', 'Nov':'11', 'Dec':'12'}

#raw_index_df['Date'] = raw_index_df['Date'].map(lambda x: x.split()[2]+'-'+ mon_to_num[x.split()[0]] +'-'+x.split()[1].strip(','))

# The index frame loses its Volume column before being renamed
raw_index_df.drop('Volume', axis=1, inplace=True)

# Rename index columns (assigned by position, so the order below must match
# the remaining columns of the index CSV)
index_column_names = ['date', 'open', 'high', 'low', 'close', 'adjClose']
raw_index_df.columns = index_column_names

In [11]:
# Key every stock record by (symbol, date) and remove the dividend/split
# columns, which are not used as features
for frame in raw_stock_data:
    frame.set_index(['symbol', 'date'], inplace=True, drop=True)
    frame.drop(columns=['divCash', 'splitFactor'], inplace=True)

# Index records are keyed by date alone
raw_index_df.set_index(['date'], inplace=True, drop=True)

# Concatenate all stock dataframes into one (disabled)
#raw_stock_df = pd.concat(raw_stock_data)

In [12]:
# Find the oldest final date and newest starting date

# Seed both lists with the first and last rows of the index frame
first_dates = [raw_index_df.index[0]]
last_dates = [raw_index_df.index[-1]]

for df in raw_stock_data:
    # Second element of every (symbol, date) index key
    dates = [key[1] for key in df.index]
    first_dates.append(min(dates))
    last_dates.append(max(dates))

# The common window ends at the earliest "last" date and starts at the
# latest "first" date
first_date = max(first_dates)
last_date = min(last_dates)
print(last_date)
print(first_date)

2019-10-15
2014-10-16


In [13]:
# Make sure both DataFrames cover the same date window [first_date, last_date].
# Rows are selected with a boolean mask: DataFrame.pop() removes a *column* by
# label, so it cannot be used to discard rows.
index_dates = raw_index_df.index
raw_index_df = raw_index_df.loc[(index_dates >= first_date) & (index_dates <= last_date)]

trimmed_stock_data = []
for df in raw_stock_data:
    # Level 1 of the (symbol, date) index holds the date
    stock_dates = df.index.get_level_values(1)
    in_window = (stock_dates >= first_date) & (stock_dates <= last_date)
    # Reverse stock records (newest first); sort_index returns a new frame
    trimmed_stock_data.append(df.loc[in_window].sort_index(ascending=False))
raw_stock_data = trimmed_stock_data

# Reverse index records (newest first). Sorting instead of flipping with
# iloc[::-1] gives the same order however many times the cell is re-run.
raw_index_df = raw_index_df.sort_index(ascending=False)

In [14]:
# DataFrames have been processed and not considered raw anymore.
# Work on copies so the feature/label columns added later do not also appear
# in the objects still reachable through the raw_* names, and so re-running
# this cell resets the processed frames to their pre-feature state.
stocks_df = [df.copy() for df in raw_stock_data]
index_df = raw_index_df.copy()

In [15]:
def labels(stock_df, since = 1):
    """Insert a binary direction label column ``y_<since>`` at position 0.

    For row ``i`` the label is 1 when the close of row ``i - since`` is
    strictly greater than the close of row ``i``, otherwise 0. With rows
    ordered newest-first (as this notebook sorts them) row ``i - since`` is
    the record ``since`` rows later in time. Rows whose reference row falls
    outside the frame get ``None``.

    An existing ``y_<since>`` column is replaced, so the call can be repeated.
    The frame is modified in place and nothing is returned.

    Raises:
        KeyError: if ``stock_df`` has no ``close`` column.
    """
    column = 'y_' + str(since)
    stock_df.drop(columns=column, inplace=True, errors='ignore')

    # Read the column once instead of building a row Series per lookup
    closes = stock_df['close'].to_numpy(dtype=float)
    n_rows = len(closes)

    values = []
    for i in range(n_rows):
        j = i - since
        # Explicit bounds check: no assert (stripped under -O) and no
        # catch-all except that would also hide unrelated errors
        if 0 <= j < n_rows:
            values.append(1 if closes[j] > closes[i] else 0)
        else:
            values.append(None)
    stock_df.insert(loc=0, column=column, value=values)

In [16]:
def change(stock_df, period = 1):
    """Insert a ``change`` column (percentage change of close) at position 0.

    For row ``i`` the value is
    ``100 * (close[i] - close[i + period]) / close[i + period]``.
    With rows ordered newest-first (as this notebook sorts them) row
    ``i + period`` is the record ``period`` rows earlier in time. Rows whose
    reference row falls outside the frame get ``None``.

    An existing ``change`` column is replaced. The frame is modified in place
    and nothing is returned.

    Raises:
        KeyError: if ``stock_df`` has no ``close`` column.
    """
    stock_df.drop(columns='change', inplace=True, errors='ignore')

    # Read the column once instead of building a row Series per lookup
    closes = stock_df['close'].to_numpy(dtype=float)
    n_rows = len(closes)

    values = []
    for i in range(n_rows):
        j = i + period
        # Explicit bounds check replaces the catch-all except
        if 0 <= j < n_rows:
            values.append(100*(closes[i]-closes[j])/closes[j])
        else:
            values.append(None)
    stock_df.insert(loc=0, column='change', value=values)

In [17]:
def PMO(stock_df, period = 50):
    """Insert a ``PMO`` column (close-price difference) at position 0.

    For row ``i`` the value is ``close[i] - close[i + period]``. With rows
    ordered newest-first (as this notebook sorts them) row ``i + period`` is
    the record ``period`` rows earlier in time. Rows whose reference row falls
    outside the frame get ``None``.

    An existing ``PMO`` column is replaced. The frame is modified in place and
    nothing is returned.

    Raises:
        KeyError: if ``stock_df`` has no ``close`` column.
    """
    stock_df.drop(columns='PMO', inplace=True, errors='ignore')

    # Read the column once instead of building a row Series per lookup
    closes = stock_df['close'].to_numpy(dtype=float)
    n_rows = len(closes)

    values = []
    for i in range(n_rows):
        j = i + period
        # Explicit bounds check replaces the catch-all except
        if 0 <= j < n_rows:
            values.append(closes[i] - closes[j])
        else:
            values.append(None)
    stock_df.insert(loc=0, column='PMO', value=values)

In [18]:
def RSI(stock_df, period = 50):
    """Insert an ``RSI`` column at position 0, computed from ``change``.

    For row ``i`` the window is rows ``i .. i + period - 1`` of the ``change``
    column. Positive changes are gains, negative changes are losses (taken as
    absolute values); zero and NaN changes are ignored. The value is:

    * 100 when the window has no losses,
    * 0 when it has losses but no gains,
    * ``100 - 100 / (1 + avg_gain / avg_loss)`` otherwise.

    Rows whose window runs past the end of the frame get ``None``.

    NOTE(review): the averages divide by the number of gains/losses found,
    not by ``period``; confirm this is the intended RSI variant.

    An existing ``RSI`` column is replaced. The frame is modified in place and
    nothing is returned.

    Raises:
        KeyError: if ``stock_df`` has no ``change`` column (run ``change``
            on the frame first).
    """
    stock_df.drop(columns='RSI', inplace=True, errors='ignore')

    # Read the column once instead of building a row Series per lookup
    changes = stock_df['change'].to_numpy(dtype=float)
    n_rows = len(changes)

    values = []
    for i in range(n_rows):
        # Explicit bounds check replaces the catch-all except
        if i + period > n_rows:
            values.append(None)
            continue

        window = [changes[i + j] for j in range(period)]
        gains = [c for c in window if c > 0]
        losses = [abs(c) for c in window if c < 0]

        if not losses:
            rsi_value = 100
        elif not gains:
            rsi_value = 0
        else:
            avg_gain = sum(gains)/len(gains)
            avg_loss = sum(losses)/len(losses)
            rsi_value = 100 - (100/(1+(avg_gain/avg_loss)))
        values.append(rsi_value)
    stock_df.insert(loc=0, column='RSI', value=values)

In [19]:
def MFI(stock_df, period = 50):
    """Insert an ``MFI`` (money flow) column at position 0.

    The typical price of a row is ``(high + low + close) / 3`` and its money
    flow is ``typical price * volume``. For row ``i`` the window is rows
    ``i .. i + period - 1``. Each row in the window is compared with the row
    right after it (the previous record when rows are ordered newest-first, as
    this notebook sorts them): a higher typical price counts as positive flow,
    a lower one as negative flow, an equal one is ignored. The value is:

    * 100 when the window has no negative flow,
    * 0 when it has negative but no positive flow,
    * ``100 - 100 / (1 + avg_positive / avg_negative)`` otherwise.

    The earlier implementation seeded the reference price once (from row
    ``i + 1``) and never updated it, so every row of the window was compared
    with that single price; the reference now moves with the window.
    Because the oldest row of the window also needs a reference, rows with
    ``i + period`` outside the frame get ``None``.

    NOTE(review): the averages divide by the number of positive/negative
    flows found rather than summing the flows; confirm this is intended.

    An existing ``MFI`` column is replaced. The frame is modified in place and
    nothing is returned.

    Raises:
        KeyError: if ``high``, ``low``, ``close`` or ``volume`` is missing.
    """
    stock_df.drop(columns='MFI', inplace=True, errors='ignore')

    # Read each column once instead of building a row Series per lookup
    highs = stock_df['high'].to_numpy(dtype=float)
    lows = stock_df['low'].to_numpy(dtype=float)
    closes = stock_df['close'].to_numpy(dtype=float)
    volumes = stock_df['volume'].to_numpy(dtype=float)
    typical_prices = (highs + lows + closes) / 3
    n_rows = len(typical_prices)

    values = []
    for i in range(n_rows):
        # Row i + period is read as the reference of the oldest window row
        if i + period >= n_rows:
            values.append(None)
            continue

        positive_flows = []
        negative_flows = []
        for j in range(period):
            tp = typical_prices[i + j]
            previous_tp = typical_prices[i + j + 1]
            if tp > previous_tp:
                positive_flows.append(tp * volumes[i + j])
            elif tp < previous_tp:
                negative_flows.append(tp * volumes[i + j])

        if not negative_flows:
            mfi_value = 100
        elif not positive_flows:
            mfi_value = 0
        else:
            avg_positive = sum(positive_flows)/len(positive_flows)
            avg_negative = sum(negative_flows)/len(negative_flows)
            mfi_value = 100 - (100/(1+(avg_positive/avg_negative)))
        values.append(mfi_value)
    stock_df.insert(loc=0, column='MFI', value=values)

In [20]:
def EMA(stock_df, period=50):
    """Insert an ``EMA`` (exponential moving average of close) column at position 0.

    For row ``i`` the average is rebuilt from scratch over rows
    ``i + period - 1`` down to ``i`` (oldest to newest when rows are ordered
    newest-first, as this notebook sorts them) with smoothing factor
    ``2 / (period + 1)``. The recursion is seeded with the mean of the 3
    closes starting at row ``i + period - 1``. Rows for which any of those
    rows falls outside the frame get ``None``, as do all rows when
    ``period`` is 0.

    An existing ``EMA`` column is replaced. The frame is modified in place and
    nothing is returned.

    Raises:
        KeyError: if ``stock_df`` has no ``close`` column.
    """
    stock_df.drop(columns='EMA', inplace=True, errors='ignore')
    a = 2/(period + 1)
    # There are many ways to calculate the first term of an exponential moving
    # average; here it is the mean of 3 consecutive closes.
    initial_value_range = 3

    # Read the column once instead of building a row Series per lookup
    closes = stock_df['close'].to_numpy(dtype=float).tolist()
    n_rows = len(closes)
    # Offset from i of the furthest row a window touches (the last seed row)
    reach = period + initial_value_range - 2

    ema = []
    for i in range(n_rows):
        # Explicit bounds check replaces the catch-all except
        if period < 1 or i + reach >= n_rows:
            ema.append(None)
            continue

        oldest = i + period - 1
        current = mean(closes[oldest:oldest + initial_value_range])
        for j in reversed(range(period)):
            current = (a * closes[i + j]) + ((1 - a) * current)
        ema.append(current)
    stock_df.insert(loc=0, column='EMA', value=ema)

In [21]:
def SO(stock_df, period=50):
    """Insert an ``SO`` (stochastic oscillator) column at position 0.

    For row ``i`` the window is rows ``i .. i + period - 1``; with ``ll`` the
    lowest ``low`` and ``hh`` the highest ``high`` in the window, the value is
    ``(close[i] - ll) / (hh - ll) * 100``. Rows whose window runs past the end
    of the frame get ``None``, as do all rows when ``period`` is below 1.

    NOTE(review): a window where ``hh == ll`` divides by zero and yields a
    non-finite value rather than ``None``.

    An existing ``SO`` column is replaced. The frame is modified in place and
    nothing is returned.

    Raises:
        KeyError: if ``close``, ``low`` or ``high`` is missing.
    """
    stock_df.drop(columns='SO', inplace=True, errors='ignore')

    # Read each column once instead of building a row Series per lookup
    closes = stock_df['close'].to_numpy(dtype=float)
    lows = stock_df['low'].to_numpy(dtype=float)
    highs = stock_df['high'].to_numpy(dtype=float)
    n_rows = len(closes)

    so = []
    for i in range(n_rows):
        # Explicit bounds check replaces the catch-all except
        if period < 1 or i + period > n_rows:
            so.append(None)
            continue

        ll = min(lows[i:i + period])
        hh = max(highs[i:i + period])
        so.append(((closes[i] - ll) / (hh - ll)) * 100)

    stock_df.insert(loc=0, column='SO', value=so)

In [22]:
def _window_ema(closes, start, period, alpha, seed_len):
    """Return the EMA of ``closes[start : start + period]`` walked from the highest index down to ``start``, seeded with the mean of ``seed_len`` closes beginning at the highest index."""
    oldest = start + period - 1
    current = mean(closes[oldest:oldest + seed_len])
    for j in reversed(range(period)):
        current = (alpha * closes[start + j]) + ((1 - alpha) * current)
    return current


def MACD(stock_df, p1=12, p2=26):
    """Insert a ``MACD`` column at position 0.

    For row ``i`` the value is the ``p1``-row EMA of close minus the
    ``p2``-row EMA of close. Each EMA is rebuilt from scratch over its own
    window starting at row ``i`` with smoothing factor ``2 / (p + 1)`` and is
    seeded with the mean of the 3 closes that start at the window's last row.
    Rows for which either window (including its seed rows) runs past the end
    of the frame get ``None``, as do all rows when ``p1`` or ``p2`` is 0.

    An existing ``MACD`` column is replaced. The frame is modified in place
    and nothing is returned.

    Raises:
        KeyError: if ``stock_df`` has no ``close`` column.
    """
    stock_df.drop(columns='MACD', inplace=True, errors='ignore')

    a1 = 2/(p1 + 1)
    a2 = 2/(p2 + 1)
    initial_value_range = 3

    # Read the column once instead of building a row Series per lookup
    closes = stock_df['close'].to_numpy(dtype=float).tolist()
    n_rows = len(closes)
    # Offset from i of the furthest row either window touches
    reach = max(p1, p2) + initial_value_range - 2

    macd = []
    for i in range(n_rows):
        # Explicit bounds check replaces the catch-all except
        if min(p1, p2) < 1 or i + reach >= n_rows:
            macd.append(None)
            continue

        fast = _window_ema(closes, i, p1, a1, initial_value_range)
        slow = _window_ema(closes, i, p2, a2, initial_value_range)
        macd.append(fast - slow)

    stock_df.insert(loc=0, column='MACD', value=macd)

In [23]:
# Calculate features for index data, MFI is not calculated as it requires volume.
# change() runs first because RSI() reads the 'change' column it creates.

change(index_df)
MACD(index_df)
SO(index_df, 10)
EMA(index_df, 10)
RSI(index_df, 10)
PMO(index_df, 10)
# The pd.np alias is deprecated/removed in newer pandas releases; use numpy
# directly (imported as np at the top of the notebook).
index_df.fillna(value=np.nan, inplace=True)

1258

In [None]:
# Calculate features and labels for stock data, this takes a lot of time.
# change() runs first because RSI() reads the 'change' column it creates.
for i, df in enumerate(stocks_df, start=1):
    change(df)
    MACD(df)
    SO(df, 10)
    EMA(df, 10)
    MFI(df, 10)
    RSI(df, 10)
    PMO(df, 10)
    # One label column per horizon; time_range is the same list that names
    # the output folders, so the two can no longer drift apart
    for m in time_range:
        labels(df, m)
    # The pd.np alias is deprecated/removed in newer pandas releases
    df.fillna(value=np.nan, inplace=True)
    # Progress is measured against the frames actually being processed
    print(f'{round(i*100/len(stocks_df))}% ', end='')

#for df in stocks_df:
#    df.dropna(inplace=True)

stocks_df[-1].head()

3% 5% 

In [None]:
# Display the last rows of the first stock frame as a quick visual check
stocks_df[0].tail()

In [None]:
# Save the processed data as a milestone

index_csv_path = processed_data_dir + '^NDXT.csv'
index_df.to_csv(index_csv_path)

for stock_frame in stocks_df:
    # First element of the first (symbol, date) index key names the file
    symbol = stock_frame.index[0][0]
    stock_frame.to_csv(processed_data_dir + f'{symbol}.csv')

In [None]:
# Normalizing features
# NOTE(review): the scaler is fitted on each complete frame before the
# train/test split made later in the notebook; confirm that is acceptable.
scaler = MinMaxScaler()

minmax_columns = ['PMO', 'EMA', 'MACD']

# Index frame: min-max scale three columns, divide the others by 100
index_df[minmax_columns] = scaler.fit_transform(index_df[minmax_columns])
index_df[['RSI', 'SO']] = index_df[['RSI', 'SO']]/100

# Stock frames: same treatment, with the scaler refitted on every frame
for stock_frame in stocks_df:
    stock_frame[minmax_columns] = scaler.fit_transform(stock_frame[minmax_columns])
    stock_frame[['RSI' ,'MFI', 'SO']] = stock_frame[['RSI' ,'MFI', 'SO']]/100


In [None]:
# Scratch check: put the first stock's label/features next to the index
# features as plain arrays, i.e. joined by row position rather than by label
np.hstack([stocks_df[0][['y_'+str(1), 'PMO', 'EMA', 'MACD', 'RSI' ,'MFI', 'SO']].to_numpy(), index_df[['PMO', 'EMA', 'MACD', 'RSI', 'SO']].to_numpy()])

In [None]:
# Scratch check: the same side-by-side view built with pd.concat.
# NOTE(review): the stock frame is still indexed by (symbol, date) while the
# index frame gets a fresh integer index from reset_index(), so concat along
# axis=1 matches rows by label, not by position; confirm the rows line up.
pd.concat([stocks_df[0][['y_'+str(1), 'PMO', 'EMA', 'MACD', 'RSI' ,'MFI', 'SO']], index_df[['PMO', 'EMA', 'MACD', 'RSI', 'SO']].reset_index().drop(columns='date')], sort=False, axis=1)

In [None]:
# Unify all data into separate training/testing sets

# Row position at which each stock's frame is cut (first 2/3 -> train).
# NOTE(review): frames are ordered newest-first, so the training part holds
# the most recent records; confirm this is the intended direction.
test_train_separation = round(len(stocks_df[0])*2/3)

stock_feature_columns = ['PMO', 'EMA', 'MACD', 'RSI' ,'MFI', 'SO']
index_features = index_df[['PMO', 'EMA', 'MACD', 'RSI', 'SO']]

for t in time_range:
    train_parts = []
    test_parts = []

    for df in stocks_df:
        stock_part = df[['y_'+str(t)] + stock_feature_columns]
        # Stock rows are keyed by (symbol, date) and index rows by date only,
        # so look the index features up by each row's date and give them the
        # stock's own index before joining column-wise.
        index_part = index_features.reindex(stock_part.index.get_level_values(1))
        index_part.index = stock_part.index
        # NOTE(review): stock and index feature columns share names
        # (PMO, EMA, ...), so the CSV header contains duplicates.
        dataset = pd.concat([stock_part, index_part], axis=1)
        train_parts.append(dataset.iloc[:test_train_separation])
        test_parts.append(dataset.iloc[test_train_separation:])

    # Write all stocks once per horizon; writing inside the per-stock loop
    # would overwrite the same two files and keep only the last stock.
    pd.concat(train_parts).to_csv(final_data_dir+str(t)+'/train.csv')
    pd.concat(test_parts).to_csv(final_data_dir+str(t)+'/test.csv')

# To think about

## How to summarize the data into features that can be fed to a model
* Definitely by record
## Are labels assigned per row or per group of rows?
* Smaller groups -> more input data.
* These groups are n_1 and n_2 for stocks and index respectively.

## Categorical labels for low to high increase/decrease

## Should all features be normalized? How?
Yes, absolutely, using min-max normalization.