In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
import utilities


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [30]:
tickers = pd.read_csv('../data_cleaning/stocks/dense_95.csv')

# for local testing
tickers = tickers.head(10)

In [31]:
# start with the A ticker

ticker = "AAPL"
df = utilities.load_stock(ticker)

In [32]:
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,ticker,date,time,minute
0,2013-12-09 08:37:00,24.0,24.0,23.95,23.95,37500,AAL,2013-12-09,08:37:00,517
1,2013-12-09 08:38:00,23.75,23.75,23.75,23.75,1200,AAL,2013-12-09,08:38:00,518
2,2013-12-09 08:39:00,23.5,23.5,23.5,23.5,600,AAL,2013-12-09,08:39:00,519
3,2013-12-09 08:40:00,23.7,23.7,23.69,23.69,250,AAL,2013-12-09,08:40:00,520
4,2013-12-09 08:41:00,23.69,23.69,23.69,23.69,700,AAL,2013-12-09,08:41:00,521


In [33]:
df = utilities.RSI(df, 14)
df = utilities.RSI(df, 28)
df = utilities.RSI(df, 56)

df = utilities.MACD(df)

df = utilities.EMA(df, 10)
df = utilities.EMA(df, 50)
df = utilities.EMA(df, 200)

df = utilities.market_hours_only(df)

In [34]:
# Indicator normalizing

# normalize the minute column: 570-960 -> 0-1
df['minute'] = (df['minute'] - 570) / 390

# normalize the date: earliest date is 0, latest date is 1
df['date'] = (df['date'] - df['date'].min()) / (df['date'].max() - df['date'].min())

# divide all the RSI values by 100
df['rsi14'] /= 100
df['rsi28'] /= 100
df['rsi56'] /= 100

# divide all the ema values by 100
df['ema10'] /= 100
df['ema50'] /= 100
df['ema200'] /= 100

In [35]:
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,ticker,date,time,minute,rsi14,rsi28,rsi56,macd,signal,histogram,ema10,ema50,ema200
50,2013-12-09 09:30:00,23.9,24.2,23.8,24.03,436082,AAL,0.0,09:30:00,0.0,0.728571,0.686957,,0.0629,0.0345,0.0284,0.237829,0.236607,
51,2013-12-09 09:31:00,24.02,24.2,24.01,24.05,216105,AAL,0.0,09:31:00,0.002564,0.736111,0.692308,,0.0804,0.0437,0.0367,0.238315,0.236781,
52,2013-12-09 09:32:00,24.05,24.17,24.04,24.105,145086,AAL,0.0,09:32:00,0.005128,0.754839,0.73617,,0.0975,0.0545,0.043,0.238812,0.236971,
53,2013-12-09 09:33:00,24.148,24.15,24.08,24.105,129656,AAL,0.0,09:33:00,0.007692,0.75974,0.711628,,0.1098,0.0655,0.0443,0.239219,0.237152,
54,2013-12-09 09:34:00,24.11,24.19,24.07,24.17,120432,AAL,0.0,09:34:00,0.010256,0.783133,0.747748,,0.1233,0.0771,0.0462,0.23967,0.237353,


In [36]:
# Price normalizing

days = utilities.list_of_day_dfs(df)
days = [group.copy() for _, group in df.groupby(df['timestamp'].dt.date)]

# for each day: sort the prices, then normalize them to 0-1 where 0 is the lowest price and 1 is the highest price for that day
for day in days:
    for col in ['high', 'low', 'volume', 'macd', 'signal', 'histogram', 'rsi14', 'rsi28', 'rsi56', 'ema10', 'ema50', 'ema200']:
        day[col] = (day[col] - day[col].min()) / (day[col].max() - day[col].min())

# recombine the days
df = pd.concat(days)
df = df.drop(columns=['timestamp'])

df.head()

Unnamed: 0,open,high,low,close,volume,ticker,date,time,minute,rsi14,rsi28,rsi56,macd,signal,histogram,ema10,ema50,ema200
50,23.9,0.243902,0.19337,24.03,0.051935,AAL,0.0,09:30:00,0.0,0.721307,0.761397,,0.481953,0.375566,0.621545,0.0,0.0,
51,24.02,0.243902,0.309392,24.05,0.025568,AAL,0.0,09:31:00,0.002564,0.73015,0.770505,,0.535032,0.410256,0.680369,0.033242,0.011772,
52,24.05,0.22561,0.325967,24.105,0.017055,AAL,0.0,09:32:00,0.005128,0.752112,0.845163,,0.586897,0.45098,0.725018,0.067237,0.024626,
53,24.148,0.213415,0.348066,24.105,0.015205,AAL,0.0,09:33:00,0.007692,0.75786,0.80339,,0.624204,0.492459,0.734231,0.095075,0.036872,
54,24.11,0.237805,0.342541,24.17,0.0141,AAL,0.0,09:34:00,0.010256,0.785294,0.86487,,0.66515,0.536199,0.747697,0.125923,0.05047,


In [37]:
# Price normalizing needs to be the only metric that's forward looking

# Create a new column called 'enter' that represents whether or not the price will he higher or lower in 5 minutes
# 1 if the price will be higher, 0 if the price will be lower
df['enter'] = (df['close'].shift(-5) > df['close']).astype(int)
df = df.drop(columns=['open', 'close'])

In [38]:
df.describe()

Unnamed: 0,high,low,volume,date,minute,rsi14,rsi28,rsi56,macd,signal,histogram,ema10,ema50,ema200,enter
count,993687.0,993687.0,993687.0,993687.0,993687.0,993687.0,993687.0,993681.0,993687.0,993687.0,993687.0,993687.0,993687.0,993538.0,993687.0
mean,0.480716,0.519704,0.06384,0.500136,0.49917,0.499018,0.499005,0.499479,0.501636,0.500716,0.500845,0.497207,0.496483,0.502318,0.475612
std,0.253962,0.25352,0.096151,0.288694,0.289377,0.205642,0.210339,0.222962,0.232385,0.243813,0.177973,0.276767,0.309573,0.331105,0.499405
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.271429,0.315432,0.01466,0.25,0.248718,0.353479,0.348167,0.333109,0.328057,0.317266,0.388146,0.264342,0.217278,0.186856,0.0
50%,0.46988,0.525547,0.034541,0.500541,0.497436,0.498706,0.498902,0.499359,0.501294,0.5,0.501247,0.493284,0.491525,0.50223,0.0
75%,0.6875,0.73,0.074566,0.75027,0.748718,0.644829,0.649499,0.664859,0.676809,0.685055,0.613568,0.732256,0.778215,0.819466,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
df.to_csv(f'../cleaned_data/{ticker}.csv')