In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv('https://raw.githubusercontent.com/joshuasir/datasets/main/stock-market-train.csv')

df_test = pd.read_csv('https://raw.githubusercontent.com/joshuasir/datasets/main/stock-market-test.csv')

In [2]:
df.dropna(axis=0,inplace=True)
df_test.dropna(axis=0,inplace=True)

In [3]:
df['Month'] = pd.to_datetime(df['Date']).dt.month 
df['Year'] = pd.to_datetime(df['Date']).dt.year
df['Day'] = pd.to_datetime(df['Date']).dt.day

df_test['Month'] = pd.to_datetime(df_test['Date']).dt.month 
df_test['Year'] = pd.to_datetime(df_test['Date']).dt.year
df_test['Day'] = pd.to_datetime(df_test['Date']).dt.day

In [4]:
def calc_SMA(closing,lookback=50):
  SMA = []
  MA = [closing[:lookback]]
  for price in closing[lookback:]:
    MA.pop(0)
    MA.append(price)
    SMA.append(sum(MA)/len(MA))
  return pd.Series(SMA,name='SMA').dropna()

def calc_EMA(closing,lookback=50,smoothing=2):
  EMA = [sum(closing[:lookback])/lookback]
  for price in closing[lookback:]:
    EMA.append((price * (smoothing / (1 + lookback))) + EMA[-1] * (1 - (smoothing / (1 + lookback))))
  return pd.Series(EMA,name='EMA').dropna()

def calc_MACD(EMA_12,EMA_26):
  MACD = []
  for first,second in zip(EMA_12,EMA_26):
    MACD.append(first-second)
  return pd.Series(MACD,name='MACD').dropna()

def calc_RSI(closing, lookback=14):
    ret = closing.diff()
    up = []
    down = []
    for diff in ret:
        if diff < 0:
            up.append(0)
            down.append(diff)
        else:
            up.append(diff)
            down.append(0)

    up_series = pd.Series(up)
    down_series = pd.Series(down).abs()
    up_ewm = up_series.ewm(com = lookback - 1, adjust = False).mean()
    down_ewm = down_series.ewm(com = lookback - 1, adjust = False).mean()
    RS = up_ewm/down_ewm
    RSI = 100 - (100 / (1 + RS))
    return RSI.rename({0:'RSI'}).dropna()

def calc_OBV(closing,volume,lookback=50):
  OBV = [0]
  for i in range(lookback,len(closing)):
    OBV.append((OBV[-1]-volume[i-1])*(closing[i-1] < closing[i-2])+(OBV[-1]+volume[i-1])*(closing[i-1] > closing[i-2]))

  return pd.Series(OBV[1:],name='OBV').dropna()

def calc_target(closing,open,lookback=50):
  target = []
  for open,close in zip(open,closing):
    target.append(int(close > open))
  return pd.Series(target,name='target').dropna()

def add_indicators(df,days=50):
  new_df = pd.DataFrame()
  for index in df.Index.unique():
    df_index = df[df['Index']==index].reset_index(drop=True).copy()
    closing = df_index['Open']
    Indicators = pd.DataFrame({
         'Date' : df_index['Date'][50:].reset_index(drop=True),
         'SMA' : calc_SMA(closing,days),
         'EMA' : calc_EMA(closing,lookback = days),
         'MACD' : calc_MACD(calc_EMA(closing[38:],lookback=12),calc_EMA(closing[24:],lookback=26)),
         'RSI' : calc_RSI(closing[36:]),
         # 'OBV' : calc_OBV(closing = closing,volume = df_index['Volume']),
         'Target' : calc_target(closing = df.Close,open = closing,lookback = days)
         })
    df_index = pd.merge(df_index[53:-1].reset_index(drop=True),Indicators[3:-1].reset_index(drop=True),on='Date')
    new_df = new_df.append(df_index)
  return new_df

In [5]:
df = df.sort_values('Date').reset_index(drop=True)
df_test = df_test.sort_values('Date').reset_index(drop=True)

In [6]:
df.shape

(83373, 12)

In [7]:
df = add_indicators(df,days=50)
df_test = add_indicators(df_test,days=50)

In [8]:
df.shape

(82671, 18)

In [9]:
df_selected = df[['Target','MACD','SMA','EMA','RSI','Month','Day','Index']].copy()
df_test_selected = df_test[['Target','MACD','SMA','EMA','RSI','Month','Day','Index']].copy()

In [10]:
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler

scaler_norm = MinMaxScaler()
scaler_norm.clip=True

scaler_stand = StandardScaler()
scaler_stand.clip=True


df_selected[['SMA','EMA']] = scaler_norm.fit_transform(df_selected[['SMA','EMA']])

df_selected[['RSI']] = scaler_stand.fit_transform(df_selected[['RSI']])

df_test_selected[['SMA','EMA']] = scaler_norm.transform(df_test_selected[['SMA','EMA']])

df_test_selected[['RSI']] = scaler_stand.transform(df_test_selected[['RSI']])


OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(df_selected[['Index']]))

OH_cols.index = df_selected.index

num_df_selected = df_selected.drop(['Index'], axis=1)

OH_df_selected = pd.concat([num_df_selected, OH_cols], axis=1)



OH_test_cols = pd.DataFrame(OH_encoder.fit_transform(df_test_selected[['Index']]))

OH_test_cols.index = df_test_selected.index

num_test_df_selected = df_test_selected.drop(['Index'], axis=1)

OH_test_df_selected = pd.concat([num_test_df_selected, OH_test_cols], axis=1)





OH_df_selected.head()

Unnamed: 0,Target,MACD,SMA,EMA,RSI,Month,Day,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,-22.671634,0.017884,0.019559,1.630931,3,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,-22.706738,0.017977,0.019512,0.437169,3,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,-22.020768,0.018399,0.019472,-2.026688,3,29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,-19.156707,0.017604,0.019449,-2.588845,3,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,-20.602509,0.01749,0.019396,-2.744211,3,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
OH_df_selected.to_csv('stock-market-train-clean.csv',index=False)
OH_test_df_selected.to_csv('stock-market-test-clean.csv',index=False)