In [1]:
import numpy as np
import pandas as pd
import os
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from keras.models import Sequential
from keras.callbacks import CSVLogger
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras import optimizers

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(15,8)})
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [2]:
hm = pd.read_csv('data/HM-B.ST_1m.csv')
#hm = hm.set_index('Datetime')
hm.head()

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume
0,2020-03-26 09:00:00+01:00,130.399994,130.860001,129.979996,130.860001,130.860001,0
1,2020-03-26 09:01:00+01:00,130.820007,131.779999,129.860001,129.860001,129.860001,37317
2,2020-03-26 09:02:00+01:00,129.820007,130.179993,128.580002,129.580002,129.580002,18178
3,2020-03-26 09:03:00+01:00,129.619995,130.419998,129.559998,130.039993,130.039993,13635
4,2020-03-26 09:04:00+01:00,130.419998,130.419998,129.179993,129.179993,129.179993,37434


In [None]:
def plot_daily_stock_movement(df, stock_name, var, plt):
    """Draw one column of a single day's minute data on the given axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for one day. Must contain the columns 'Date' and 'Time' plus
        the column named by ``var``.
    stock_name : str
        Name used in the plot title.
    var : str
        Column to draw, e.g. 'Close' or 'Volume'.
    plt : Axes-like object
        Target axes. Despite its name this is not the ``matplotlib.pyplot``
        module: the body calls ``set_title`` / ``set_ylabel`` / ``set_xlabel``
        on it, which are Axes methods.

    Returns
    -------
    None
        If ``df`` is empty or spans more than one date, a message is printed
        and nothing is drawn.
    """
    ax = plt  # local alias so the body does not read as a pyplot call

    if len(df) == 0:
        print('No data to plot. Exiting...')
        return

    curdate = df['Date'].unique()
    if len(curdate) > 1:
        print('Data consists of more than one day. Exiting...')
        return

    day_df = df.set_index(df['Time'])
    # Positional access: after set_index the labels are times of day, so the
    # integer label 0 does not exist in the index.
    md = str(day_df['Date'].iloc[0])

    ax.plot(day_df[var])
    ax.set_title(stock_name + ' stock for ' + md)
    # Volume is a count, not a price.
    ax.set_ylabel('Volume' if var == 'Volume' else 'Price (SEK)')
    ax.set_xlabel('Minutes')
    # Only one series is drawn, so the legend names just that series.
    ax.legend([var], loc='upper left')

In [None]:
# Derive a calendar date and a wall-clock time per row so the minute data can
# be split by trading day.
hm2 = hm.copy()
# The raw strings carry a UTC offset (e.g. +01:00). Parsing with utc=True gives
# a proper datetime column even when the offsets differ between rows (a DST
# switch inside the sample); without it pandas returns an object column and
# the .dt accessor below fails. The values are then converted back to local time.
# NOTE(review): 'Europe/Stockholm' is assumed from the .ST ticker and the
# +01:00 offset in the data - confirm.
hm2["Datetime"] = pd.to_datetime(hm2["Datetime"], utc=True).dt.tz_convert('Europe/Stockholm')
hm2['Date'] = hm2['Datetime'].dt.date
hm2['Time'] = hm2['Datetime'].dt.time
day_list = hm2['Date'].unique()

# One row of subplots per day: price on the left, volume on the right.
nrows = len(day_list)

fig, axs = plt.subplots(nrows, 2, figsize=(20, 60), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# Flatten so the axes for day i sit at positions 2*i and 2*i+1.
axs = axs.ravel()
for i, day in enumerate(day_list):
    curdf = hm2[hm2.Date == day]
    plot_daily_stock_movement(curdf, 'H&M', 'Close', axs[2*i])
    plot_daily_stock_movement(curdf, 'H&M', 'Volume', axs[2*i+1])


### Daily stock movement

In [None]:
# Overlay the four price series of the whole sample on a single figure.
plt.figure()
for price_col in ("Open", "High", "Low", "Close"):
    plt.plot(hm[price_col])
plt.title('H&M stock price history')
plt.ylabel('Price (SEK)')
plt.xlabel('Minutes')
plt.legend(['Open','High','Low','Close'], loc='upper left')
plt.show()

In [None]:
plt.figure()
plt.plot(hm["Volume"])
plt.title('H&M stock volume history')
plt.ylabel('Volume')
plt.xlabel('Datetime')
plt.show()

### Normalizing the data

In [None]:
def normalize_data(minutedata):
    """Split the rows 90/10 in their existing order and min-max scale the features.

    The scaler is fitted on the training part only and then applied to the
    test part, so the test rows do not influence the scaling.

    Parameters
    ----------
    minutedata : pandas.DataFrame
        Must contain the columns Open, High, Low, Close and Volume.

    Returns
    -------
    (x_train, x_test) : tuple of numpy.ndarray
        Scaled feature matrices with columns in the order
        Open, High, Low, Close, Volume.
    """
    train_cols = ["Open","High","Low","Close","Volume"]
    # shuffle=False keeps the row order, so the test part is the tail of the data.
    df_train, df_test = train_test_split(minutedata, train_size=0.9, test_size=0.1, shuffle=False)
    print("Train and Test size are: ", len(df_train), " and ", len(df_test))

    min_max_scaler = MinMaxScaler()
    # Give plain ndarrays to both fit and transform. Fitting on an array and
    # transforming a DataFrame triggers scikit-learn's feature-name mismatch warning.
    x_train = min_max_scaler.fit_transform(df_train.loc[:, train_cols].values)
    x_test = min_max_scaler.transform(df_test.loc[:, train_cols].values)
    return x_train, x_test

In [None]:
xtrain, xtest = normalize_data(hm)
xtrain.view()

In [None]:
train_cols = ["Open","High","Low","Close","Volume"]
# shuffle=False keeps the row order: the last 10% of the rows form the test set.
df_train, df_test = train_test_split(hm, train_size=0.9, test_size=0.1, shuffle=False)
print("Train and Test size are: ", len(df_train), " and ", len(df_test))
# Min-max scale the features. The scaler is fitted on the training rows only and
# kept at module level because later cells use it to map predictions back to prices.
x = df_train.loc[:,train_cols].values
min_max_scaler = MinMaxScaler()
x_train = min_max_scaler.fit_transform(x)
# Pass an ndarray here as well: the scaler was fitted on an array, and handing it
# a DataFrame triggers scikit-learn's feature-name mismatch warning.
x_test = min_max_scaler.transform(df_test.loc[:,train_cols].values)

# Each sample is a window of the previous TIME_STEPS rows (30 minutes of 1-minute bars).
TIME_STEPS = 30
BATCH_SIZE = 100
OUTPUT_PATH='log/'

In [None]:
def build_timeseries(mat, y_col_index, time_steps=None):
    """Turn a 2-D feature matrix into overlapping windows and next-step targets.

    Sample ``i`` consists of rows ``i .. i+time_steps-1`` of ``mat``; its
    target is the value of column ``y_col_index`` in row ``i+time_steps``.

    Parameters
    ----------
    mat : array-like, shape (n_rows, n_features)
        Feature matrix, one row per time step.
    y_col_index : int
        Index of the column that acts as the output.
    time_steps : int, optional
        Window length. Defaults to the module-level ``TIME_STEPS``.

    Returns
    -------
    x : numpy.ndarray, shape (n_rows - time_steps, time_steps, n_features)
    y : numpy.ndarray, shape (n_rows - time_steps,)

    Raises
    ------
    ValueError
        If ``mat`` has fewer rows than ``time_steps``.
    """
    if time_steps is None:
        time_steps = TIME_STEPS
    mat = np.asarray(mat)

    n_samples = mat.shape[0] - time_steps
    if n_samples < 0:
        raise ValueError(
            "need at least %d rows to build a window, got %d" % (time_steps, mat.shape[0])
        )

    # Row i of window_idx lists the source rows i, i+1, ..., i+time_steps-1, so a
    # single fancy-index gathers every window at once. This replaces the per-row
    # Python loop and its deprecated tqdm_notebook progress bar.
    window_idx = np.arange(n_samples)[:, None] + np.arange(time_steps)[None, :]
    x = mat[window_idx].astype(float)
    y = mat[time_steps:, y_col_index].astype(float)

    print("length of time-series i/o",x.shape,y.shape)
    return x, y

In [None]:
def trim_dataset(mat, batch_size):
    """
    Drop trailing rows so that the number of rows is a whole multiple of
    batch_size. The input is returned unchanged when nothing has to be dropped.
    """
    total_rows = mat.shape[0]
    leftover = total_rows % batch_size
    return mat[:total_rows - leftover] if leftover > 0 else mat

In [None]:
def prepare_data_for_lstm(idata, label_col_id, BATCH_SIZE):
    """Window a scaled feature matrix and trim it to whole batches.

    Parameters
    ----------
    idata : numpy.ndarray, shape (n_rows, n_features)
        Scaled feature matrix to turn into samples.
    label_col_id : int
        Index of the column used as the prediction target.
    BATCH_SIZE : int
        Batch size; both outputs are cut to a multiple of it.

    Returns
    -------
    (x_ts, y_ts) : the windowed inputs and targets returned by
        ``build_timeseries``, trimmed with ``trim_dataset``.
    """
    # Window the matrix that was passed in. The global x_train used to be read
    # here, so the idata argument was silently ignored.
    x_ts, y_ts = build_timeseries(idata, label_col_id)
    x_ts = trim_dataset(x_ts, BATCH_SIZE)
    y_ts = trim_dataset(y_ts, BATCH_SIZE)
    return x_ts, y_ts

In [None]:
x_t, y_t = prepare_data_for_lstm(x_train, 3, BATCH_SIZE)

In [None]:
# Training samples: column 3 is 'Close' in the train_cols order used for scaling.
x_t, y_t = build_timeseries(x_train, 3)
# Cut to a whole number of batches.
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)
# Held-out samples, split into two equal halves: validation first, test second.
# np.split with an integer needs an even division; the trim to a multiple of
# BATCH_SIZE (100, an even number) guarantees that. The halves themselves need
# not be multiples of BATCH_SIZE, which is why later cells trim them again.
x_temp, y_temp = build_timeseries(x_test, 3)
x_val, x_test_t = np.split(trim_dataset(x_temp, BATCH_SIZE),2)
y_val, y_test_t = np.split(trim_dataset(y_temp, BATCH_SIZE),2)

### Create model

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.callbacks import CSVLogger
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras import optimizers
# Single-output model: one stateful LSTM layer followed by a small dense head.
lstm_model = Sequential()
# batch_input_shape fixes the batch size, so every array passed to fit/predict
# must hold a multiple of BATCH_SIZE samples (hence the trim_dataset calls).
lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]), dropout=0.0, recurrent_dropout=0.0, stateful=True,     kernel_initializer='random_uniform'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(20,activation='relu'))
# Sigmoid output: the target was min-max scaled with the training-set range.
lstm_model.add(Dense(1,activation='sigmoid'))
optimizer = optimizers.RMSprop(lr=0.001)
lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)

In [None]:
# The log directory is not created anywhere else in this notebook; make sure it
# exists before the logger tries to open a file inside it.
os.makedirs(OUTPUT_PATH, exist_ok=True)
csv_logger = CSVLogger(os.path.join(OUTPUT_PATH, 'lstm_hm' + '.log'), append=True)

# shuffle=False keeps the samples in time order. The validation arrays are
# trimmed again because the model only accepts whole batches of BATCH_SIZE.
history = lstm_model.fit(x_t, y_t, epochs=100, verbose=2, batch_size=BATCH_SIZE,
                    shuffle=False, validation_data=(trim_dataset(x_val, BATCH_SIZE),
                    trim_dataset(y_val, BATCH_SIZE)), callbacks=[csv_logger])

In [None]:
# Predict on the test half, trimmed to whole batches as the model requires.
y_pred = lstm_model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
# The model outputs one value per sample; flatten to 1-D.
y_pred = y_pred.flatten()
# Trim the targets the same way so both arrays have the same length.
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
# Both arrays are still min-max scaled, so the error is in scaled units, not SEK.
error = mean_squared_error(y_test_t, y_pred)
print("Error is", error, y_pred.shape, y_test_t.shape)
print(y_pred[0:15])
print(y_test_t[0:15])

In [None]:
# Undo the min-max scaling by hand for column 3 ('Close' in train_cols order):
# original = scaled * data_range_ + data_min_. The scaler's inverse_transform
# cannot be used directly because it expects all five feature columns.
y_pred_org = (y_pred * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3] 
y_test_t_org = (y_test_t * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3] # min_max_scaler.inverse_transform(y_test_t)
print(y_pred_org[0:15])
print(y_test_t_org[0:15])

### Visualize the results

In [None]:
plt.figure()
plt.plot(y_pred_org)
plt.plot(y_test_t_org)
plt.title('Prediction vs Real Stock Price')
plt.ylabel('Price (in SEK)')
plt.xlabel('Minutes')
plt.legend(['Prediction', 'Real'], loc='upper left')
plt.show()
#plt.savefig(os.path.join(OUTPUT_PATH, 'pred_vs_real_BS'+str(BATCH_SIZE)+"_"+time.ctime()+'.png'))
#print_time("program completed ", stime)

### LSTM for outputting next five values

In [None]:
def build_timeseries(mat, y_col_index, horizon=5, time_steps=None):
    """Turn a 2-D feature matrix into windows and multi-step targets.

    Sample ``i`` consists of rows ``i .. i+time_steps-1`` of ``mat``; its
    target holds column ``y_col_index`` of the following ``horizon`` rows,
    ``i+time_steps .. i+time_steps+horizon-1``.

    Only windows with a complete target are returned. Previously the output was
    sized ``n_rows - time_steps`` but the last five samples were never filled,
    so all-zero inputs and targets ended up in the training data.

    Parameters
    ----------
    mat : array-like, shape (n_rows, n_features)
        Feature matrix, one row per time step.
    y_col_index : int
        Index of the column that acts as the output.
    horizon : int, optional
        Number of future values per target (default 5).
    time_steps : int, optional
        Window length. Defaults to the module-level ``TIME_STEPS``.

    Returns
    -------
    x : numpy.ndarray, shape (n_samples, time_steps, n_features)
    y : numpy.ndarray, shape (n_samples, horizon)
        with ``n_samples = n_rows - time_steps - horizon + 1``.

    Raises
    ------
    ValueError
        If ``mat`` is too short to yield a single complete sample.
    """
    if time_steps is None:
        time_steps = TIME_STEPS
    mat = np.asarray(mat)

    n_samples = mat.shape[0] - time_steps - horizon + 1
    if n_samples <= 0:
        raise ValueError(
            "need at least %d rows (time_steps + horizon), got %d"
            % (time_steps + horizon, mat.shape[0])
        )

    starts = np.arange(n_samples)[:, None]
    # Row i of window_idx: source rows of the input window for sample i.
    window_idx = starts + np.arange(time_steps)[None, :]
    # Row i of target_idx: the `horizon` rows that follow that window.
    target_idx = starts + time_steps + np.arange(horizon)[None, :]

    x = mat[window_idx].astype(float)
    y = mat[target_idx, y_col_index].astype(float)

    print("length of time-series i/o",x.shape,y.shape)
    return x, y

In [None]:
def prepare_data_for_lstm(idata, label_col_id, BATCH_SIZE):
    """Window a scaled feature matrix and trim it to whole batches.

    Parameters
    ----------
    idata : numpy.ndarray, shape (n_rows, n_features)
        Scaled feature matrix to turn into samples.
    label_col_id : int
        Index of the column used as the prediction target.
    BATCH_SIZE : int
        Batch size; both outputs are cut to a multiple of it.

    Returns
    -------
    (x_ts, y_ts) : the windowed inputs and targets returned by
        ``build_timeseries``, trimmed with ``trim_dataset``.
    """
    # Window the matrix that was passed in. The global x_train used to be read
    # here, so the idata argument was silently ignored.
    x_ts, y_ts = build_timeseries(idata, label_col_id)
    x_ts = trim_dataset(x_ts, BATCH_SIZE)
    y_ts = trim_dataset(y_ts, BATCH_SIZE)
    return x_ts, y_ts

In [None]:
xx, yy = prepare_data_for_lstm(x_train, 3, BATCH_SIZE)


In [None]:
x_t, y_t = build_timeseries(x_train, 3)
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)
x_temp, y_temp = build_timeseries(x_test, 3)
x_val, x_test_t = np.split(trim_dataset(x_temp, BATCH_SIZE),2)
y_val, y_test_t = np.split(trim_dataset(y_temp, BATCH_SIZE),2)

In [None]:
def build_lstm_model(train_data, BATCH_SIZE, TIME_STEPS):
    """Build and compile the five-output stateful LSTM regressor.

    The number of input features is read from the last axis of ``train_data``.
    The batch size is baked into the input shape, so the returned model only
    accepts arrays holding a multiple of ``BATCH_SIZE`` samples.
    """
    n_features = train_data.shape[2]
    recurrent = LSTM(
        100,
        batch_input_shape=(BATCH_SIZE, TIME_STEPS, n_features),
        dropout=0.0,
        recurrent_dropout=0.0,
        stateful=True,
        kernel_initializer='random_uniform',
    )
    stack = (
        recurrent,
        Dropout(0.5),
        Dense(20, activation='relu'),
        Dense(5, activation='sigmoid'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=0.001))
    return model

In [None]:
lstm_model = build_lstm_model(x_t, BATCH_SIZE, TIME_STEPS)
print(lstm_model)

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.callbacks import CSVLogger
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras import optimizers
lstm_model = Sequential()
lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]), dropout=0.0, recurrent_dropout=0.0, stateful=True, kernel_initializer='random_uniform'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(20,activation='relu'))
lstm_model.add(Dense(5,activation='sigmoid'))
optimizer = optimizers.RMSprop(lr=0.001)
lstm_model.compile(loss='mean_squared_error', optimizer=optimizer)
lstm_model

In [None]:
# The log directory is not created anywhere else in this notebook; make sure it
# exists before the logger tries to open a file inside it.
os.makedirs(OUTPUT_PATH, exist_ok=True)
csv_logger = CSVLogger(os.path.join(OUTPUT_PATH, 'lstm_hm' + '.log'), append=True)

# shuffle=False keeps the samples in time order. The validation arrays are
# trimmed again because the model only accepts whole batches of BATCH_SIZE.
history = lstm_model.fit(x_t, y_t, epochs=100, verbose=2, batch_size=BATCH_SIZE,
                    shuffle=False, validation_data=(trim_dataset(x_val, BATCH_SIZE),
                    trim_dataset(y_val, BATCH_SIZE)), callbacks=[csv_logger])

In [None]:
def compute_sq_error(lstm_model, x_test_data, y_test_data, BATCH_SIZE):
    """Mean squared error between per-sample means of prediction and target.

    Inputs and targets are first trimmed to a whole number of batches. Each
    sample's predicted values and true values are then averaged along axis 1,
    and the MSE is taken between those two averages.
    """
    x_eval = trim_dataset(x_test_data, BATCH_SIZE)
    y_eval = trim_dataset(y_test_data, BATCH_SIZE)
    predicted = lstm_model.predict(x_eval, batch_size=BATCH_SIZE)
    return mean_squared_error(y_eval.mean(axis=1), predicted.mean(axis=1))

In [None]:
sq_error = compute_sq_error(lstm_model, x_test_t, y_test_t, BATCH_SIZE)
print("Error is", sq_error)

In [None]:
# Predict the next five scaled values for every test sample (whole batches only).
y_pred = lstm_model.predict(trim_dataset(x_test_t, BATCH_SIZE), batch_size=BATCH_SIZE)
# Collapse the five-step forecast to its mean so it can be compared as one series.
y_pred_mean = y_pred.mean(axis=1)
y_test_t = trim_dataset(y_test_t, BATCH_SIZE)
y_test_t_mean = y_test_t.mean(axis=1)
# Both series are still min-max scaled, so the error is in scaled units.
error = mean_squared_error(y_test_t_mean, y_pred_mean)
print("Error is", error, y_pred_mean.shape, y_test_t_mean.shape)

In [None]:
y_pred_org = (y_pred_mean * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3] 
y_test_t_org = (y_test_t_mean * min_max_scaler.data_range_[3]) + min_max_scaler.data_min_[3] # min_max_scaler.inverse_transform(y_test_t)
print(y_pred_org[0:15])
print(y_test_t_org[0:15])

In [None]:
def plot_real_vs_pred(pred, real):
    """Plot predicted against real prices and save the figure as a PNG.

    The file goes to ``figures/TS_<TIME_STEPS>_BS_<BATCH_SIZE>.png``, built
    from the module-level ``TIME_STEPS`` and ``BATCH_SIZE``. The figure is
    closed after saving, so nothing is displayed inline.

    Parameters
    ----------
    pred : sequence of float
        Predicted prices.
    real : sequence of float
        Observed prices, aligned with ``pred``.
    """
    outfile = 'figures/TS_'+str(TIME_STEPS)+'_BS_'+str(BATCH_SIZE)+'.png'
    # Keep a handle on the figure created here. The global `fig` from another
    # cell used to be saved and closed instead of this plot.
    fig = plt.figure()
    plt.plot(pred)
    plt.plot(real)
    plt.title('Prediction vs Real Stock Price')
    plt.ylabel('Price (in SEK)')
    plt.xlabel('Minutes')
    plt.legend(['Prediction', 'Real'], loc='upper left')
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(outfile), exist_ok=True)
    fig.savefig(outfile)   # save the figure to file
    plt.close(fig)

In [None]:
plot_real_vs_pred(y_pred_org, y_test_t_org)

In [None]:
#fig, ax = plt.subplots( nrows=1, ncols=1 )
plt.figure()
plt.plot(y_pred_org)
plt.plot(y_test_t_org)
plt.title('Prediction vs Real Stock Price')
plt.ylabel('Price (in SEK)')
plt.xlabel('Minutes')
plt.legend(['Prediction', 'Real'], loc='upper left')
plt.savefig('/tmp/ab.png')
plt.show()

In [None]:
print('figures/TS_'+str(TIME_STEPS)+'_BS_'+str(BATCH_SIZE)+'_Error_'+str(round(error,5))+'.png')

In [None]:
hm2.head()

### Extra code

In [None]:
hm2 = hm.copy()
# Parse via UTC so rows with different UTC offsets (a DST switch inside the
# sample) still give a proper datetime column, then convert back to local time
# before taking the wall-clock date and time.
# NOTE(review): 'Europe/Stockholm' is assumed from the .ST ticker and the
# +01:00 offset in the data - confirm.
hm2["Datetime"] = pd.to_datetime(hm2["Datetime"], utc=True).dt.tz_convert('Europe/Stockholm')
hm2['Date'] = hm2['Datetime'].dt.date
hm2['Time'] = hm2['Datetime'].dt.time
# Number of distinct trading days in the sample.
print(len(hm2['Date'].unique()))
#hm2.sort_values(by=['Datetime'])

In [None]:
# Plot one selected day. This uses hm2, which already carries the 'Date' and
# 'Time' columns that plot_daily_stock_movement needs. The previous version
# indexed the raw frame with a date string (a column lookup, not a row filter),
# read a 'Date' column that `hm` does not have, and called the plot helper
# with only two of its four required arguments.
target_day = pd.Timestamp('2020-03-02').date()
shm = hm2[hm2['Date'] == target_day]
if shm.empty:
    print('No rows for ' + target_day.strftime('%m/%d/%Y'))
else:
    print(shm['Date'].iloc[0].strftime('%m/%d/%Y'))
    fig, ax = plt.subplots()
    # NOTE(review): 'Close' is an assumption; the original call named no column.
    plot_daily_stock_movement(shm, 'H&M', 'Close', ax)
    plt.show()

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(pd.Series(np.random.normal(10,3,size=10)), color='g')
ax2 = ax1.twinx()
ax2.plot(pd.Series(np.random.normal(0,1,size=10)), color='r')
#ax2.grid(False)
plt.show()

In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.normal(50, 30,[350,1]), 
        index=pd.date_range('1/1/2000', periods=350), columns=list('A'))
df2 = pd.DataFrame(np.random.normal(550, 50,[350,1]), 
        index=pd.date_range('1/1/2000', periods=350), columns=list('D'))
#df3 = pd.concat([df, df2], sort = True)
df3 = pd.concat([df.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
print(df3.head())
#df = df.cumsum()
df3.plot()
plt.show()

In [None]:
np.random.normal(50, 2, [3,4])

In [None]:
def read_sel_data(stock_name, interval = '1m', datadir = 'data/'):
    """Load one ticker's CSV and keep only its adjusted close (and volume).

    The file ``<datadir>/<stock_name>_<interval>.csv`` is read. If its columns
    after the first are exactly Open, High, Low, Close, Adj Close, Volume, the
    first column is renamed to 'Datetime' and the others are prefixed with
    ``<stock_name>_``. Otherwise a message is printed and the original column
    names are kept.

    Parameters
    ----------
    stock_name : str
        Ticker symbol. A name containing '^' is treated as an index, for which
        the volume column is dropped.
    interval : str, optional
        Sampling interval used in the file name (default '1m').
    datadir : str, optional
        Directory holding the CSV files (default 'data/').

    Returns
    -------
    pandas.DataFrame
        Columns 0 and 5 for indices, columns 0, 5 and 6 for everything else
        (selected by position).
    """
    colnames = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    # os.path.join also works when datadir has no trailing separator.
    filename = os.path.join(datadir, stock_name + '_' + interval + '.csv')
    data = pd.read_csv(filename)
    cols = list(data.columns)[1:]
    if colnames == cols:
        data.columns = ['Datetime'] + [stock_name + '_' + x for x in cols]
    else:
        print('Columns are not in order')
    # select only the Adj Close and Volume
    # A plain substring test replaces re.search('\^', ...): no regex is needed,
    # and `re` is not imported at the point where this function is first used.
    if '^' not in stock_name:
        data = data.iloc[:, [0,5,6]]
    else:
        data = data.iloc[:, [0,5]]

    return data

In [None]:
hm = read_sel_data('HM-B.ST')
hm.head()

In [None]:
stock_list = ['ABB.ST', 'ASSA-B.ST', 'ATCO-B.ST', 'ELUX-B.ST', 'ERIC-B.ST', 'HM-B.ST', 'SAND.ST', '^OMXC20', '^OMXC25', '^OMXH25', '^OMXHPI', '^OMX']
sel_list = ['HM-B.ST','^OMXC20', '^OMXC25', '^OMXH25', '^OMXHPI', '^OMX']
interval = '1m'
hh = dict()
hh['a'] = [1,2,3]
print(hh['a'])

In [None]:
import re
rr=re.search('\^', sel_list[0])
rr is None

In [None]:
data_dic = dict()
for i in sel_list:
    print('reading data for ' + i)
    data_dic[i] = read_sel_data(i)
    
print('done...')

In [None]:
data_dic['HM-B.ST'].head()

In [None]:
merged_df = data_dic[sel_list[0]]

for i in range(1, len(sel_list)):
    merged_df = merged_df.merge(data_dic[sel_list[i]], on = 'Datetime')
merged_df.tail()

In [None]:
colnames = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
ll = list(['HM_'+x for x in colnames])
cnames = ['Datatime'] + ll
hm2 = hm.copy()
hm2.columns = cnames
hm2.head()

In [None]:
def read_sel_data(stock_name, interval = '1m', datadir = 'data/'):
    """Load one ticker's CSV and keep only its adjusted close (and volume).

    The file ``<datadir>/<stock_name>_<interval>.csv`` is read. If its columns
    after the first are exactly Open, High, Low, Close, Adj Close, Volume, the
    first column is renamed to 'Datetime' and the others are prefixed with
    ``<stock_name>_``. Otherwise a message is printed and the original column
    names are kept.

    Parameters
    ----------
    stock_name : str
        Ticker symbol. A name containing '^' is treated as an index, for which
        the volume column is dropped.
    interval : str, optional
        Sampling interval used in the file name (default '1m').
    datadir : str, optional
        Directory holding the CSV files (default 'data/').

    Returns
    -------
    pandas.DataFrame
        Columns 0 and 5 for indices, columns 0, 5 and 6 for everything else
        (selected by position).
    """
    colnames = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    # os.path.join also works when datadir has no trailing separator.
    filename = os.path.join(datadir, stock_name + '_' + interval + '.csv')
    data = pd.read_csv(filename)
    cols = list(data.columns)[1:]
    if colnames == cols:
        data.columns = ['Datetime'] + [stock_name + '_' + x for x in cols]
    else:
        print('Columns are not in order')
    # select only the Adj Close and Volume
    # A plain substring test replaces re.search('\^', ...): no regex is needed,
    # and the pattern was an invalid escape sequence in a non-raw string.
    if '^' not in stock_name:
        data = data.iloc[:, [0,5,6]]
    else:
        data = data.iloc[:, [0,5]]

    return data

def get_full_data(stock_list):
    """Read every ticker in stock_list and join the frames on 'Datetime'.

    Each ticker is loaded with read_sel_data using its default interval and
    data directory. The frames are then merged one after another, in list
    order, with DataFrame.merge on the 'Datetime' column, so only timestamps
    present in every frame remain.
    """
    frames = {}
    for ticker in stock_list:
        print('reading data for ' + ticker)
        frames[ticker] = read_sel_data(ticker)

    # Start from the first ticker and fold the remaining ones in.
    merged_df = frames[stock_list[0]]
    for ticker in stock_list[1:]:
        merged_df = merged_df.merge(frames[ticker], on = 'Datetime')

    return merged_df

In [None]:
stock_name = 'ERIC-B.ST'
# let's add exchange indices 
ind_list = list(['^OMXC20', '^OMXC25', '^OMXH25', '^OMXHPI', '^OMX'])
comp_list = list([stock_name]) + ind_list
dd = get_full_data(comp_list)
dd.head()

In [None]:
dd.columns[2:]

In [None]:
dd.shape

In [None]:
dd.head()