## What is this?
In nikkei.py, I was trying to predict nikkei price on one day based on last 100 days price.
The script can fit the actual data in graph, but actual price up/down prediction accuracy was around 50%.

In this notebook, I'm trying to train a model using up/down training data rather than price data we used in the nikkei.py

In [1]:
import sys
import os
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils.visualize_util import plot
from matplotlib.pyplot import show, plot
import pandas as pd
import numpy as np
import math
import random
random.seed(0)


Using TensorFlow backend.


### Read nikkei data

In [4]:
df = pd.read_csv("../../dont_remove_data/nikkei.csv", header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,19960401,21600,21870,21600,21650,21840
1,19960402,21620,21720,21580,21650,14860
2,19960403,21750,21830,21410,21510,25180
3,19960404,21540,21670,21490,21670,15460
4,19960405,21640,21830,21610,21820,13940
5,19960408,21640,21660,21430,21480,14720
6,19960409,21660,21900,21640,21880,26280
7,19960410,21840,21950,21790,21840,16460
8,19960411,21730,21820,21670,21740,16110
9,19960412,21740,21880,21630,21780,23210


In [209]:
prices = df[1]
print(type(prices))
prices = prices.astype('float32')
prices


<class 'pandas.core.series.Series'>


0       21600.0
1       21620.0
2       21750.0
3       21540.0
4       21640.0
5       21640.0
6       21660.0
7       21840.0
8       21730.0
9       21740.0
10      21930.0
11      22020.0
12      22000.0
13      21790.0
14      21780.0
15      21990.0
16      22260.0
17      22300.0
18      22320.0
19      22270.0
20      22020.0
21      22090.0
22      21770.0
23      21590.0
24      21500.0
25      21720.0
26      21400.0
27      21510.0
28      21250.0
29      21620.0
         ...   
4823    19440.0
4824    19660.0
4825    19710.0
4826    19900.0
4827    19890.0
4828    19970.0
4829    19850.0
4830    19960.0
4831    19830.0
4832    19780.0
4833    20010.0
4834    19960.0
4835    19920.0
4836    19500.0
4837    19760.0
4838    19510.0
4839    19350.0
4840    19020.0
4841    19190.0
4842    18920.0
4843    18600.0
4844    19070.0
4845    19350.0
4846    19000.0
4847    18850.0
4848    18870.0
4849    18800.0
4850    18730.0
4851    18910.0
4852    18990.0
Name: 1, dtype: float32

### Create up/down data
As we did in nikke.py, we used last 100 days prices for input then target value is either up (1, 0) or down(0, 1)

In [210]:
# we use last 200 days data
nb_prev = 200

X = []
Y = []
for i in range(len(prices) - nb_prev):
    X.append(np.array(prices.iloc[i:i + nb_prev]))
    up = (prices.iloc[i + nb_prev] - prices.iloc[i + nb_prev - 1]) > 0
    Y.append(np.array([1, 0] if up else [0, 1]))
print("len(X)", len(X))
print("len(Y)", len(Y))
print("len(X[0])", len(X[0]))
print("len Y[0]", Y[0])
print("np.count_nonzero(np.argmax(Y, axis=1)) =", np.count_nonzero(np.argmax(Y, axis=1)))

X = np.array(X).reshape((-1, nb_prev, 1))
Y = np.array(Y)

print("X.shape", X.shape)
print("Y.shape", Y.shape)


len(X) 4653
len(Y) 4653
len(X[0]) 200
len Y[0] [1 0]
np.count_nonzero(np.argmax(Y, axis=1)) = 2348
X.shape (4653, 200, 1)
Y.shape (4653, 2)


In [211]:
# Split train and test data
test_size = 0.1
nb_train = int(round(len(X) * (1 - test_size)))
print(nb_train)

X_train = X[:nb_train]
Y_train = Y[:nb_train]
X_test = X[nb_train:]
Y_test = Y[nb_train:]
print(X_train.shape)
print(Y_train.shape)
print(len(X_test))
print(len(Y_test))


4188
(4188, 200, 1)
(4188, 2)
465
465


### Let's define our model

Input shape is (nb_samples, 100, 1) where
- nb_samples is # of samples
- 100 is timespan
- 1 is dimension of input. In this case we are using (100, 1), but could be (100, N)

We pass 2 to Dense, because our output should be (0, 1) or (1, 0)

In [212]:
in_out_neurons = 1
hidden_neurons = 300

model = Sequential()  
model.add(LSTM(hidden_neurons, batch_input_shape=(None, nb_prev, in_out_neurons), return_sequences=False))  
model.add(Dense(2, input_dim=in_out_neurons))  
model.add(Activation("linear"))  
model.compile(loss="mean_squared_error", optimizer="adam")
print(model)


<keras.models.Sequential object at 0x1224b1b38>


### Train

In [213]:
# where we store optimized params
version = 4
weights_dir = "/Users/higepon/Desktop/nikkei-up-down-{0}".format(version)
print(weights_dir)

/Users/higepon/Desktop/nikkei-up-down-4


In [None]:
# train!
os.makedirs(weights_dir, exist_ok=True)
filepath = weights_dir + "/{loss:.4f}"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

print(X_train.shape)
model.fit(X_train, Y_train, batch_size=600, nb_epoch=60, validation_split=0.05, callbacks=callbacks_list)


(4188, 200, 1)
Train on 3978 samples, validate on 210 samples
Epoch 1/60

### Predict

In [217]:
def best_model_path():
    files = os.listdir(weights_dir)
    files.sort()
    return "{0}/{1}".format(weights_dir, files[0])

model.load_weights(best_model_path())    


In [218]:
predicted = model.predict(X_test)
print(predicted[0])

# get max index
predicted_index = np.argmax(predicted, axis=1)
print("predicted_index", predicted_index)
test_index = np.argmax(Y_test, axis=1)
print("test_index", test_index)

[ 0.67877376  0.52697355]
predicted_index [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
test_index [0 

In [176]:
# evaluate
print(((predicted_index - test_index) ** 2).mean())

(predicted_index, test_index)


0.463157894737


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
upOrDownLoss(predicted, )