# Part 14

## # Dataset

In [1]:
import pandas as pd
import numpy as np

In [135]:
df_train = pd.read_csv('../datasets/petr4_treinamento.csv')
df_train.shape, df_train.columns

((1245, 7),
 Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object'))

In [136]:
df_train.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,19.99,20.209999,19.690001,19.690001,18.086271,30182600.0
1,2013-01-03,19.809999,20.4,19.700001,20.4,18.738441,30552600.0
2,2013-01-04,20.33,20.620001,20.17,20.43,18.766001,36141000.0
3,2013-01-07,20.48,20.67,19.950001,20.08,18.444506,28069600.0
4,2013-01-08,20.110001,20.23,19.459999,19.5,17.911745,29091300.0


In [137]:
def _isnull(df):
    for c in df.columns:
        cnt = df[pd.isnull(df[c])].shape[0]
        if cnt != 0:
            print(f'# {c}: {cnt} rows')

In [138]:
_isnull(df_train)

# Open: 3 rows
# High: 3 rows
# Low: 3 rows
# Close: 3 rows
# Adj Close: 3 rows
# Volume: 3 rows


In [139]:
df_train.dropna(axis=0, inplace=True)
df_train.reset_index(inplace=True, drop=True)
df_train.shape

(1242, 7)

In [140]:
df_train.shape

(1242, 7)

DAYS BEFORE = 90

In [141]:
def daysbefore(df, days_before, col):
    feature = []
    target = []
    for i in range(days_before, df.shape[0]):
        slice_start = i - days_before
        slice_stop = i
        feature.append(df[col].iloc[slice_start:slice_stop].values)
        target.append(df[col].iloc[i])
    return np.array(feature), np.array(target)

In [142]:
x, y = daysbefore(df_train, 90, 'Open')

In [143]:
df_open_train = pd.DataFrame(data=x)
df_open_train['target'] = y
df_open_train.shape

(1152, 91)

In [144]:
df_open_train.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,target
1147,13.64,13.7,13.78,13.78,13.87,13.93,13.76,13.79,13.53,13.85,...,15.36,15.65,15.1,15.05,15.16,15.18,15.21,15.31,15.75,15.75
1148,13.7,13.78,13.78,13.87,13.93,13.76,13.79,13.53,13.85,13.96,...,15.65,15.1,15.05,15.16,15.18,15.21,15.31,15.75,15.75,15.75
1149,13.78,13.78,13.87,13.93,13.76,13.79,13.53,13.85,13.96,14.57,...,15.1,15.05,15.16,15.18,15.21,15.31,15.75,15.75,15.75,15.99
1150,13.78,13.87,13.93,13.76,13.79,13.53,13.85,13.96,14.57,14.65,...,15.05,15.16,15.18,15.21,15.31,15.75,15.75,15.75,15.99,16.1
1151,13.87,13.93,13.76,13.79,13.53,13.85,13.96,14.57,14.65,15.02,...,15.16,15.18,15.21,15.31,15.75,15.75,15.75,15.99,16.1,16.1


In [145]:
x = np.reshape(x, (x.shape[0], x.shape[1], 1))
x.shape, y.shape

((1152, 90, 1), (1152,))

In [146]:
from sklearn.model_selection import train_test_split

In [147]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=42)

f'{xtrain.shape}, {ytrain.shape}, {xtest.shape}, {ytest.shape}'

'(864, 90, 1), (864,), (288, 90, 1), (288,)'

## # Model

In [28]:
from keras.models import Sequential
from keras.backend import clear_session
from keras.layers import Dropout, Dense, LSTM, RNN

In [94]:
def model():
    clear_session()

    model = Sequential()

    model.add(LSTM(units=100, return_sequences=True,
                   input_shape=(xtrain.shape[1], 1)))
    model.add(Dropout(rate=0.5))
    
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(rate=0.5))
    
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(rate=0.5))
    
    model.add(LSTM(units=50))
    model.add(Dropout(rate=0.5))
    
    model.add(Dense(units=1, activation='linear'))
    
    model.compile(optimizer='rmsprop', loss='mse', metrics='mae')
    
    print(model.summary())
    
    return model

In [95]:
clf = model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 90, 100)           40800     
_________________________________________________________________
dropout (Dropout)            (None, 90, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 90, 50)            30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 90, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 90, 50)            20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 90, 50)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                2

In [96]:
clf.fit(x=xtrain, y=ytrain, validation_data=(xtest, ytest), epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200


Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200


Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200


Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x7f3822ea8df0>

## # Evaluation

In [220]:
pred = clf.predict(xtest)

In [221]:
ytest.shape, pred.shape

((288,), (288, 1))

In [209]:
ytest.mean() - pred.mean()

0.23415341501193687

In [210]:
ytest.std() - pred.std()

-0.02007471726045118

### ## Test data

In [196]:
df_test = pd.read_csv('../datasets/petr4_teste.csv')
df_test.shape, df_test.columns

((22, 7),
 Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object'))

In [197]:
df_test.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
17,2018-01-25,19.34,19.34,19.34,19.34,19.301397,0
18,2018-01-26,19.620001,19.98,19.1,19.93,19.890221,81989500
19,2018-01-29,19.67,20.049999,19.57,19.85,19.810381,55726200
20,2018-01-30,19.77,19.77,19.360001,19.49,19.451097,46203000
21,2018-01-31,19.74,19.93,19.68,19.700001,19.660681,41576600


In [198]:
df_complete = pd.concat(objs=[df_train, df_test], axis=0)
df_complete.reset_index(inplace=True, drop=True)
df_complete.shape

(1264, 7)

In [199]:
df_complete.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1259,2018-01-25,19.34,19.34,19.34,19.34,19.301397,0.0
1260,2018-01-26,19.620001,19.98,19.1,19.93,19.890221,81989500.0
1261,2018-01-29,19.67,20.049999,19.57,19.85,19.810381,55726200.0
1262,2018-01-30,19.77,19.77,19.360001,19.49,19.451097,46203000.0
1263,2018-01-31,19.74,19.93,19.68,19.700001,19.660681,41576600.0


In [235]:
x, y = daysbefore(df_complete, 90, 'Open')

In [236]:
x.shape, y.shape

((1174, 90), (1174,))

In [237]:
df_open_test = pd.DataFrame(data=x)
df_open_test['target'] = y
df_open_test.shape

(1174, 91)

In [239]:
df_open_test.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,target
1169,15.25,15.85,15.6,15.79,15.86,15.7,15.37,15.5,15.19,15.6,...,17.040001,17.32,17.35,17.92,18.35,18.309999,18.26,18.4,18.42,19.34
1170,15.85,15.6,15.79,15.86,15.7,15.37,15.5,15.19,15.6,15.9,...,17.32,17.35,17.92,18.35,18.309999,18.26,18.4,18.42,19.34,19.620001
1171,15.6,15.79,15.86,15.7,15.37,15.5,15.19,15.6,15.9,15.88,...,17.35,17.92,18.35,18.309999,18.26,18.4,18.42,19.34,19.620001,19.67
1172,15.79,15.86,15.7,15.37,15.5,15.19,15.6,15.9,15.88,15.66,...,17.92,18.35,18.309999,18.26,18.4,18.42,19.34,19.620001,19.67,19.77
1173,15.86,15.7,15.37,15.5,15.19,15.6,15.9,15.88,15.66,15.61,...,18.35,18.309999,18.26,18.4,18.42,19.34,19.620001,19.67,19.77,19.74


In [240]:
x = np.reshape(x, (x.shape[0], x.shape[1], 1))
x.shape, y.shape

((1174, 90, 1), (1174,))

In [241]:
pred = clf.predict(x)

In [242]:
pred.shape, y.shape

((1174, 1), (1174,))

In [243]:
y.mean() - pred.mean()

0.25753281960264296

In [244]:
y.std() - pred.std()

-0.018483486639921054

In [245]:
import matplotlib.pyplot as plt

%matplotlib notebook

In [247]:
plt.plot(y, color='red', label='real value')
plt.plot(pred, color='blue', label='prediction')
plt.title('PETR4 - Stock OPEN price prediction')
plt.xlabel('time line')
plt.ylabel('Value R$ (Yahoo)')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

## ## Multiple inputs

In [248]:
def daysbefore(df, days_before, col:None):
    feature = []
    target = []
    for i in range(days_before, df.shape[0]):
        slice_start = i - days_before
        slice_stop = i
        feature.append(df[col].iloc[slice_start:slice_stop].values)
        target.append(df[col].iloc[i])
    return np.array(feature), np.array(target)
