In [1]:

%matplotlib inline
import os
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SimpleRNN

from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error
import FinanceDataReader as fdr

Using TensorFlow backend.


In [2]:
pandf = fdr.DataReader("KS11","2009-01-01","2019-01-01")


In [3]:
#특성 추가 ------------------------------------------
    #이동평균선
def get_MA(df):
    MA_26=df["Close"].rolling(26).mean()
    MA_52=df["Close"].rolling(52).mean()
    df=df.assign(MA_26=MA_26,MA_52=MA_52).dropna()
    
    return df

    
    #스토캐스틱
def get_stochastic(df, n=15, m=5, t=3):
    # n일중 최고가
    ndays_high = df.High.rolling(window=n, min_periods=1).max()
    # n일중 최저가
    ndays_low = df.Low.rolling(window=n, min_periods=1).min()
 
    # Fast%K 계산
    kdj_k = ((df.Close - ndays_low) / (ndays_high - ndays_low))*100
    # Fast%D (=Slow%K) 계산
    kdj_d = kdj_k.ewm(span=m).mean()
    # Slow%D 계산
    kdj_j = kdj_d.ewm(span=t).mean()
 
    # dataframe에 컬럼 추가
    df = df.assign(kdj_k=kdj_k, kdj_d=kdj_d, kdj_j=kdj_j).dropna()
    
    return df
   
    #시간
def get_time(df):
    time=np.linspace(0,10,len(df),endpoint=False).reshape(-1,1)
    df=df.assign(time=time)
    
    return df

In [4]:
df=pandf
df=get_time(df)
df=get_stochastic(df)
df=get_MA(df)
# convert nparray

nparr = df.values
nparr.astype('float32')
print(nparr)

[[1169.95       1176.86       1176.86       ...   85.79334492
  1104.68038462 1134.385     ]
 [1161.81       1180.39       1186.91       ...   87.42346904
  1103.58923077 1134.46980769]
 [1170.94       1165.03       1180.38       ...   88.69249321
  1103.24692308 1134.41923077]
 ...
 [2028.01       2028.81       2037.83       ...   21.65334887
  2078.89038462 2083.74788462]
 [2028.44       2032.09       2035.57       ...   19.78398169
  2076.80807692 2081.50403846]
 [2041.04       2036.7        2046.97       ...   20.81083906
  2075.44230769 2079.5025    ]]


In [5]:
# normalization
scaler = MinMaxScaler(feature_range=(0, 1))
nptf = scaler.fit_transform(nparr)

In [6]:
# 값과 이평선 비교하기 ----------------

MA_26=df["Close"].rolling(26).mean()
y_before=MA_26.dropna()
df_before=df["Close"][25:]
y=np.where(df_before.shift(-1)>y_before.shift(-1),1,0)
X=nptf[25:]

In [7]:
# split train, test
train_size = int(len(nptf) * 0.7)
y_train2=y[:train_size]
y_test2=y[train_size:]
X_train2 = X[:train_size]
X_test2 = X[train_size:]



In [8]:
# reshape input to be [samples, time steps, features]
X_train = np.reshape(X_train2, (X_train2.shape[0], 1, X_train2.shape[1]))
X_test = np.reshape(X_test2, (X_test2.shape[0], 1, X_test2.shape[1]))
y_train=y_train2
y_test=y_test2


In [9]:
# simple lstm network learning
model = Sequential()
model.add(LSTM(36, input_shape=(1, 12)))
for i in range(5):
    model.add(Dense(36,activation='sigmoid'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(X_train, y_train, epochs=10, batch_size=16,validation_split=0.1)
model.evaluate(X_test,y_test)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 1526 samples, validate on 170 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.35561769385623115, 0.8504273295402527]

In [10]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,np.where(model.predict(X_test)>0.5,1,0))

0.8504273504273504

In [11]:
model.predict(X_test)

array([[0.92429096],
       [0.93254197],
       [0.9329069 ],
       [0.93408287],
       [0.93908644],
       [0.94115055],
       [0.9404321 ],
       [0.9457123 ],
       [0.94672155],
       [0.94687545],
       [0.9465298 ],
       [0.9408624 ],
       [0.9423754 ],
       [0.9439957 ],
       [0.9448546 ],
       [0.9403703 ],
       [0.937649  ],
       [0.94007003],
       [0.940725  ],
       [0.9419297 ],
       [0.9395479 ],
       [0.94234324],
       [0.9414922 ],
       [0.93211347],
       [0.92391884],
       [0.91650784],
       [0.93512845],
       [0.93672216],
       [0.9185373 ],
       [0.27148354],
       [0.23717612],
       [0.09549609],
       [0.10200968],
       [0.10116923],
       [0.10418412],
       [0.09971598],
       [0.15944725],
       [0.8886317 ],
       [0.92125595],
       [0.92452717],
       [0.9305658 ],
       [0.919352  ],
       [0.9417004 ],
       [0.93826354],
       [0.93901783],
       [0.942936  ],
       [0.9412581 ],
       [0.923

In [20]:
df=df[25:]
df["y"]=y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
df

Unnamed: 0_level_0,Close,Open,High,Low,Volume,Change,time,kdj_k,kdj_d,kdj_j,MA_26,MA_52,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2009-04-22,1356.02,1347.79,1360.41,1336.41,730160000.0,0.0144,0.307195,87.423768,82.994507,82.438338,1271.283077,1188.537885,1
2009-04-23,1368.80,1367.30,1371.95,1353.16,747020000.0,0.0094,0.311237,97.167776,87.718930,85.078634,1278.931154,1191.805769,1
2009-04-24,1354.10,1371.81,1375.80,1343.55,683430000.0,-0.0107,0.315279,81.141914,85.526591,85.302613,1286.326923,1194.958077,1
2009-04-27,1339.83,1358.20,1366.66,1334.02,547300000.0,-0.0105,0.319321,68.740766,79.931316,82.616964,1292.822692,1198.034808,1
2009-04-28,1300.24,1344.77,1353.80,1298.86,759040000.0,-0.0295,0.323363,34.335622,64.732751,73.674858,1296.697308,1200.107885,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-21,2061.49,2052.70,2061.51,2049.76,311390000.0,0.0007,9.979790,19.565450,24.885223,27.398510,2083.118846,2087.757500,0
2018-12-24,2055.01,2050.38,2059.94,2046.18,285280000.0,-0.0031,9.983832,13.764143,21.178196,24.288353,2081.680769,2086.321731,0
2018-12-26,2028.01,2028.81,2037.83,2014.28,321500000.0,-0.0131,9.987874,14.698640,19.018344,21.653349,2078.890385,2083.747885,0
2018-12-27,2028.44,2032.09,2035.57,2021.39,398020000.0,0.0002,9.991916,15.707155,17.914615,19.783982,2076.808077,2081.504038,0


In [28]:
import matplotlib.pyplot as plt
import seaborn as sns
df.corr()

Unnamed: 0,Close,Open,High,Low,Volume,Change,time,kdj_k,kdj_d,kdj_j,MA_26,MA_52,y
Close,1.0,0.998109,0.999051,0.999236,-0.139608,0.011786,0.787133,0.012046,0.009546,0.007615,0.97706,0.957555,0.020577
Open,0.998109,1.0,0.999371,0.998882,-0.135742,-0.03266,0.786883,-0.009442,0.001211,0.003448,0.978599,0.958715,0.008714
High,0.999051,0.999371,1.0,0.99888,-0.135305,-0.017202,0.785793,-0.00649,-0.000806,0.000147,0.979812,0.960516,0.008723
Low,0.999236,0.998882,0.99888,1.0,-0.142034,-0.004952,0.788882,0.007813,0.010995,0.01045,0.97598,0.955971,0.020812
Volume,-0.139608,-0.135742,-0.135305,-0.142034,1.0,-0.037128,-0.114756,0.070808,0.105671,0.114117,-0.157492,-0.186812,0.069188
Change,0.011786,-0.03266,-0.017202,-0.004952,-0.037128,1.0,-0.028722,0.413369,0.148582,0.067494,-0.057683,-0.053502,0.241365
time,0.787133,0.786883,0.785793,0.788882,-0.114756,-0.028722,1.0,-0.099283,-0.116909,-0.121257,0.802918,0.814682,-0.08685
kdj_k,0.012046,-0.009442,-0.00649,0.007813,0.070808,0.413369,-0.099283,1.0,0.900767,0.807247,-0.158047,-0.167299,0.73449
kdj_d,0.009546,0.001211,-0.000806,0.010995,0.105671,0.148582,-0.116909,0.900767,1.0,0.98096,-0.170115,-0.192285,0.749996
kdj_j,0.007615,0.003448,0.000147,0.01045,0.114117,0.067494,-0.121257,0.807247,0.98096,1.0,-0.168131,-0.197001,0.722796


In [396]:
pandf.shape

(789, 6)

In [397]:
MA_26.shape

(713,)

In [398]:
X_train2.shape

(516, 12)