# Lottery prediction with LSTM model


made by J-D Park, Yonsei CSE Ph.D Student

In [225]:
# load dependacies
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [74]:
# load raw data
# os.getcwd()
rawdata = pd.read_csv('dataset/lottery_history.csv')

In [75]:
rawdata

Unnamed: 0.1,Unnamed: 0,1,2,3,4,5,6,bonus
0,959,1,14,15,24,40,41,35
1,958,2,9,10,16,35,37,1
2,957,4,15,24,35,36,40,1
3,956,10,11,20,21,25,41,40
4,955,4,9,23,26,29,33,8
...,...,...,...,...,...,...,...,...
954,5,16,24,29,40,41,42,3
955,4,14,27,30,31,40,42,2
956,3,11,16,19,21,27,31,30
957,2,9,13,21,25,32,42,2


In [76]:
# sort with ascending order
rawdata = rawdata.sort_values(by=['Unnamed: 0'], axis=0)
rawnp = rawdata.to_numpy()

In [77]:
rawnp_proc = rawnp[:,1:]
print(rawnp_proc)

[[10 23 29 ... 37 40 16]
 [ 9 13 21 ... 32 42  2]
 [11 16 19 ... 27 31 30]
 ...
 [ 4 15 24 ... 36 40  1]
 [ 2  9 10 ... 35 37  1]
 [ 1 14 15 ... 40 41 35]]


In [78]:
# to construct one-hot encoded input dataset
inputnp = np.zeros((len(rawnp),45))

In [79]:
# convert to one-hot features (multi-variate time-series)
i = 0
for row in rawnp_proc:
    for elem in row:
        #assign one-hot
        inputnp[i, elem-1] = 1
    i += 1

In [80]:
#final output
inputnp

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [81]:
# save to the local
np.save('data.npy', inputnp)

About model from now on -> future migration of codes

In [290]:
dataset = np.load('data.npy')
dataset_pd = pd.DataFrame(dataset)

In [291]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
  n_vars = 1 if type(data) is list else data.shape[1]
  df = pd.DataFrame(data)
  cols, names = list(), list()
  # input sequence (t-n, ... t-1)
  for i in range(n_in, 0, -1):
      cols.append(df.shift(i))
      names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
  # forecast sequence (t, t+1, ... t+n)
  for i in range(0, n_out):
      cols.append(df.shift(-i))
      if i == 0:
          names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
      else:
          names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
  # put it all together
  agg = pd.concat(cols, axis=1)
  agg.columns = names
  # drop rows with NaN values
  if dropnan:
      agg.dropna(inplace=True)
  return agg

In [292]:
reframed = series_to_supervised(dataset, 1, 1)

In [293]:
reframed

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var10(t-1),...,var36(t),var37(t),var38(t),var39(t),var40(t),var41(t),var42(t),var43(t),var44(t),var45(t)
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
955,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
957,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [294]:
# ENTIRE_NUMBER = 45

# values = reframed.values
# n_train_hours = int(0.8 * len(dataset))
# train = values[:n_train_hours, :]
# test = values[n_train_hours:, :]
# # split into input and outputs
# train_X, train_y = train[:, :-1], train[:, -1]
# test_X, test_y = test[:, :-1], test[:, -1]
# # reshape input to be 3D [samples, timesteps, features]
# train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
# test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

In [295]:
ENTIRE_NUMBER = 45

values = reframed.values
n_train_hours = int(0.8 * len(dataset))
train = values[:n_train_hours, :]
test = values[n_train_hours:n_train_hours+1, :]
# split into input and outputs
train_X, train_y = train[:, :ENTIRE_NUMBER], train[:, ENTIRE_NUMBER:]
test_X, test_y = test[:, :ENTIRE_NUMBER], test[:, ENTIRE_NUMBER:]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

In [296]:
train_X.shape[2]

45

In [297]:
model = Sequential()
model.add(LSTM(128, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(ENTIRE_NUMBER))
model.compile(loss='mse', optimizer='adam')

In [298]:
history = model.fit(train_X, train_y, epochs=10, batch_size=8, 
         verbose=2, shuffle=False)

Epoch 1/10
96/96 - 0s - loss: 0.1386
Epoch 2/10
96/96 - 0s - loss: 0.1308
Epoch 3/10
96/96 - 0s - loss: 0.1291
Epoch 4/10
96/96 - 0s - loss: 0.1279
Epoch 5/10
96/96 - 0s - loss: 0.1269
Epoch 6/10
96/96 - 0s - loss: 0.1261
Epoch 7/10
96/96 - 0s - loss: 0.1254
Epoch 8/10
96/96 - 0s - loss: 0.1248
Epoch 9/10
96/96 - 0s - loss: 0.1243
Epoch 10/10
96/96 - 0s - loss: 0.1237


In [299]:
yhat = model.predict(test_X)


In [300]:
yhat

array([[ 0.1883061 ,  0.05320678,  0.16360234,  0.14108801,  0.12809163,
         0.35185164,  0.2269278 ,  0.13521384,  0.07440365,  0.19711213,
         0.3528889 ,  0.06874692,  0.23552819,  0.29494694, -0.01834923,
         0.16106597,  0.03067926,  0.10306194,  0.04760702,  0.19481601,
         0.15245   , -0.07876786,  0.12156203,  0.25573358,  0.20910658,
         0.26953048,  0.29428062,  0.01463741,  0.19032341,  0.13220221,
         0.19059548,  0.16392511,  0.23256163,  0.03474504,  0.20236427,
         0.07094978,  0.03037872,  0.18603517,  0.13383919,  0.23467389,
         0.15983039,  0.09181807,  0.17738949,  0.2826555 ,  0.09778278]],
      dtype=float32)

In [301]:
yhat_assigned = np.argsort(-yhat[:])

In [302]:
ww = yhat_assigned[:,:6]


In [303]:
predicted

array([[10,  5, 13, 26, 43, 25]], dtype=int64)

In [304]:
gth

array([[10,  5, 25, 13, 26,  9]], dtype=int64)

In [305]:
for i in ww[0]:
    print(i)
    if i in gth:
        print('aa')
    else:
        print('bb')
    
    

10
aa
5
aa
13
aa
26
aa
43
bb
25
aa


In [306]:
gth = np.argsort(-test_y[:])
gth = yhat_assigned[:,:6]

In [307]:
len(gth.T)

6

In [308]:
test_y

array([[0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]])

In [309]:
# yhat = model.predict(test_X)
# test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# # invert scaling for forecast
# inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
# #inv_yhat = scaler.inverse_transform(inv_yhat)
# inv_yhat = inv_yhat[:,0]
# # invert scaling for actual
# test_y = test_y.reshape((len(test_y), 1))
# inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
# #inv_y = scaler.inverse_transform(inv_y)
# inv_y = inv_y[:,0]
# # calculate RMSE
# rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))

In [240]:
test_X.shape[1]

89