In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Dropout

import matplotlib.pyplot as plt

#Import data

In [2]:
!wget http://www.playnow.com/resources/documents/downloadable-numbers/649.zip
!unzip 649.zip

--2022-07-16 11:33:43--  http://www.playnow.com/resources/documents/downloadable-numbers/649.zip
Resolving www.playnow.com (www.playnow.com)... 198.162.228.12
Connecting to www.playnow.com (www.playnow.com)|198.162.228.12|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.playnow.com/resources/documents/downloadable-numbers/649.zip [following]
--2022-07-16 11:33:43--  https://www.playnow.com/resources/documents/downloadable-numbers/649.zip
Connecting to www.playnow.com (www.playnow.com)|198.162.228.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47045 (46K) [application/zip]
Saving to: ‘649.zip’


2022-07-16 11:33:43 (325 KB/s) - ‘649.zip’ saved [47045/47045]

Archive:  649.zip
  inflating: 649.csv                 


In [3]:
df = pd.read_csv("649.csv")
df.head(3)

Unnamed: 0,PRODUCT,DRAW NUMBER,SEQUENCE NUMBER,DRAW DATE,NUMBER DRAWN 1,NUMBER DRAWN 2,NUMBER DRAWN 3,NUMBER DRAWN 4,NUMBER DRAWN 5,NUMBER DRAWN 6,BONUS NUMBER
0,649,1,0,1982-06-12,3,11,12,14,41,43,13
1,649,2,0,1982-06-19,8,33,36,37,39,41,9
2,649,3,0,1982-06-26,1,6,23,24,27,39,34


#Data pre-processing

In [4]:
def get_winning_number(df): 
  number_set = []
  for index, row in df.iterrows(): 
    sorted_num = sorted(row[4:].tolist())
    number_set.append(sorted_num) #df column number drawn 1 to Bonus 
  print("The total of",len(number_set), "draw winning numbers are retrieved")
  return number_set
winning_numbers = get_winning_number(df)
print(winning_numbers[0])

The total of 4015 draw winning numbers are retrieved
[3, 11, 12, 13, 14, 41, 43]


#### Since winning numbers are considered as 'categorical data', the data are one hot encoded

In [5]:
def number2onehot(numbers):
  encoded = np.zeros(49) 
  for number in numbers: 
    encoded[number - 1] = 1
  return encoded

def onehot2number(numbers): 
  decoded  = []
  index = 0
  for number in numbers: 
    if number == 1: 
      decoded.append(index + 1)
    index += 1 
  return decoded

In [6]:
enc_winning_numbers = list(map(number2onehot,winning_numbers))
# decoded_winning_numbers = list(map(onehot2number, enc_winning_numbers))

total_draws = len(enc_winning_numbers)
X = enc_winning_numbers[0:total_draws-1]
y = enc_winning_numbers[1:total_draws]

In [17]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(X, y, test_size = 0.05, random_state=42) #80:20 
train_input, val_input, train_target, val_target = train_test_split(train_input, train_target, test_size = 0.05, random_state=42)
print("train: {0}, val: {1}, test: {2}".format(len(train_input), len(val_input), len(test_input)))
print(train_input[0].shape)

train: 3622, val: 191, test: 201
(49,)


#Modeling

In [18]:
train_input = np.array(train_input)
train_target = np.array(train_target)
val_input = np.array(val_input)
val_target = np.array(val_target)

In [19]:
print(train_input.shape, train_target.shape)
print(val_input.shape, val_target.shape)

(3622, 49) (3622, 49)
(191, 49) (191, 49)


In [22]:
model = Sequential()
model.add(LSTM(128, input_shape=(1, 49)))
model.add(Dense(49, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 128)               91136     
                                                                 
 dense_2 (Dense)             (None, 49)                6321      
                                                                 
Total params: 97,457
Trainable params: 97,457
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 매 에포크마다 훈련과 검증의 손실 및 정확도를 기록하기 위한 변수
train_loss = []
train_acc = []
val_loss = []
val_acc = []

for epoch in range(500):

    model.reset_states() # important to reset states because we are training from the first draw every epoch 

    batch_train_loss = []
    batch_train_acc = []

    #training 
    for i in range(0, len(train_input)):
        
        xs = train_input[i].reshape(1, 1, 49)
        ys = train_target[i].reshape(1, 49)
        
        loss, acc = model.train_on_batch(xs, ys) 

        batch_train_loss.append(loss)
        batch_train_acc.append(acc)

    train_loss.append(np.mean(batch_train_loss))
    train_acc.append(np.mean(batch_train_acc))

    batch_val_loss = []
    batch_val_acc = []

    #validation
    for i in range(0, len(val_input)):

        xs = val_input[i].reshape(1, 1, 49)
        ys = val_target[i].reshape(1, 49)
        
        loss, acc = model.test_on_batch(xs, ys) 
        
        batch_val_loss.append(loss)
        batch_val_acc.append(acc)

    val_loss.append(np.mean(batch_val_loss))
    val_acc.append(np.mean(batch_val_acc))

    print('epoch {0:4d} train acc {1:0.3f} loss {2:0.3f} val acc {3:0.3f} loss {4:0.3f}'.format(epoch, np.mean(batch_train_acc), np.mean(batch_train_loss), np.mean(batch_val_acc), np.mean(batch_val_loss)))

epoch    0 train acc 0.109 loss 0.321 val acc 0.010 loss 0.524
epoch    1 train acc 0.110 loss 0.319 val acc 0.016 loss 0.528
epoch    2 train acc 0.109 loss 0.317 val acc 0.016 loss 0.531
epoch    3 train acc 0.110 loss 0.315 val acc 0.016 loss 0.535
epoch    4 train acc 0.112 loss 0.314 val acc 0.016 loss 0.539
epoch    5 train acc 0.113 loss 0.312 val acc 0.016 loss 0.542
epoch    6 train acc 0.115 loss 0.310 val acc 0.016 loss 0.546
epoch    7 train acc 0.114 loss 0.308 val acc 0.016 loss 0.550
epoch    8 train acc 0.116 loss 0.307 val acc 0.016 loss 0.553
epoch    9 train acc 0.117 loss 0.305 val acc 0.016 loss 0.557
epoch   10 train acc 0.117 loss 0.304 val acc 0.016 loss 0.561
epoch   11 train acc 0.117 loss 0.302 val acc 0.016 loss 0.564
epoch   12 train acc 0.117 loss 0.300 val acc 0.010 loss 0.568
epoch   13 train acc 0.118 loss 0.299 val acc 0.005 loss 0.572
epoch   14 train acc 0.119 loss 0.298 val acc 0.005 loss 0.575
epoch   15 train acc 0.118 loss 0.296 val acc 0.005 los

In [None]:
import matplotlib.pyplot as plt

fig, loss_ax = plt.subplots()

acc_ax = loss_ax.twinx()

loss_ax.plot(train_loss, 'y', label='train loss')
loss_ax.plot(val_loss, 'r', label='val loss')

acc_ax.plot(train_acc, 'b', label='train acc')
acc_ax.plot(val_acc, 'g', label='val acc')

loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
acc_ax.set_ylabel('accuray')

loss_ax.legend(loc='upper left')
acc_ax.legend(loc='lower left')

plt.show()

In [None]:
from google.colab import files
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
files.download('finalized_model.sav')

#Testing performance