<a href="https://colab.research.google.com/github/ladofa/edu/blob/master/Example_lotto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 개요
 - 1회차부터 현재까지 로또 번호를 알고 있으면 그 다음 로또 번호를 예측할 수 있는 시스템을 딥러닝으로 구현
 - 입력 : 현재까지의 로또 번호
   * shape : (N, t, 45 * 2)
   * 메인 번호와 보너스 번호 두 개의 채널로 구성
   * 메인 번호 : multi-hot 인코딩 (길이 45 벡터에서 당첨된 6개 번호 위치는 1, 나머지는 0)
   * 보너스 번호 : one hot 인코딩
 - 출력 : 다음 로또 번호
   * 6개의 메인 번호, 1개의 보너스 번호
   * 메인 번호(main) : (N, 45), 당첨번호 1, 그 외 0 (binary_crossentropy)
   * 보너스 번호(sub) : 모델 출력은 (N, 45) softmax 확률, 정답 레이블은 0~44 정수 (sparse_categorical_crossentropy)

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Stateful LSTM 선행학습

In [2]:
# Toy stateful network: two stacked LSTMs followed by a one-unit regressor.
# The batch size is pinned to 1, the sequence length is left open (None) and
# each time step carries a single feature.
model = keras.models.Sequential()
model.add(keras.layers.LSTM(units=10, return_sequences=True, stateful=True,
                            batch_input_shape=[1, None, 1]))
model.add(keras.layers.LSTM(units=10, stateful=True))
model.add(keras.layers.Dense(units=1))

In [3]:
model.compile(optimizer='nadam', loss='mse')
# The sequence 0..8 shaped as (batch=1, time=9, features=1).
sample = np.arange(9).reshape(1, 9, 1)

for epoch in range(1000):
    # Begin every pass over the sequence from a cleared LSTM state.
    for lstm_layer in model.layers[:-1]:
        lstm_layer.reset_states()

    total_loss = 0
    # Feed exactly one time step per call; the target is the following value.
    # The stateful LSTMs carry their state from one call to the next.
    for step in range(8):
        x = sample[:, step:step + 1, :]
        y = sample[:, step + 1:step + 2, :]
        total_loss += model.train_on_batch(x, y)

    if epoch % 100 == 0:
        print(epoch, total_loss)

# do not run
# model.fit(sample[:, :8, :], sample[:, 1:, :], epochs=100)


0 210.7184820175171
100 3.9147999100387096
200 0.1783169927784911
300 0.17014859741402688
400 0.08715382154332474
500 0.09522466907310445
600 0.05914029003815813
700 0.049783330040554574
800 0.043782850396382855
900 0.038815279644040857


In [4]:
# Clear the LSTM state, then predict the next value one time step at a time.
for lstm_layer in model.layers[:-1]:
    lstm_layer.reset_states()
for step in range(8):
    print(model(sample[:, step:step + 1, :]))

tf.Tensor([[1.0089418]], shape=(1, 1), dtype=float32)
tf.Tensor([[2.0095718]], shape=(1, 1), dtype=float32)
tf.Tensor([[3.0610096]], shape=(1, 1), dtype=float32)
tf.Tensor([[4.059567]], shape=(1, 1), dtype=float32)
tf.Tensor([[5.079951]], shape=(1, 1), dtype=float32)
tf.Tensor([[6.0963693]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.06233]], shape=(1, 1), dtype=float32)
tf.Tensor([[8.050302]], shape=(1, 1), dtype=float32)


In [5]:
# Same eight single-step calls as the previous cell, but this time without
# calling reset_states() first, so the stateful LSTMs continue from whatever
# state the previous cell left behind.
for i in range(8):
    print(model(sample[:, i:i+1, :]))

tf.Tensor([[8.073675]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.7889504]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.5429497]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.3707404]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.3045745]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.3486443]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.509231]], shape=(1, 1), dtype=float32)
tf.Tensor([[7.9287033]], shape=(1, 1), dtype=float32)


In [6]:
# Switch the `stateful` flag off on every layer except the final Dense layer
# (i.e. on both LSTM layers).
for layer in model.layers[:-1]:
    layer.stateful = False

In [7]:
# Feed the first three time steps (values 0, 1, 2) in a single call instead of
# one step per call.
model(sample[:, 0:3, :])

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[3.0610094]], dtype=float32)>

In [8]:
# Print the layer-by-layer structure and parameter counts of the toy model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (1, None, 10)             480       
                                                                 
 lstm_1 (LSTM)               (1, 10)                   840       
                                                                 
 dense (Dense)               (1, 1)                    11        
                                                                 
Total params: 1,331
Trainable params: 1,331
Non-trainable params: 0
_________________________________________________________________


# 데이터 준비


## lotto 사이트에서 데이터 받아오기

https://dhlottery.co.kr/gameResult.do?method=byWin 에서 데이터를 받는다. 받은 데이터를 xlsx 형태로 변환한다.

최종적으로 ndarray로 저장

In [9]:
import openpyxl
#참고 : https://openpyxl.readthedocs.io/en/stable/tutorial.html

# Read the draw results from the 'excel' sheet of lotto.xlsx.
xlsx = openpyxl.load_workbook('lotto.xlsx')
ws = xlsx['excel']
# Hard-coded cell range: columns N..T (7 columns), rows 4..994.
# NOTE(review): the end row must be updated by hand when newer draws are added.
cells = ws['n4':'t994']

# Pull the raw cell values out as a list of rows.
values = [[x.value for x in row] for row in cells]

# NOTE(review): later cells treat row order as time order -- confirm that the
# sheet lists draws oldest-first (reverse the rows here if it does not).
data = np.array(values) - 1 # shift to 0-based: every number is now 0..44
# Number of draws (rows) that were loaded.
t_max = data.shape[0]

## 학습에 맞게 변환
주요 당첨 번호는 data_main으로, 보너스는 data_sub로 바꾼다. data_main은 one hot 인코딩과 유사하게 정답은 1, 오답은 0으로 하여 binary_crossentropy 로 계산한다. 보너스는 평범한 classifier 문제로 생각하여 label을 그대로 저장한다.

In [10]:
# Row index of every drawn number: each draw contributes 7 entries.
coord_y = np.arange(t_max).repeat(7)

# Column index of every drawn number. The bonus number (column 6) is shifted
# by 45 so that it lands in the second 45-wide channel.
coord_x = data.copy()
coord_x[:, 6] += 45
coord_x = coord_x.reshape(-1)

# One row per draw: columns 0..44 mark the main numbers, 45..89 the bonus.
coding_data = np.zeros((t_max, 45 * 2), dtype=np.uint8)
coding_data[coord_y, coord_x] = 1

# Inputs are every row but the last; targets are the row that follows each
# input. A leading batch axis of size 1 is added to all three arrays.
x_train = coding_data[np.newaxis, :-1]
y_train_main = coding_data[np.newaxis, 1:, :45]
y_train_sub = data[np.newaxis, 1:, 6]

제대로 변환했는지 확인

In [11]:
# Sanity check: list the columns set to 1 in the first encoded row.
# Note that this rebinds `sample`, which earlier held the toy 0..8 sequence.
sample = coding_data[0]
np.where(sample == 1)[0]

array([12, 17, 24, 30, 32, 43, 82])

# 모델 설계

In [12]:
def top6_accuracy(y_true, y_pred):
    """Return the mean number of main numbers guessed correctly per sample.

    The six highest-scoring positions of `y_pred` are compared with the six
    highest positions of `y_true` (for the multi-hot targets used in this
    file those are the six winning numbers), and the overlap is counted for
    every row of the batch.

    The previous implementation returned the last dimension of the densified
    intersection, which is the *largest* overlap found in the batch and an
    int32 value. Counting per row and averaging gives the same number for a
    batch of one, and a correct float metric for larger batches.

    Args:
        y_true: 2-D tensor of targets, one row per sample.
        y_pred: 2-D tensor of predicted scores with the same shape.

    Returns:
        Scalar float32 tensor: average count of matching indices (0 to 6).
    """
    pred_top6 = tf.math.top_k(y_pred, k=6)[1]
    true_top6 = tf.math.top_k(y_true, k=6)[1]
    # Row-wise set intersection of the two index sets, then its size per row.
    hits = tf.sets.size(tf.sets.intersection(pred_top6, true_top6))
    return tf.reduce_mean(tf.cast(hits, tf.float32))

In [13]:
# Recurrent trunk: three stacked stateful LSTMs. Batch size is fixed to 1,
# sequence length is open, and every time step has 45 * 2 features.
model_lstm = keras.models.Sequential()
model_lstm.add(keras.layers.LSTM(units=64, return_sequences=True, stateful=True,
                                 batch_input_shape=[1, None, 45 * 2]))
model_lstm.add(keras.layers.LSTM(units=64, return_sequences=True, stateful=True))
model_lstm.add(keras.layers.LSTM(units=64, stateful=True))

# Two heads on top of the trunk: independent sigmoids for the main numbers
# and a softmax over the 45 candidates for the bonus number.
out_main = keras.layers.Dense(units=45, activation='sigmoid')(model_lstm.output)
out_sub = keras.layers.Dense(units=45, activation='softmax')(model_lstm.output)

model = keras.models.Model(inputs=model_lstm.input, outputs=[out_main, out_sub])

# Relative weights of the two losses in the total loss.
rate_main = 10
rate_sub = 1
model.compile(
    optimizer='nadam',
    loss=['binary_crossentropy', 'sparse_categorical_crossentropy'],
    loss_weights=[rate_main, rate_sub],
    metrics=([top6_accuracy], ['accuracy']),
)

In [14]:
history = []

# x_train is (1, steps, 90): split the time axis into warm-up, training and
# validation ranges once, up front.
n_steps = len(x_train[0])
warmup_end = 100
valid_start = n_steps - 50

for epoch in range(30):
    for lstm_layer in model_lstm.layers:
        lstm_layer.reset_states()

    # Assume about 100 draws of history are needed before prediction is
    # possible: run them through the model only to build up the LSTM state.
    model(x_train[:, :warmup_end])

    # Train on one draw per call; the last 50 draws are kept for validation.
    train_loss = []
    for i in range(warmup_end, valid_start):
        targets = (y_train_main[:, i], y_train_sub[:, i])
        loss = model.train_on_batch(x_train[:, i:i+1], targets)
        train_loss.append(loss)
    train_loss = np.array(train_loss).mean(axis=0)
    print(epoch, train_loss.round(3))

    # Validation: continue from the state left by training and compute the
    # same five values that train_on_batch reports, by hand.
    valid_loss = []
    for i in range(valid_start, n_steps):
        y_pred = model(x_train[:, i:i+1])
        pred_main, pred_sub = y_pred
        y_true_main = y_train_main[:, i]
        y_true_sub = y_train_sub[:, i]

        loss_main = tf.reduce_mean(
            tf.losses.binary_crossentropy(y_true_main, pred_main))
        loss_sub = tf.reduce_mean(
            tf.losses.sparse_categorical_crossentropy(y_true_sub, pred_sub))
        loss = rate_main * loss_main + rate_sub * loss_sub

        acc_main = top6_accuracy(y_true_main, pred_main)
        acc_sub = tf.reduce_mean(
            keras.metrics.sparse_categorical_accuracy(y_true_sub, pred_sub))

        valid_loss.append([loss, loss_main, loss_sub, acc_main, acc_sub])
    valid_loss = np.array(valid_loss).mean(axis=0)
    print(epoch, '                                        ', valid_loss.round(3))

    history.append([train_loss, valid_loss])


0 [7.981 0.405 3.934 0.801 0.027]
0                                          [7.74  0.397 3.769 0.62  0.1  ]
1 [7.795 0.395 3.849 0.807 0.03 ]
1                                          [7.76  0.397 3.792 0.6   0.1  ]
2 [7.785 0.394 3.842 0.813 0.029]
2                                          [7.759 0.396 3.794 0.6   0.   ]
3 [7.778 0.394 3.836 0.82  0.029]
3                                          [7.758 0.396 3.795 0.6   0.   ]
4 [7.765 0.394 3.827 0.83  0.03 ]
4                                          [7.762 0.396 3.797 0.58  0.   ]
5 [7.745 0.393 3.811 0.861 0.033]
5                                          [7.78  0.397 3.808 0.56  0.06 ]
6 [7.704 0.393 3.775 0.915 0.046]
6                                          [7.758 0.397 3.786 0.68  0.   ]
7 [7.617 0.392 3.696 0.952 0.056]
7                                          [7.772 0.398 3.79  0.64  0.   ]
8 [7.479 0.391 3.57  1.017 0.07 ]
8                                          [7.805 0.399 3.81  0.7   0.04 ]
9 [7.308 0.39  3.41

In [15]:
# Shape of a single-step main-number target as passed to train_on_batch.
y_train_main[:, 0].shape

(1, 45)

In [16]:
# Clear the LSTM state before replaying the history.
for layer in model_lstm.layers:
        layer.reset_states()

# Run every encoded draw through the model in one call; the two outputs are
# the main-number scores and the bonus-number probabilities that follow the
# last row of coding_data.
y_pred = model(coding_data[None])


In [17]:
# Main-number indices (0-based) ordered by ascending predicted score, so the
# highest-scoring candidates are the last entries of the array.
y_pred[0][0].numpy().argsort()

array([ 5,  9, 27, 15,  8, 37, 29, 14,  2, 12, 10, 18, 40, 35, 21, 25, 42,
       32,  0, 26, 22, 38, 33, 41, 30, 24,  3, 23,  6, 20, 31, 44, 39, 36,
       11, 28, 17,  1,  7, 13, 19, 34, 16, 43,  4])

In [18]:
# Same ranking as above, with 1 added to turn 0-based indices back into
# lotto numbers 1..45 (the loader subtracted 1 from every number).
main = y_pred[0][0].numpy()
main.argsort() + 1

array([ 6, 10, 28, 16,  9, 38, 30, 15,  3, 13, 11, 19, 41, 36, 22, 26, 43,
       33,  1, 27, 23, 39, 34, 42, 31, 25,  4, 24,  7, 21, 32, 45, 40, 37,
       12, 29, 18,  2,  8, 14, 20, 35, 17, 44,  5])

In [19]:
# Repeat the full prediction from a cleared state in a single cell.
for layer in model_lstm.layers:
        layer.reset_states()

y_pred = model(coding_data[None])
main = y_pred[0][0].numpy()
# Ascending ranking of main numbers (1..45); the best candidates come last.
main.argsort() + 1

array([ 6, 10, 28, 16,  9, 38, 30, 15,  3, 13, 11, 19, 41, 36, 22, 26, 43,
       33,  1, 27, 23, 39, 34, 42, 31, 25,  4, 24,  7, 21, 32, 45, 40, 37,
       12, 29, 18,  2,  8, 14, 20, 35, 17, 44,  5])

In [20]:
# 0-based index of the most probable bonus number (add 1 for the actual
# lotto number, since the loader subtracted 1 from every number).
y_pred[1][0].numpy().argmax()

23

In [21]:
# Predicted softmax probability for bonus index 2 (0-based).
y_pred[1][0][2]

<tf.Tensor: shape=(), dtype=float32, numpy=0.00013949312>