In [None]:
!ls -a

In [None]:
import pandas as pd

In [None]:
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path

In [None]:
# Load environment variables from .env files.
# The first call lets python-dotenv search for a .env file on its own; the
# second call explicitly loads the .env located in the resolved current
# working directory. The order of the two calls is kept as-is on purpose.
load_dotenv(verbose=True)
dotenv_path = join(Path().resolve(), '.env')
load_dotenv(dotenv_path)

In [None]:
# Base path used later as the prefix of the CSV locations (presumably the
# mounted Google Drive root — confirm). os.environ.get returns None when the
# variable is not set, in which case the string concatenation in the loading
# cell will raise a TypeError.
GOOGLE_DRIVE_PATH = os.environ.get("GOOGLE_DRIVE_PATH")

In [None]:
os.listdir("data")

In [None]:
# Substrings that select which CSV files belong to the test / train split.
test_year = "2009"
train_year = "horse-2008"

# Collect the per-file frames first and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies all previously loaded rows on every iteration.
# NOTE(review): file names are enumerated from the relative "data" directory
# but read from the GOOGLE_DRIVE_PATH-based location — presumably both point
# at the same folder; confirm.
test_frames = []
train_frames = []
for name in os.listdir("data"):
    if test_year in name:
        test_frames.append(pd.read_csv(GOOGLE_DRIVE_PATH + '/horse_racing/data/' + name))
    elif train_year in name:
        train_frames.append(pd.read_csv(GOOGLE_DRIVE_PATH + '/horse_racing/data/' + name))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file matched. ignore_index=True yields a fresh 0..n-1 index.
test_data = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()
train_data = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()

In [None]:
# Create dummy data: doubling the frame ten times in a row is the same as
# stacking 2**10 copies of it, so build that in a single concat.
# NOTE(review): this cell is not idempotent — every re-run multiplies the
# number of rows by 1024 again.
n_doublings = 10
train_data = pd.concat([train_data] * (2 ** n_doublings), ignore_index=True)

In [None]:
test_data.head()

In [None]:
train_data.head()

In [None]:
train_data.info()

すぐに使えそうな特徴量
* frame_number, burden_weight, odds, popular

加工が必要な特徴量
* horse_id, sex_and_age, rider_id, half_way_rank, horse_weight, tamer_id

In [None]:
test_data.head()

# Preview how many columns one-hot encoding the ID features would produce.
# The ID columns are cast to strings so that get_dummies treats them as
# categorical. The frame is built in one expression (column selection followed
# by a dict-based astype, which returns a new frame) instead of assigning into
# a subset of test_data, which triggers SettingWithCopyWarning.
id_columns = ["horse_id", "rider_id", "tamer_id"]
a = test_data[["frame_number", "burden_weight", "odds", "popular"] + id_columns].astype(
    {column: "str" for column in id_columns}
)
pd.get_dummies(a).info()

## 前処理

### ラベルの作成
問題は２値分類 (ラベル: 0 => 1~3着, 1 => 4着以降)

In [None]:
def make_label(rank):
    """Convert finishing ranks into binary labels.

    Args:
        rank: Iterable of finishing ranks. Entries may be strings ("1"),
            integers (1) or integral floats (1.0); anything else (for example
            non-numeric text or NaN) is treated as "not in the top three".

    Returns:
        A list with one entry per input element: 0 when the rank is 1, 2 or 3,
        and 1 otherwise.
    """
    top_three = {"1", "2", "3"}

    def _as_key(value):
        # A rank column parsed as numbers would otherwise never match the
        # string set and every row would silently get label 1.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value)

    return [0 if _as_key(r) in top_three else 1 for r in rank]

In [None]:
# Derive the binary labels for both splits from their "rank" column, then
# attach them to the corresponding frame as the "label" column.
train_data_label, test_data_label = (
    make_label(frame["rank"].values) for frame in (train_data, test_data)
)
train_data["label"] = train_data_label
test_data["label"] = test_data_label

In [None]:
train_data.head()

In [None]:
# Build the feature matrices and label vectors used for training and evaluation.
FEATURE_COLUMNS = ["frame_number", "burden_weight", "odds", "popular"]

x_train = train_data[FEATURE_COLUMNS]
y_train = train_data["label"]

# The evaluation split must come from test_data; taking it from train_data
# would report metrics on the very rows the model was fitted on.
x_test = test_data[FEATURE_COLUMNS]
y_test = test_data["label"]

In [None]:
# TensorFlow is needed here, before the model cell, so import it in this cell;
# otherwise a fresh kernel run fails with NameError on `tf`.
import tensorflow as tf

# Shuffle and batch the training data; the test data is only batched.
# NOTE(review): the shuffle buffer (10000) may be smaller than the number of
# training rows, in which case the shuffle is only approximate — confirm this
# is acceptable.
train_ds = tf.data.Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(10000).batch(32)

test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

In [None]:
import tensorflow as tf
# Import the class directly rather than aliasing the module as `model`: the
# name `model` is bound to the model instance below, and using it for the
# module as well makes the cell confusing to read.
from utils.sample_model import HorseModel

# Create the model instance; the constructor receives the number of feature
# columns in x_train.
model = HorseModel(x_train.shape[1])

In [None]:
# Optimizer and loss, both created with the library defaults.
# NOTE(review): SparseCategoricalCrossentropy defaults to from_logits=False,
# i.e. it expects the model to output probabilities — confirm against
# HorseModel's final layer.
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

In [None]:
# Running metrics, one (mean loss, accuracy) pair per split. They are updated
# inside train_step / test_step.
train_loss, train_accuracy = (
    tf.keras.metrics.Mean(name='train_loss'),
    tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy'),
)
test_loss, test_accuracy = (
    tf.keras.metrics.Mean(name='test_loss'),
    tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy'),
)

In [None]:
@tf.function
def train_step(horse_data, labels):
    """Run one optimisation step on a single batch.

    Computes the loss of `model` on the batch, applies the gradients with
    `optimizer`, and updates the `train_loss` / `train_accuracy` metrics.

    Args:
        horse_data: Batch of input features passed to the model.
        labels: Batch of target labels passed to the loss and accuracy metric.
    """
    # NOTE(review): the model is called without `training=True`; if HorseModel
    # contains layers that behave differently during training (e.g. Dropout),
    # that flag should be passed — confirm against the model definition.
    with tf.GradientTape() as grad_tape:
        outputs = model(horse_data)
        batch_loss = loss_object(labels, outputs)

    weights = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_loss(batch_loss)
    train_accuracy(labels, outputs)

In [None]:
@tf.function
def test_step(horse_data, labels):
    """Evaluate a single batch and update the test metrics.

    Runs `model` on the batch without computing gradients and feeds the
    result into `test_loss` / `test_accuracy`.

    Args:
        horse_data: Batch of input features passed to the model.
        labels: Batch of target labels passed to the loss and accuracy metric.
    """
    outputs = model(horse_data)
    batch_loss = loss_object(labels, outputs)

    test_loss(batch_loss)
    test_accuracy(labels, outputs)

In [None]:
EPOCHS = 5

template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'

for epoch in range(EPOCHS):
    # One full pass over the training batches, then over the test batches.
    for batch_features, batch_labels in train_ds:
        train_step(batch_features, batch_labels)

    for batch_features, batch_labels in test_ds:
        test_step(batch_features, batch_labels)

    # Report the metrics accumulated during this epoch (accuracies in percent).
    print(template.format(epoch+1,
                          train_loss.result(),
                          train_accuracy.result()*100,
                          test_loss.result(),
                          test_accuracy.result()*100))

    # Reset the metrics so the next epoch starts from a clean state.
    # NOTE(review): newer Keras versions name this method `reset_state` —
    # confirm against the installed TensorFlow version.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        metric.reset_states()

# TODO
* モデル作成 (別ファイルで作成できるようにする)
* jupyter 出力無視
* データ整形
* グラフ作成