In [1]:
import pickle, copy, os, re, datetime, io
import pandas as pd
import numpy as np
from collections import Counter, deque, defaultdict
from datetime import timedelta as td
from datetime import datetime as dt

import matplotlib.pyplot as plt
import matplotlib as mpl

import tarfile
import glob
import boto3
import botocore
import sagemaker
import json

from io import StringIO, BytesIO

s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
sm_client = boto3.client('sagemaker', config = botocore.config.Config())

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.layers import IntegerLookup, Normalization, StringLookup
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor

from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder

2023-03-08 12:55:45.454936: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def encode_features(col):
    le = LabelEncoder()
    # print(c, len(set(col)))
    le.fit(col)
    col = le.transform(col)
    return col

def standardize_features(col):
    return (col-np.mean(col))/(np.std(col))

def normalize_features(col):
    return (col-min(col))/(max(col)-min(col))

class PrintDot(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    #if epoch == 0:
    print(str(epoch)+' ',end='')

def create_sagemaker_model(name, model_data_uri, instance_type):
    image_uri = sagemaker.image_uris.retrieve(
        framework='Tensorflow',
        version='2.9.2',
        instance_type=instance_type,
        region='us-west-2',
        py_version='py3',
        image_scope='inference'
    )
    sm_client.create_model(
        ModelName=name,
        ExecutionRoleArn=params['sm_role'],
        PrimaryContainer={
            'ModelDataUrl': model_data_uri,
            'Image': image_uri
        }
    )
    
def save_and_package_model(model_assets, path, bucket='hwm-nba', name=None, sagemaker_name=None,
                           upload='sagemaker', instance_type='ml.c5.large', key_data=None):
    if name is None:
        name = model_assets['name']
    if sagemaker_name is None:
        sagemaker_name = name
    tar_name = 'model.tar.gz'
    local_path = os.path.join(path, name, tar_name)
    tf.keras.models.save_model(model_assets['model'], os.path.join(path, name, '00001'))
    model_assets['history'].to_csv(os.path.join(path, name, 'history.csv'), index=False)
    tar = tarfile.open(os.path.join(path, name, tar_name), 'w:gz')
    for file_name in glob.glob(os.path.join(path, name, '*')):
        print('Adding %s...' % file_name)
        tar.add(file_name, os.path.basename(file_name))
    tar.close()
    if key_data is not None:
        key_dict = {'key': str([int(np.round(p)) for p in model_assets['model'].predict(key_data).squeeze()])}
        pd.DataFrame(key_data).to_csv(os.path.join(path, name, 'key_data.csv'), index=False)
        with open(os.path.join(path, name, 'key.json'), 'w') as outfile:
            json.dump(key_dict, outfile)
    if upload in('s3','sagemaker'):
        s3_path = os.path.join('models', name, tar_name)
        s3_full_uri = os.path.join('s3://', bucket, s3_path)
        s3_client.upload_file(local_path, bucket, s3_path)
        if key_data is not None:
            s3_client.upload_file(os.path.join(path, name, 'key_data.csv'), bucket, os.path.join('models', name, 'key_data.csv'))
            s3_client.upload_file(os.path.join(path, name, 'key.json'), bucket, os.path.join('models', name, 'key.json'))
        s3_client.upload_file(os.path.join(path, name, 'history.csv'), bucket, os.path.join('models', name, 'history.csv'))
        if upload == 'sagemaker':
            create_sagemaker_model(sagemaker_name, s3_full_uri, instance_type)

#####
##### **LOAD DATA**

In [43]:
# games_final = pd.read_csv('../data/processed/games_final.csv')
with open('../data/processed/games_final.pickle', 'rb') as f: games_final = pickle.load(f)

In [44]:
games_final[-100:-80]

Unnamed: 0,date,away,as,home,hs,ot,detail_path,game_type,game_subtype,diff,...,away_standing_losses,away_standing_wins_home,away_standing_losses_home,away_standing_points_home,away_standing_wins_away,away_standing_losses_away,away_standing_points_away,away_standing_diff,standing_diff,standing_points_diff
36239,2023-02-27,MIA,101,PHI,99,0,mia-vs-phi-0022200920,regular,0,2,...,31.0,19.0,10.0,3115.0,11.0,21.0,1208.0,-1.0,20.0,4642.0
36240,2023-02-27,ORL,101,NOP,93,0,orl-vs-nop-0022200922,regular,0,8,...,34.0,17.0,14.0,4996.0,10.0,20.0,1125.0,-7.0,6.0,3407.0
36241,2023-02-28,CHI,98,TOR,104,0,chi-vs-tor-0022200925,regular,0,6,...,32.0,19.0,12.0,4306.0,10.0,20.0,1177.0,-3.0,1.0,2163.0
36242,2023-02-28,DEN,133,HOU,112,0,den-vs-hou-0022200926,regular,0,21,...,20.0,28.0,4.0,5469.0,14.0,16.0,1619.0,22.0,-56.0,-688.0
36243,2023-02-28,IND,124,DAL,122,0,ind-vs-dal-0022200929,regular,0,2,...,34.0,19.0,14.0,3302.0,9.0,20.0,1053.0,-6.0,8.0,3448.0
36244,2023-02-28,LAL,109,MEM,121,0,lal-vs-mem-0022200927,regular,0,12,...,32.0,16.0,13.0,3960.0,13.0,19.0,1604.0,-3.0,14.0,2407.0
36245,2023-02-28,MIL,118,BKN,104,0,mil-vs-bkn-0022200924,regular,0,14,...,18.0,26.0,5.0,4127.0,16.0,13.0,1887.0,24.0,-14.0,1391.0
36246,2023-02-28,MIN,108,LAC,101,0,min-vs-lac-0022200932,regular,0,7,...,30.0,22.0,12.0,5639.0,11.0,18.0,1310.0,3.0,-4.0,414.0
36247,2023-02-28,POR,105,GSW,123,0,por-vs-gsw-0022200931,regular,0,18,...,32.0,17.0,14.0,3191.0,11.0,18.0,1247.0,-4.0,5.0,1686.0
36248,2023-02-28,SAC,123,OKC,117,0,sac-vs-okc-0022200928,regular,0,6,...,27.0,18.0,12.0,3349.0,15.0,15.0,1877.0,6.0,-8.0,3480.0


In [45]:
target = 'home_win'
skip_cols = ['date', 'detail_path', 'team_pair', 'team_pair_sorted', 'winner', 'winner', 'hs', 'diff',
             'as', 'ot', 't1_wins_after_game', 't2_wins_after_game', 'leader_after_game', 'series_winner']

In [46]:
col_map = {}
for c in games_final.columns:
    if c not in skip_cols: col_map[c] = str(games_final[c].dtype)

In [47]:
for c in col_map.keys():
    if col_map[c] == 'object':
        games_final[c] =  encode_features(games_final[c])

In [48]:
games_final = games_final.fillna(0)

In [49]:
games_final = games_final[[c for c in games_final.columns if c not in skip_cols]]

In [50]:
games_final['standing_diff'] = np.where(games_final['standing_diff']<0, 0, np.where(games_final['standing_diff']<10, 1, 2))

In [51]:
for i, c in enumerate(games_final.columns):
    games_final[c] = standardize_features(games_final[c])
    games_final[c] = normalize_features(games_final[c])

#####
##### **SPLIT DATA**

In [52]:
val_frac = 0.1
test_frac = 0.1

train_frac = 1 - val_frac - test_frac
val_frac = train_frac + val_frac

In [53]:
np.random.seed(56)
games_final_shuffled = games_final.sample(frac=1).reset_index(drop=True)

In [54]:
games_final_shuffled #[-250:-230]

Unnamed: 0,away,home,game_type,game_subtype,month,home_win,season,is_preseason,is_playoffs,t1_wins_before_game,...,away_standing_losses,away_standing_wins_home,away_standing_losses_home,away_standing_points_home,away_standing_wins_away,away_standing_losses_away,away_standing_points_away,away_standing_diff,standing_diff,standing_points_diff
0,0.000000,0.393939,1.0,0.0,0.000000,1.0,0.814815,0.0,0.0,0.0,...,0.408451,0.175,0.323529,0.094237,0.088235,0.473684,0.087349,0.336,1.0,0.342974
1,0.727273,0.757576,1.0,0.0,0.909091,1.0,0.148148,0.0,0.0,0.0,...,0.014085,0.000,0.000000,0.000000,0.000000,0.026316,0.000000,0.480,0.5,0.266660
2,0.696970,0.272727,1.0,0.0,0.090909,1.0,0.037037,0.0,0.0,0.0,...,0.281690,0.375,0.235294,0.196741,0.294118,0.315789,0.245482,0.528,1.0,0.361638
3,0.151515,0.909091,1.0,0.0,0.090909,1.0,0.962963,0.0,0.0,0.0,...,0.253521,0.475,0.176471,0.266769,0.382353,0.315789,0.373996,0.600,0.0,0.276138
4,0.484848,0.727273,1.0,0.0,0.181818,1.0,0.962963,0.0,0.0,0.0,...,0.338028,0.650,0.264706,0.352385,0.617647,0.394737,0.595131,0.672,0.0,0.256498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36334,0.636364,0.363636,1.0,0.0,0.181818,0.0,0.370370,0.0,0.0,0.0,...,0.605634,0.275,0.529412,0.142300,0.147059,0.657895,0.119729,0.272,1.0,0.393004
36335,0.363636,0.272727,0.5,0.0,0.818182,0.0,0.740741,1.0,0.0,0.0,...,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.488,0.5,0.266660
36336,0.212121,0.424242,1.0,0.0,0.272727,1.0,0.074074,0.0,0.0,0.0,...,0.873239,0.325,0.823529,0.152692,0.176471,0.894737,0.150853,0.144,1.0,0.544557
36337,0.212121,0.727273,1.0,0.0,0.090909,1.0,0.925926,0.0,0.0,0.0,...,0.211268,0.200,0.235294,0.110888,0.205882,0.184211,0.209086,0.488,1.0,0.337796


In [56]:
x_train = games_final_shuffled.iloc[:int(len(games_final)*train_frac),:]
x_val = games_final_shuffled.iloc[int(len(games_final)*train_frac):int(len(games_final)*val_frac),:]
x_test = games_final_shuffled.iloc[int(len(games_final)*val_frac):,:]

In [57]:
y_train = np.array(x_train.pop(target))
y_val = np.array(x_val.pop(target))
y_test = np.array(x_test.pop(target))

In [58]:
x_train, x_val, x_test = np.array(x_train), np.array(x_val), np.array(x_test)

In [59]:
# pickle splits for later
with open('../data/processed/games_split.pickle', 'wb') as f:
    pickle.dump(((x_train, x_val, x_test),(y_train, y_val, y_test)), f, pickle.HIGHEST_PROTOCOL)
s3_client.upload_file('../data/processed/games_split.pickle', 'hwm-nba', os.path.join('model_data', 'games_split.pickle'))

#####
##### **FIRST MODELS**

In [60]:
# set random seed
tf.random.set_seed(16)

# create the model
model_0 = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    # tf.keras.layers.Dropout(0.1), 
    tf.keras.layers.Dense(1, activation='sigmoid') # output shape is 1
])

# compile the model
model_0.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-5),
    metrics=['accuracy']
)

# fit the model
history = model_0.fit(
    x_train,
    y_train,
    epochs=40,
    validation_data=(x_val, y_val)
) # see how the model performs on the test set during training

model_0_assets = {
    'model': model_0,
    'history': pd.DataFrame.from_dict(history.history),
    'name': 'model_'+str(int(datetime.datetime.now().timestamp()*100000))+'_'+str(history.history['val_accuracy'][-1])
}

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# set random seed
tf.random.set_seed(16)

# create the model
model_1 = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    #tf.keras.layers.Dropout(0.1), 
    tf.keras.layers.Dense(32, activation='relu'),
    #tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(4, activation='relu'),
    #tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation='sigmoid') # output shape is 1
])

# compile the model
model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy']
)

# fit the model
history = model_1.fit(
    x_train,
    y_train,
    epochs=40,
    validation_data=(x_val, y_val)
) # see how the model performs on the test set during training

model_1_assets = {
    'model': model_1,
    'history': pd.DataFrame.from_dict(history.history),
    'name': 'model_'+str(int(datetime.datetime.now().timestamp()*100000))+'_'+str(history.history['val_accuracy'][-1])
}

In [87]:
# Set random seed
tf.random.set_seed(16)

# Create the model
model_2 = tf.keras.Sequential([
    tf.keras.layers.Dense(4096, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.1), 
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.1), 
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.1), 
    tf.keras.layers.Dense(256, activation='selu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.1), 
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid') # output shape is 1
])

# Compile the model
model_2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy']
)

# Fit the model
history = model_2.fit(
    x_train,
    y_train,
    epochs=20,
    validation_data=(x_val, y_val)) # see how the model performs on the test set during training

model_2_assets = {
    'model': model_2,
    'history': pd.DataFrame.from_dict(history.history),
    'name': 'model_'+str(int(datetime.datetime.now().timestamp()*100000))+'_'+str(history.history['val_accuracy'][-1])
}

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [88]:
# Set random seed
tf.random.set_seed(16)

# Create the model
model_3 = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation='relu'),
    tf.keras.layers.Dense(1) # output shape is 1
])

# Compile the model
model_3.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-4),
    metrics=['accuracy']
)

# Fit the model
history = model_3.fit(
    x_train,
    y_train,
    epochs=20,
    validation_data=(x_test,y_test),
    #verbose=1,
    #callbacks=[PrintDot()]
)
# see how the model performs on the test set during training

model_3_assets = {
    'model': model_3,
    'history': pd.DataFrame.from_dict(history.history),
    'name': 'model_'+str(int(datetime.datetime.now().timestamp()*100000))+'_'+str(history.history['val_accuracy'][-1])
}

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [89]:
# Set random seed
tf.random.set_seed(16)

# Create the model
model_4 = tf.keras.Sequential([
    tf.keras.layers.Dense(1) # output shape is 1
])

# Compile the model
model_4.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-4),
    metrics=['accuracy']
)

# Fit the model
history = model_4.fit(
    x_train,
    y_train,
    epochs=1,
    validation_data=(x_test,y_test),
    # verbose=1,
    # callbacks=[PrintDot()]
)
# see how the model performs on the test set during training

model_4_assets = {
    'model': model_4,
    'history': pd.DataFrame.from_dict(history.history),
    'name': 'model_'+str(int(datetime.datetime.now().timestamp()*100000))+'_'+str(history.history['val_accuracy'][-1])
}



#####
##### **UPLOAD TO RELEVANT AWS RESOURCES (S3 AND SAGEMAKER)**

In [90]:
with open('../sagemaker/params.json','r') as params:
    params = json.load(params)

In [91]:
key_data = x_train[:1000]

In [92]:
# TODO: reflect more on the right nomenclature inside sagemaker
save_and_package_model(model_0_assets, '../models', sagemaker_name='model-0', key_data=key_data)
save_and_package_model(model_1_assets, '../models', sagemaker_name='model-1', key_data=key_data)
save_and_package_model(model_2_assets, '../models', sagemaker_name='model-2', key_data=key_data)
save_and_package_model(model_3_assets, '../models', sagemaker_name='model-3', key_data=key_data)
save_and_package_model(model_4_assets, '../models', sagemaker_name='model-4', key_data=key_data)

INFO:tensorflow:Assets written to: ../models/model_167654828573204_0.6701715588569641/00001/assets


INFO:tensorflow:Assets written to: ../models/model_167654828573204_0.6701715588569641/00001/assets


Adding ../models/model_167654828573204_0.6701715588569641/00001...
Adding ../models/model_167654828573204_0.6701715588569641/history.csv...
Adding ../models/model_167654828573204_0.6701715588569641/model.tar.gz...
INFO:tensorflow:Assets written to: ../models/model_167654849883550_0.6651909351348877/00001/assets


INFO:tensorflow:Assets written to: ../models/model_167654849883550_0.6651909351348877/00001/assets


Adding ../models/model_167654849883550_0.6651909351348877/00001...
Adding ../models/model_167654849883550_0.6651909351348877/history.csv...
Adding ../models/model_167654849883550_0.6651909351348877/model.tar.gz...
INFO:tensorflow:Assets written to: ../models/model_167654885970443_0.671001672744751/00001/assets


INFO:tensorflow:Assets written to: ../models/model_167654885970443_0.671001672744751/00001/assets


Adding ../models/model_167654885970443_0.671001672744751/00001...
Adding ../models/model_167654885970443_0.671001672744751/history.csv...
Adding ../models/model_167654885970443_0.671001672744751/model.tar.gz...
INFO:tensorflow:Assets written to: ../models/model_167654895178177_0.6557831168174744/00001/assets


INFO:tensorflow:Assets written to: ../models/model_167654895178177_0.6557831168174744/00001/assets


Adding ../models/model_167654895178177_0.6557831168174744/00001...
Adding ../models/model_167654895178177_0.6557831168174744/history.csv...
Adding ../models/model_167654895178177_0.6557831168174744/model.tar.gz...
INFO:tensorflow:Assets written to: ../models/model_167654895642998_0.517708957195282/00001/assets


INFO:tensorflow:Assets written to: ../models/model_167654895642998_0.517708957195282/00001/assets


Adding ../models/model_167654895642998_0.517708957195282/00001...
Adding ../models/model_167654895642998_0.517708957195282/history.csv...
Adding ../models/model_167654895642998_0.517708957195282/model.tar.gz...
