In [1]:
import pandas as pd 
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers
import os
from sklearn.metrics import classification_report,  confusion_matrix, accuracy_score, f1_score,  precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler


random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Loading and Prepping data

## Train Set

In [116]:
ops = False

if ops:
  ops = 'ops_'
else:
  ops = ''

train_data = {}
for year in range(2010,2017):
  train_data[year] = pd.read_csv(f'https://raw.githubusercontent.com/jacobh310/over_under/master/data_cleaning/clean_data/{ops}clean_data_{year}.csv')

dev_data = {}
for year in range(2017,2019):
  dev_data[year] = pd.read_csv(f'https://raw.githubusercontent.com/jacobh310/over_under/master/data_cleaning/clean_data/{ops}clean_data_{year}.csv')

In [117]:
def prep_data(train_data, dev_data, scale,cutoff=0):  
  
  train = pd.concat([train_data[year][cutoff:] for year in range(2010,2017)])
  train = train.dropna()

  train_features = train.drop(columns=['Date','Home Team','Visit Team','Total Runs','Over']).values #features
  
  train_labels = train['Over'].values  # labels

  dev = pd.concat([dev_data[year][cutoff:] for year in range(2017,2019)])
  dev = dev.dropna()
  dev_features = dev.drop(columns=['Date','Home Team','Visit Team','Total Runs','Over']).values # features
  dev_labels = dev['Over'].values # lables

  if scale:
    train_scale = MinMaxScaler()
    dev_scale = MinMaxScaler()

    train_features = train_scale.fit_transform(train_features)    # scale
    dev_features = dev_scale.fit_transform(dev_features) # scale

  return train_features, train_labels, dev_features, dev_labels




In [118]:
cutoff = 50

train_features, train_labels, dev_features, dev_labels = prep_data(train_data, dev_data, False, cutoff)

train = pd.concat([train_data[year][cutoff:] for year in range(2010,2017)])
train = train.dropna()

dev = pd.concat([dev_data[year][cutoff:] for year in range(2017,2019)])
dev = dev.dropna()

In [119]:
dev.head()

Unnamed: 0,Date,Home Team,Visit Team,Home Close OU,Total Runs,Over,Visit AVG 1,Visit AVG 2,Visit AVG 3,Visit AVG 4,Visit AVG 5,Visit AVG 6,Visit AVG 7,Visit AVG 8,Visit AVG 9,Visit ERA,Home AVG 1,Home AVG 2,Home AVG 3,Home AVG 4,Home AVG 5,Home AVG 6,Home AVG 7,Home AVG 8,Home AVG 9,Home ERA
50,408,TEX,OAK,9.0,7,0,0.095,0.231,0.167,0.375,0.25,0.294,0.158,0.357,0.214,36.0,0.214,0.286,0.588,0.067,0.375,0.167,0.154,0.357,0.0,10.8
51,408,TAM,TOR,7.5,5,0,0.214,0.375,0.133,0.188,0.118,0.333,0.333,0.2,0.333,135.0,0.389,0.278,0.313,0.286,0.214,0.19,0.176,0.167,0.0,9.0
52,408,BAL,NYY,7.5,9,1,0.375,0.167,0.063,0.308,0.4,0.313,0.467,0.133,0.091,7.2,0.375,0.273,0.273,0.25,0.167,0.273,0.0,0.2,0.111,10.38
53,409,BAL,NYY,8.5,10,1,0.35,0.15,0.333,0.0,0.368,0.35,0.412,0.4,0.2,11.74,0.385,0.267,0.286,0.286,0.188,0.286,0.333,0.154,0.077,5.4
54,409,NYM,MIA,7.0,7,1,0.556,0.286,0.25,0.333,0.118,0.429,0.231,0.2,0.0,1.8,0.2,0.286,0.167,0.167,0.15,0.25,0.167,0.2,0.0,4.5


In [120]:
train_features.shape, dev_features.shape

((15766, 21), (4509, 21))

In [121]:
BATCH_SIZE=128

train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_features,
    train_labels)
)

dev_dataset = tf.data.Dataset.from_tensor_slices(
    (dev_features,
    dev_labels)
)

train_dataset = train_dataset.batch(batch_size=BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
dev_dataset = dev_dataset.batch(batch_size=BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Establishing Baseline

In [122]:
train['Over'].value_counts(normalize=True)

1    0.520741
0    0.479259
Name: Over, dtype: float64

In [123]:
dev['Over'].value_counts(normalize=True)

1    0.506542
0    0.493458
Name: Over, dtype: float64

# Model Building

## Machine Learning Models

In [124]:
models = {'KNN': KNeighborsClassifier(n_jobs=-1),
          'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
          'XGBoost' : XGBClassifier(n_jobs=-1, random_state=42),
          'Naive Bayes Gaussian': GaussianNB(),
          'SVC': SVC(random_state=42, probability=True)
          }

results = {}

for model_name, model in models.items():
  model.fit(train_features, train_labels)
  results[model_name] = [model.score(train_features, train_labels), model.score(dev_features, dev_labels)]

In [125]:
base_train_score = train['Over'].value_counts(normalize=True)[1]
base_dev_score = dev['Over'].value_counts(normalize=True)[1]
print('Baseline')
print(f"Train accuracy score: {base_train_score*100}")
print(f"Dev accuracy score: {base_dev_score*100}\n")

results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Train Accuracy Score','Dev Accuracy Score'])
results_df['Dev Improvement'] = base_dev_score
results_df['Dev Improvement'] = results_df['Dev Accuracy Score'] - results_df['Dev Improvement'] 
results_df = results_df*100
results_df

Baseline
Train accuracy score: 52.07408347075986
Dev accuracy score: 50.65424706143269



Unnamed: 0,Train Accuracy Score,Dev Accuracy Score,Dev Improvement
KNN,69.288342,50.632069,-0.022178
RandomForest,100.0,51.408295,0.754047
XGBoost,61.074464,52.339765,1.685518
Naive Bayes Gaussian,50.171255,48.946551,-1.707696
SVC,52.099455,50.676425,0.022178


In [114]:
results_df

Unnamed: 0,Train Accuracy Score,Dev Accuracy Score,Dev Improvement
KNN,69.754927,50.774063,0.06731
RandomForest,100.0,50.572134,-0.13462
XGBoost,62.415716,50.908683,0.20193
Naive Bayes Gaussian,51.614367,49.719542,-0.987211
SVC,52.003371,50.751627,0.044873


### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

#### SVC

In [None]:
svc = SVC(probability = True)
param_grid = [{'kernel': ['rbf'], 'gamma': [.1,.5,1,2,5,10], 'C': [.1, 1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [.1, 1, 10, 100, 1000]},
              {'kernel': ['poly'], 'degree' : [2,3,4,5], 'C': [.1, 1, 10, 100, 1000]}]

clf_svc = GridSearchCV(svc, param_grid = param_grid, cv = 3, verbose = True, n_jobs = -1)
svc_grid = clf_svc.fit(train_features, train_labels)

Fitting 3 folds for each of 55 candidates, totalling 165 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 23.2min
[Parallel(n_jobs=-1)]: Done 165 out of 165 | elapsed: 455.0min finished


In [None]:
svc_best_params = {'C': 0.1, 'kernel': 'linear'}

In [None]:
svc_grid.best_estimator_.score(dev_features, dev_labels)

0.5067294751009421

#### RandomForest

## Neural Networks

In [None]:
 METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')] 

In [None]:
model = tf.keras.Sequential([
                             layers.Dense(21, activation='relu',input_shape=(21,)),
                             layers.Dropout(0.2),
                             layers.Dense(32, activation='relu'),
                             layers.Dropout(0.2),
                             layers.Dense(64, activation='relu'),
                             layers.Dropout(0.2),
                             layers.Dense(32, activation='relu'),
                             layers.Dropout(0.2),
                             layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=METRICS)

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 21)                462       
_________________________________________________________________
dropout_8 (Dropout)          (None, 21)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 32)                704       
_________________________________________________________________
dropout_9 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 64)                2112      
_________________________________________________________________
dropout_10 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 32)               

In [None]:
model.fit(train_dataset,
          epochs=50,
          steps_per_epoch=len(train_dataset),
          validation_data = dev_dataset,
          validation_steps= len(dev_dataset),)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fdd73787750>

In [None]:
model.evaluate(dev_dataset)



[0.6931648254394531,
 0.5076267123222351,
 0.5071877837181091,
 0.9995573163032532]