In [30]:
import pandas as pd
import numpy as np
from statistics import mean
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tabulate

# REGRESSION
# Return X = testable features, Y = point spread
def spread_df_rgn(normalize=False, get_custom_date_range=False, substituted_spreads=None):
  data_df = pd.read_csv('combined_out.csv')

  real_cols = data_df.select_dtypes(include=['number']).columns

  if not get_custom_date_range:
    data_df = data_df.dropna(axis=0)

  if normalize:
    for col in real_cols:
      min_val = data_df[col].min()
      max_val = data_df[col].max()
      data_df[col] = (data_df[col] - min_val) / (max_val - min_val)

  if get_custom_date_range:
    data_df = data_df[(data_df['Date'] >= '2024-04-01') & (data_df['Date'] <= '2024-04-07')]
    data_df['HomeSpread'] = substituted_spreads
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadActual'] > data_df['HomeSpread']
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadCorrectDirection'].astype(int)
  
  data_df = data_df.drop('AwayTeam', axis=1)\
                   .drop('HomeTeam', axis=1)\
                   .drop('Date', axis=1)
  
  data_np = data_df.to_numpy()
  data_np = data_np[:, 5:]

  data_np = data_np.astype(float)

  return data_np[:, 1:-1], data_np[:, :1]

# CLASSIFICATION
# Return X = testable features, Y = correct point spread direction
def spread_df_cls(normalize=False, get_custom_date_range=False, substituted_spreads=None):
  data_df = pd.read_csv('combined_out.csv')

  real_cols = data_df.select_dtypes(include=['number']).columns

  if not get_custom_date_range:
    data_df = data_df.dropna(axis=0)

  if normalize:
    for col in real_cols:
      min_val = data_df[col].min()
      max_val = data_df[col].max()
      data_df[col] = (data_df[col] - min_val) / (max_val - min_val)

  if get_custom_date_range:
    data_df = data_df[(data_df['Date'] >= '2024-04-01') & (data_df['Date'] <= '2024-04-07')]
    data_df['HomeSpread'] = substituted_spreads
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadActual'] > data_df['HomeSpread']
    data_df['HomeSpreadCorrectDirection'] = data_df['HomeSpreadCorrectDirection'].astype(int)

  data_df = data_df.drop('AwayTeam', axis=1)\
                   .drop('HomeTeam', axis=1)\
                   .drop('Date', axis=1)
  
  data_np = data_df.to_numpy()
  data_np = data_np[:, 5:]

  data_np = data_np.astype(float)

  return data_np[:, 1:-1], data_np[:, -1:]

MLP Classifier

Does Early Stopping improve accuracy?

In [37]:
x, y = spread_df_cls()
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

converged_early = []
converged = []
train_accuracy_early = []
train_accuracy = []
test_accuracy_early = []
test_accuracy = []

for i in range(5):
  clf = MLPClassifier()
  clf.fit(x_train, y_train)
  
  converged.append(clf.n_iter_)
  train_accuracy.append(clf.score(x_train, y_train))
  test_accuracy.append(clf.score(x_test, y_test))
  
print("Average results without Early Stopping")
print("Converged after", mean(converged), "iterations")
print("Train Accuracy:", mean(train_accuracy))
print("Test Accuracy:", mean(test_accuracy))
      
headers = ["Iterations", "Train Accuracy", "Test Accuracy"]
table = [(converged[i], train_accuracy[i], test_accuracy[i]) for i in range(len(converged))]

print(tabulate.tabulate(table, headers, tablefmt="grid"))
  
for i in range(5):
  clf = MLPClassifier(early_stopping=True)
  clf.fit(x_train, y_train)
  
  converged_early.append(clf.n_iter_)
  train_accuracy_early.append(clf.score(x_train, y_train))
  test_accuracy_early.append(clf.score(x_test, y_test))
  
print("Results with Early Stopping")
print("Converged after", mean(converged_early), "iterations")
print("Train Accuracy:", mean(train_accuracy_early))
print("Test Accuracy:", mean(test_accuracy_early))

headers = ["Iterations", "Train Accuracy", "Test Accuracy"]
table = [(converged_early[i], train_accuracy_early[i], test_accuracy_early[i]) for i in range(len(converged_early))]

print(tabulate.tabulate(table, headers, tablefmt="grid"))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average results without Early Stopping
Converged after 35.2 iterations
Train Accuracy: 0.5134237929946356
Test Accuracy: 0.5115598182735992
+--------------+------------------+-----------------+
|   Iterations |   Train Accuracy |   Test Accuracy |
|           30 |         0.502745 |        0.502776 |
+--------------+------------------+-----------------+
|           35 |         0.511013 |        0.505553 |
+--------------+------------------+-----------------+
|           27 |         0.502745 |        0.503029 |
+--------------+------------------+-----------------+
|           54 |         0.547176 |        0.545432 |
+--------------+------------------+-----------------+
|           30 |         0.50344  |        0.50101  |
+--------------+------------------+-----------------+


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Results with Early Stopping
Converged after 23 iterations
Train Accuracy: 0.5234963710949826
Test Accuracy: 0.5200403836446239
+--------------+------------------+-----------------+
|   Iterations |   Train Accuracy |   Test Accuracy |
|           46 |         0.53083  |        0.525745 |
+--------------+------------------+-----------------+
|           20 |         0.539602 |        0.528269 |
+--------------+------------------+-----------------+
|           22 |         0.523888 |        0.514891 |
+--------------+------------------+-----------------+
|           13 |         0.515241 |        0.510601 |
+--------------+------------------+-----------------+
|           14 |         0.50792  |        0.520697 |
+--------------+------------------+-----------------+


Which alpha value creates the highest accuracy?

In [43]:
converged = []
train_accuracy = []
test_accuracy = []

alphas = [0.1, 0.01, 0.001, 0.0001, 0.00001]

for i in alphas:
  clf = MLPClassifier(alpha=i, early_stopping=True)
  clf.fit(x_train, y_train)
  
  train_accuracy.append(clf.score(x_train, y_train))
  test_accuracy.append(clf.score(x_test, y_test))
  
headers_alpha = ["Alpha", "Train Accuracy", "Test Accuracy"]
table_alpha = [(alphas[i], train_accuracy[i], test_accuracy[i]) for i in range(len(alphas))]

print(tabulate.tabulate(headers_alpha, table_alpha, tablefmt="grid"))

a = np.arange(len(alphas))

fig = plt.figure(1, figsize=(7,7))
ax = fig.add_subplot(212)

ax.plot(a, test_accuracy)
ax.xaxis.set_ticks(a)
ax.xaxis.set_ticklabels(alphas)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


+-------------------------------------------------+-------------------------------------------------+---------------------------------------------------+----------------------------------------------------+---------------------------------------------------+
| (0.1, 0.5166929630798359, 0.5070671378091873)   | (0.01, 0.509561375828337, 0.5037859666834932)   | (0.001, 0.5086778163458504, 0.5025239777889955)   | (0.0001, 0.5285579047017986, 0.5181726400807672)   | (1e-05, 0.5136005048911328, 0.5133770822816759)   |
| A                                               | l                                               | p                                                 | h                                                  | a                                                 |
+-------------------------------------------------+-------------------------------------------------+---------------------------------------------------+----------------------------------------------------+-----------------

[Text(0, 0, '0.1'),
 Text(1, 0, '0.01'),
 Text(2, 0, '0.001'),
 Text(3, 0, '0.0001'),
 Text(4, 0, '1e-05')]

<Figure size 700x700 with 0 Axes>