In [670]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# This is a swing at a custom neural network predicting soccer game results based on scores.
# I've been in Keras for a few months, but am just starting to understand my way around.
df = pd.read_csv("https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv").dropna()

# We need to calculate one team's aggregate margin in order to determine the result.
# We've chosen home for that calculation, but may as well get away as well for good measure.
# Then, the absolute margin is the number of goals between the teams.
df['h_marg'] = df['score1'] - df['score2']
df['a_marg'] = df['score2'] - df['score1']
df['margin'] = np.abs(df['score1'] - df['score2'])

results = []

for i in df['h_marg']:
  if i > 0: # If the home team's margin is greater than 0, it's a home win.
    results.append("HOME WIN")
  elif i < 0: # If the home team's margin is less than 0, it's an away win.
    results.append("AWAY WIN")
  else: # Otherwise, it's a draw.
    results.append("DRAW")

df['result'] = results

In [671]:
# The model gets a little confused the more numerical rows are fed into it, so we're keeping it simple.
X = df[['score1', 'score2', 'margin']].reset_index(drop=True)
y = df[['result']].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34)

In [672]:
# Now it's time to make some random test data (separate from the existing test dfs) to use for predictions and accuracy. 
# First, we need to generate a function to turn the list into a dataframe.

def to_df(l1):
  df = pd.DataFrame(l1).reset_index(drop=True)
  return df

def to_cat(cats):
  cat = to_categorical(pd.factorize(cats)[0])
  return cat

# These scores are just # of goals (integers between 0 and 4), so we can use randint for this.
t1 = to_df(np.random.randint(0,3,size=60))
t2 = to_df(np.random.randint(0,3,size=60))
t3 = np.abs(t1) - np.abs(t2)
t4 = t2 - t1
t5 = t1 - t2

t_res = []

cols = ['score1', 'score2', 'margin', 'h_margin', 'a_margin']

rand_X_test = pd.concat([t1, t2, t3, t4, t5], axis=1)
rand_X_test.columns = cols

for i in rand_X_test['h_margin']:
  if i > 0: # If the home team's margin is greater than 0, it's a home win.
    t_res.append("HOME WIN")
  elif i < 0: # If the home team's margin is less than 0, it's an away win.
    t_res.append("AWAY WIN")
  else: # Otherwise, it's a draw.
    t_res.append("DRAW")

rand_X_test['result'] = t_res
rand_y_test = to_cat(rand_X_test['result'])
rand_X_test = rand_X_test[['score1', 'score2', 'margin']]
rand_reshape = rand_X_test.shape[0] * rand_X_test.shape[1]
rand_X_test = np.asarray(rand_X_test)
rand_y_test = np.asarray(rand_y_test)

In [673]:
# Have to factorize the labels before converting them to categorical (e.g. 1.,0.,0.).
y_train = to_cat(y_train['result'])
y_test = to_cat(y_test['result'])

# The train and test data will be easier to handle in array form.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [674]:
# Two layers with a 3 unit output layer given the number of outcomes (HOME WIN, AWAY WIN, DRAW).
model = Sequential()
model.add(Dense(40, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [675]:
# Definitely excessive on the epochs. ¯\_(ツ)_/¯
model.fit(X_train, y_train, epochs=20, batch_size=128, verbose=1, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fd444552ed0>

In [676]:
# Now we will predict results based on the random data we generated.
predictions = model.predict(rand_X_test).round(2)
predictions = np.asarray(predictions)



In [677]:
# Reshaping y_test and predictions to be directly compared 1v1 iteratively.
predictions = predictions.reshape(rand_reshape,)
predictions = pd.DataFrame(predictions).reset_index(drop=True)
rand_y_test = rand_y_test.reshape(rand_reshape,)
rand_y_test = pd.DataFrame(rand_y_test).reset_index(drop=True)
PvA = pd.concat([rand_y_test, predictions], axis=1)
PvA.columns = ["Predicted", "Actual"]

In [678]:
# If the margin is 0, there is no error. If the margin is 1, there was an error.
PvA['Margin'] = np.abs(PvA['Predicted'] - PvA['Actual'])

In [679]:
# Sum of all of the rows with errors.
PvA_err = np.sum(PvA['Margin'])
print(PvA_err)

34.0


In [680]:
# Length of the prediction set.
PvA_len = len(PvA)
print(PvA_len)

180


In [681]:
# Accuracy metric.
PvA_acc = 1-(PvA_err/PvA_len)

In [682]:
# AORTD = Accuracy on Random Test Data
print("AORTD: %.2f" % (PvA_acc * 100) + "%")

AORTD: 81.11%


In [686]:
# I've run a number of models that perform well on either of X & y test comparisons and randomly generated data and labels, but not both.
# This one is probably the best balance of performance I've achieved yet.
loss, accuracy = model.evaluate(X_test, y_test)
print("Test (X & y test) Accuracy: %.2f" % (accuracy * 100) + "%")

Test (X & y test) Accuracy: 100.00%
