Import standard machine learning packages.


In [12]:
import pandas as pd
import datetime as dt
import numpy as np
import tensorflow as tf
from sklearn import linear_model, model_selection
from sklearn.preprocessing import PolynomialFeatures
import numpy.polynomial.polynomial as poly
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation
import tensorflow.keras.backend as K
from keras.layers import Dropout
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Load historical webscrapped NFL data from 2009-2018 (NFLGames.csv) from https://www.pro-football-reference.com/ into a Pandas dataframe object.

In [0]:
url = "/content/gdrive/My Drive/Machine Learning/NFLGamesStatisticalData.csv"
cols = ["year", "team", "opponent", "passYardsAndRedZone", "rushYardsAndRedZone", "turnovers", "penalties", "sacks", "startingPosition", "pointsPerDrive", "thirdPct", "yardsPerAttempt", "homeOrAway", "score"]
#cols = ["year", "team", "opponent", "passYardsAndRedZone", "turnovers", "sackRate", "pointsPerDrive", "thirdPct", "compPct", "homeOrAway", "score"]
df = pd.read_csv(url, names = cols, na_values = '?')

Print out first ten rows.

In [14]:
df.head(6)

Unnamed: 0,year,team,opponent,passYardsAndRedZone,rushYardsAndRedZone,turnovers,penalties,sacks,startingPosition,pointsPerDrive,thirdPct,yardsPerAttempt,homeOrAway,score
0,2018,Philadelphia Eagles,Atlanta Falcons,16897.11,1546.6565,11.5,1001.0,2.40625,27.85,2.23,44.85,7.55,1,18.0
1,2018,Atlanta Falcons,Philadelphia Eagles,15206.715,2511.1035,9.9,995.0,2.6875,26.95,2.19,40.3,7.7,0,12.0
2,2018,New Orleans Saints,Tampa Bay Buccaneers,18841.6,4487.024,8.85,1156.0,1.8125,30.25,2.67,42.45,8.1,1,40.0
3,2018,Tampa Bay Buccaneers,New Orleans Saints,18162.09,4590.15075,16.8,814.0,2.8125,26.25,2.125,43.65,8.35,0,48.0
4,2018,Minnesota Vikings,San Francisco 49ers,14507.3,3201.10625,7.25,1011.0,2.40625,30.6,2.025,38.05,7.2,1,24.0
5,2018,San Francisco 49ers,Minnesota Vikings,9427.95,2905.22925,13.8,834.0,3.0625,27.4,1.745,34.15,7.5,0,16.0


In [0]:
nsamp, natt = df.shape
print("Number of Samples = {0:d}  Number of Attributes = {1:d}".format(nsamp, natt))

Number of Samples = 5120  Number of Attributes = 14


Create Game object to hold relevent attributes from the dataframe.

In [0]:
class Game:
	def __init__(self, t1, t2, t1s, t2s, y, line, overUnder, real):
		self.home = t1
		self.away = t2
		self.homeScore = t1s
		self.awayScore = t2s
		if not real:
			homeScore = float(t1s)
			awayScore = float(t2s)
			scoreLimit = 25.0
			penalty = .8
			if homeScore > scoreLimit:
				self.homeScore = scoreLimit + (homeScore-scoreLimit)*penalty
			if awayScore > scoreLimit:
				self.awayScore = scoreLimit + (awayScore-scoreLimit)*penalty
		self.year = y
    #the line is in terms of the home team
		self.line = line
		self.overUnder = overUnder

	def printResult(self):
		print(self.home + ": " + str(self.homeScore))
		print(self.away + ": " + str(self.awayScore))

Create Game objects based on the data in the dataframe.

In [16]:
games_dict = dict()

url = "/content/gdrive/My Drive/Machine Learning/NFLGamesBettingData.csv"
cols = ["year", "home", "away", "homeScore", "awayScore", "spread", "overUnder"]
gamesDf = pd.read_csv(url, names = cols, na_values = '?')

for index, row in gamesDf.iterrows():
  key = str(row["home"]) + " " + str(row["away"]) + " " + str(row["year"])
  game_obj = Game(row["home"], row["away"], row["homeScore"], row["awayScore"], row["year"], row["spread"], row["overUnder"], True)
  games_dict[key] = game_obj

print(games_dict)

{'Philadelphia Eagles Atlanta Falcons 2018': <__main__.Game object at 0x7ff692c26780>, 'New Orleans Saints Tampa Bay Buccaneers 2018': <__main__.Game object at 0x7ff6f6af29b0>, 'Minnesota Vikings San Francisco 49ers 2018': <__main__.Game object at 0x7ff6f6af2a90>, 'Miami Dolphins Tennessee Titans 2018': <__main__.Game object at 0x7ff6f6af29e8>, 'New York Giants Jacksonville Jaguars 2018': <__main__.Game object at 0x7ff6f6af2a20>, 'Cleveland Browns Pittsburgh Steelers 2018': <__main__.Game object at 0x7ff6f6af2898>, 'New England Patriots Houston Texans 2018': <__main__.Game object at 0x7ff6f6af27f0>, 'Indianapolis Colts Cincinnati Bengals 2018': <__main__.Game object at 0x7ff6f6af28d0>, 'Baltimore Ravens Buffalo Bills 2018': <__main__.Game object at 0x7ff6f6af2860>, 'Los Angeles Chargers Kansas City Chiefs 2018': <__main__.Game object at 0x7ff6f6af2780>, 'Carolina Panthers Dallas Cowboys 2018': <__main__.Game object at 0x7ff6f6af27b8>, 'Arizona Cardinals Washington Redskins 2018': <__ma

Create a function checkPredictions to see if the predicted Game object corresponds to the actual Game object.

In [0]:
def CheckPredictions(actual, predicted):
  actualHomeCovered = 0.0
  actualHomeResult = (actual.homeScore - actual.awayScore + actual.line)
  #print("actHomeRes " + str(actualHomeResult))
  if actualHomeResult > 0:
    actualHomeCovered = 1.0
  elif actualHomeResult == 0:
    #push. result exactly matches line so no winner or loser
    actualHomeCovered = .5
  #print("actHomeCov " + str(actualHomeCovered))

  predictedHomeCovered = 0.0
  predictedHomeResult = (predicted.homeScore - predicted.awayScore + actual.line)
  #print("predHomeRes " + str(predictedHomeResult))
  if predictedHomeResult > 0:
    predictedHomeCovered = 1.0
  elif predictedHomeResult == 0:
    #push. result exactly matches line so no winner or loser
    predictedHomeCovered = .5
  #print("predHomeCov " + str(predictedHomeCovered))

  correctSpreadPrediction = 0.0
  if actualHomeCovered == .5:
    correctSpreadPrediction = .5
  elif actualHomeCovered == predictedHomeCovered:
    correctSpreadPrediction = 1.0

  actualOverHit = 0.0
  actualTotalResult = (actual.homeScore + actual.awayScore - actual.overUnder)
  #print("actTotRes " + str(actualTotalResult))
  if actualTotalResult > 0:
    actualOverHit = 1.0
  elif actualTotalResult == 0:
    #push
    actualOverHit = .5

  predictedOverHit = 0.0
  predictedTotalResult = (predicted.homeScore + predicted.awayScore - actual.overUnder)
  #print("predTotalRes " + str(predictedTotalResult))
  if predictedTotalResult > 0:
    predictedOverHit = 1.0
  elif predictedTotalResult == 0:
    predictedOverHit= .5
  
  correctTotalPrediction = 0.0
  if actualOverHit == .5:
    correctTotalPrediction = .5
  elif actualOverHit == predictedOverHit:
    correctTotalPrediction = 1.0

  return correctSpreadPrediction, correctTotalPrediction


Run a multiple linear regression model on each year of NFL games and compare the results to the actual game results using the checkPredictions function.

In [0]:
#Regular Linear Regression 

spreadScores = []
totalScores = []

for i in range(10):
  test = df[df['year'] == 2018 - i]
  train = df[df['year'] != 2018 - i]

  depth, length = test.shape
  trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
  trainY = train["score"]

  regr = linear_model.LinearRegression()
  regr.fit(trainX, trainY)

  for j in range(len(test)//2):
    homeTeam = test["team"][i*depth + j*2]
    awayTeam = test["team"][i*depth + j*2+1] 
    homeRow = test.loc[(test["team"] == homeTeam) & (test["opponent"] == awayTeam) & (test["homeOrAway"] == 1)]
    homeRow = homeRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    awayRow = test.loc[(test["team"] == awayTeam) & (test["opponent"] == homeTeam) & (test["homeOrAway"] == 0)]
    awayRow = awayRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    homePrediectedScore = regr.predict(homeRow)
    awayPrediectedScore = regr.predict(awayRow)
    predictedGame = Game(homeTeam, awayTeam, homePrediectedScore, awayPrediectedScore, 0, 0, 0, False)
    gameKey = homeTeam + " " + awayTeam + " " + str(2018-i)
    actualGame = games_dict[gameKey]
    spreadResult, totalResult = CheckPredictions(actualGame, predictedGame)
    spreadScores.append(spreadResult)
    totalScores.append(totalResult)

print("Spread accuracy: " + str(sum(spreadScores)/len(spreadScores)))
print("Over/Under accuracy: " + str(sum(totalScores)/len(totalScores)))

Spread accuracy: 0.625
Over/Under accuracy: 0.6154296875


Run the same regression with a different train/test split

In [0]:
#linear regression with 50/50 train test split

testYears = [2018, 2016, 2014, 2012, 2010]
trainYears = [2017, 2015, 2013, 2011, 2009]

test = df[df['year'].isin(testYears)] 
train = df[df['year'].isin(trainYears)]

depth, length = test.shape
trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
trainY = train["score"]

testX = test.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
testY = test["score"]

print(testX.iloc[[0]])

regr = linear_model.LinearRegression()
regr.fit(trainX, trainY)

spreadScores = []
totalScores = []

year_multiplier = -1

for year in testYears:
  for i in range(256):
      index = (2018 - year) * 512 + i*2
      index2 = (2018 - year) * 256 + i*2
      homeTeam = test["team"][index]
      awayTeam = test["team"][index + 1] 
      homeRow = pd.DataFrame([testX.iloc[index2]])
      awayRow = pd.DataFrame([testX.iloc[index2 + 1]])
      homePrediectedScore = regr.predict(homeRow)
      awayPrediectedScore = regr.predict(awayRow)
      predictedGame = Game(homeTeam, awayTeam, homePrediectedScore, awayPrediectedScore, 0, 0, 0, True)
      gameKey = homeTeam + " " + awayTeam + " " + str(year)
      actualGame = games_dict[gameKey]
      spreadResult, totalResult = CheckPredictions(actualGame, predictedGame)
      spreadScores.append(spreadResult)
      totalScores.append(totalResult)

print("Spread accuracy: " + str(sum(spreadScores)/len(spreadScores)))
print("Over/Under accuracy: " + str(sum(totalScores)/len(totalScores)))

   passYardsAndRedZone  rushYardsAndRedZone  ...  yardsPerAttempt  homeOrAway
0             16897.11            1546.6565  ...             7.55           1

[1 rows x 10 columns]
Spread accuracy: 0.634375
Over/Under accuracy: 0.584765625


In [0]:
#Trying PCA Before Linear Regression
spreadScores = []
totalScores = []

for i in range(10):
  test = df[df['year'] == 2018 - i]
  train = df[df['year'] != 2018 - i]

  depth, length = test.shape
  trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
  trainY = train["score"]

  from sklearn.preprocessing import StandardScaler
  from sklearn.decomposition import PCA

  scaling = StandardScaler()
  scaling.fit(trainX)
  trainX = scaling.transform(trainX)

  pca = PCA(n_components = 5, svd_solver = 'randomized', whiten = True)
  pca.fit(trainX)
  
  regr = linear_model.LinearRegression()
  regr.fit(pca.transform(trainX), trainY)

  for j in range(len(test)//2):
    homeTeam = test["team"][i*depth + j*2]
    awayTeam = test["team"][i*depth + j*2+1] 
    homeRow = test.loc[(test["team"] == homeTeam) & (test["opponent"] == awayTeam) & (test["homeOrAway"] == 1)]
    homeRow = homeRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    awayRow = test.loc[(test["team"] == awayTeam) & (test["opponent"] == homeTeam) & (test["homeOrAway"] == 0)]
    awayRow = awayRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    homePrediectedScore = regr.predict(pca.transform(scaling.transform(homeRow)))
    awayPrediectedScore = regr.predict(pca.transform(scaling.transform(awayRow)))
    predictedGame = Game(homeTeam, awayTeam, homePrediectedScore, awayPrediectedScore, 0, 0, 0, True)
    gameKey = homeTeam + " " + awayTeam + " " + str(2018-i)
    actualGame = games_dict[gameKey]
    spreadResult, totalResult = CheckPredictions(actualGame, predictedGame)
    spreadScores.append(spreadResult)
    totalScores.append(totalResult)

print("Spread accuracy: " + str(sum(spreadScores)/len(spreadScores)))
print("Over/Under accuracy: " + str(sum(totalScores)/len(totalScores)))

Spread accuracy: 0.6390625
Over/Under accuracy: 0.5896484375
[1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 0.0, 0.5, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.5, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0

In [0]:
#Trying LASSO Regression

# Regularization values to test
nalpha = 100
alphas = np.linspace(5, 5, nalpha)

# Compute the lasso path for the split
for ia, a in enumerate(alphas):
  spreadScores = []
  totalScores = []

  model = linear_model.Lasso(alpha=a)

  for i in range(10):
    test = df[df['year'] == 2018 - i]
    train = df[df['year'] != 2018 - i]

    depth, length = test.shape
    trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    trainY = train["score"]

    model.fit(trainX, trainY)

    for j in range(len(test)//2):
      homeTeam = test["team"][i*depth + j*2]
      awayTeam = test["team"][i*depth + j*2+1] 
      homeRow = test.loc[(test["team"] == homeTeam) & (test["opponent"] == awayTeam) & (test["homeOrAway"] == 1)]
      homeRow = homeRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
      awayRow = test.loc[(test["team"] == awayTeam) & (test["opponent"] == homeTeam) & (test["homeOrAway"] == 0)]
      awayRow = awayRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
      homePrediectedScore = model.predict(homeRow)
      awayPrediectedScore = model.predict(awayRow)
      predictedGame = Game(homeTeam, awayTeam, homePrediectedScore, awayPrediectedScore, 0, 0, 0, True)
      gameKey = homeTeam + " " + awayTeam + " " + str(2018-i)
      actualGame = games_dict[gameKey]
      spreadResult, totalResult = CheckPredictions(actualGame, predictedGame)
      spreadScores.append(spreadResult)
      totalScores.append(totalResult)

  print("====================================")
  print("alpha = " + str(a))
  print("Spread accuracy: " + str(sum(spreadScores)/len(spreadScores)))
  print("Over/Under accuracy: " + str(sum(totalScores)/len(totalScores)))
  print("====================================")

alpha = 5.0
Spread accuracy: 0.54609375
Over/Under accuracy: 0.5845703125
alpha = 5.0
Spread accuracy: 0.54609375
Over/Under accuracy: 0.5845703125


KeyboardInterrupt: ignored

In [0]:
#neural networks
nx = df.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1).shape[1]
nin = nx

nout = 1  # number of outputs
model = Sequential()
model.add(Dense(units=1000, activation='relu', input_dim=nx))
model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=1000, activation='sigmoid'))
model.add(Dense(units=500, activation='sigmoid'))
model.add(Dense(units=250, activation='relu'))
model.add(Dense(units=125, activation='relu'))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=nout, name='output'))

model.summary()
model.compile(optimizer = 'adam',loss = 'mean_squared_error')

testYears = [2018, 2016, 2014, 2012, 2010]
trainYears = [2017, 2015, 2013, 2011, 2009]

test = df[df['year'].isin(testYears)] 
train = df[df['year'].isin(trainYears)]

depth, length = test.shape
trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
trainY = train["score"]

testX = test.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
testY = test["score"]

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

model.fit(trainX, trainY, epochs=10, batch_size=100)

spreadScores = []
totalScores = []

year_multiplier = -1

print(testX.shape)

for year in testYears:
  for i in range(256):
      index = (2018 - year) * 512 + i*2
      index2 = (2018 - year) * 256 + i*2
      homeTeam = test["team"][index]
      awayTeam = test["team"][index + 1] 
      homeRow = pd.DataFrame([testX[index2]])
      awayRow = pd.DataFrame([testX[index2 + 1]])
      homePrediectedScore = model.predict(homeRow)
      awayPrediectedScore = model.predict(awayRow)
      #print("home Score: " + str(homePrediectedScore))
      #print("away Score: " + str(awayPrediectedScore))
      #print("============")
      predictedGame = Game(homeTeam, awayTeam, homePrediectedScore, awayPrediectedScore, 0, 0, 0, False)
      gameKey = homeTeam + " " + awayTeam + " " + str(year)
      actualGame = games_dict[gameKey]
      spreadResult, totalResult = CheckPredictions(actualGame, predictedGame)
      spreadScores.append(spreadResult)
      totalScores.append(totalResult)

print("Spread accuracy: " + str(sum(spreadScores)/len(spreadScores)))
print("Over/Under accuracy: " + str(sum(totalScores)/len(totalScores)))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              11000     
_________________________________________________________________
dense_1 (Dense)              (None, 500)               500500    
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              501000    
_________________________________________________________________
dense_3 (Dense)              (None, 500)               500500    
_________________________________________________________________
dense_4 (Dense)              (None, 250)               125250    
_________________________________________________________________
dense_5 (Dense)              (None, 125)               31375     
_________________________________________

In [0]:
#trying neural networks with train test split of 9 years for train and 1 for test
nx = df.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1).shape[1]
nin = nx
from sklearn.preprocessing import StandardScaler

nout = 1  # number of outputs
model = Sequential()

model.add(Dense(units=1000, activation='relu', input_dim=nx))
model.add(Dense(units=1000, activation='relu'))
model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=250, activation='relu'))
model.add(Dense(units=125, activation='relu'))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=nout, name='output'))

#model.summary()
model.compile(optimizer = 'adam',loss = 'mean_squared_error')

spreadScores = []
totalScores = []

for i in range(10):
  test = df[df['year'] == 2018 - i]
  train = df[df['year'] != 2018 - i]

  depth, length = test.shape
  trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
  trainY = train["score"]
  
  testX = test.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
  testY = test["score"]

  sc = StandardScaler()
  trainX = sc.fit_transform(trainX)
  testX = sc.transform(testX)
  
  model.fit(trainX, trainY, epochs=10, batch_size=100)

  for j in range(len(test)//2):
    homeTeam = test["team"][i*depth + j*2]
    awayTeam = test["team"][i*depth + j*2+1] 
    homeRow = test.loc[(test["team"] == homeTeam) & (test["opponent"] == awayTeam) & (test["homeOrAway"] == 1)]
    homeRow = homeRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    awayRow = test.loc[(test["team"] == awayTeam) & (test["opponent"] == homeTeam) & (test["homeOrAway"] == 0)]
    awayRow = awayRow.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
    homePrediectedScore = model.predict(homeRow)
    awayPrediectedScore = model.predict(awayRow)
    predictedGame = Game(homeTeam, awayTeam, homePrediectedScore, awayPrediectedScore, 0, 0, 0, False)
    gameKey = homeTeam + " " + awayTeam + " " + str(2018-i)
    actualGame = games_dict[gameKey]
    spreadResult, totalResult = CheckPredictions(actualGame, predictedGame)
    spreadScores.append(spreadResult)
    totalScores.append(totalResult)

print("Spread accuracy: " + str(sum(spreadScores)/len(spreadScores)))
print("Over/Under accuracy: " + str(sum(totalScores)/len(totalScores)))

Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4608 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epo

Data Transformations Documentation: 
1. Regular Linear Regression (Averaging the Stats) => Spread Accuracy = 0.63 and Over/Under Accuracy = 0.61
2. PCA with Linear Regression (Components = 5) (Averaging the Stats) => Spread Accuracy = 0.63 and Over/Under Accuracy = 0.58
3. LASSO Regression (100 Alphas between -5 and 5) (Averaging the Stats) => Best Alpa = 0.05 Spread Accuracy = 0.63 and Over/Under Accuracy = 0.62

Additional Feature: Predict the scores of upcoming NFL games using the webscrapped NFL historical data. Predictions from linear regression and neural networks below.

In [0]:
url = "/content/gdrive/My Drive/Machine Learning/NFLGamesStatisticalData.csv"
cols = ["year", "team", "opponent", "passYardsAndRedZone", "rushYardsAndRedZone", "turnovers", "penalties", "sacks", "startingPosition", "pointsPerDrive", "thirdPct", "completionPct", "homeOrAway", "score"]
trainDf = pd.read_csv(url, names = cols, na_values = '?')

url = "/content/gdrive/My Drive/Machine Learning/UpcomingNFLGames.csv"
#cols = ["team", "passYardsAndRedZone", "rushYardsAndRedZone", "turnovers", "penalties", "sacks", "startingPosition", "pointsPerDrive", "thirdPct", "completionPct", "homeOrAway"]
cols = ["team", "passYardsAndRedZone", "turnovers", "sackRate", "pointsPerDrive", "thirdPct", "compPct", "homeOrAway"]
testDf = pd.read_csv(url, names = cols, na_values = '?')

trainX = train.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
trainY = train["score"]

regr = linear_model.LinearRegression()
regr.fit(trainX, trainY)

newTestDf = testDf.drop("team", axis=1)

results = regr.predict(newTestDf)

team1 = ""
team1Score = 0.0
team2 = ""
team2Score = 0.0

for index, row in testDf.iterrows():
    if(index % 2 == 1):
      team2 = row["team"]
      team2Score = str(round(results[index], 2))
      game = Game(team1, team2, team1Score, team2Score, 0,0,0,False)
      game.printResult()
      print("=================")
    else:
      team1 = row["team"]
      team1Score = str(round(results[index], 2))

In [18]:
#trying neural networks
url = "/content/gdrive/My Drive/Machine Learning/NFLGamesStatisticalData.csv"
cols = ["year", "team", "opponent", "passYardsAndRedZone", "rushYardsAndRedZone", "turnovers", "penalties", "sacks", "startingPosition", "pointsPerDrive", "thirdPct", "completionPct", "homeOrAway", "score"]
trainDf = pd.read_csv(url, names = cols, na_values = '?')

url = "/content/gdrive/My Drive/Machine Learning/UpcomingNFLGames.csv"
cols = ["team", "passYardsAndRedZone", "rushYardsAndRedZone", "turnovers", "penalties", "sacks", "startingPosition", "pointsPerDrive", "thirdPct", "completionPct", "homeOrAway"]
testDf = pd.read_csv(url, names = cols, na_values = '?')

nx = df.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1).shape[1]
nin = nx

nout = 1  # number of outputs
model = Sequential()

model.add(Dense(units=1000, activation='relu', input_dim=nx))
model.add(Dense(units=1000, activation='relu'))
model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=500, activation='relu'))
model.add(Dense(units=250, activation='relu'))
model.add(Dense(units=125, activation='relu'))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=25, activation='relu'))
model.add(Dense(units=nout, name='output'))

#model.summary()
model.compile(optimizer = 'adam',loss = 'mean_squared_error')

trainX = trainDf.drop("year", axis=1).drop("team", axis=1).drop("score", axis=1).drop("opponent", axis=1)
trainY = trainDf["score"]
testX = testDf.drop("team", axis=1)
print(testDf.head())

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

model.fit(trainX, trainY, epochs=10, batch_size=100)

results = model.predict(testX)

for index, row in testDf.iterrows():
    print(row["team"] + ": " + str(round(results[index][0], 2)))
    if(index % 2 == 1):
      print("=================")

                  team  passYardsAndRedZone  ...  completionPct  homeOrAway
0  Philadelphia Eagles            13802.975  ...            6.9           1
1      New York Giants            13215.795  ...            6.9           0

[2 rows x 11 columns]
Train on 5120 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Philadelphia Eagles: 30.08
New York Giants: 21.4
