In [1]:
# ignore warnings :)
import warnings
warnings.filterwarnings('ignore')

# libraries
import pandas as pd
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
import time

# pre processing
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeClassifier

In [2]:
# GLOBAL CONSTANTS
ALL_TEAMS = ["BUF", "NE", "MIA", "NYJ", "BAL", "CIN", "CLE", "PIT", "TEN",
            "HOU", "IND", "JAX", "LAC", "DEN", "LV", "KC", "DAL", "WAS",
            "PHI", "NYG", "GB", "CHI", "MIN", "DET", "TB", "CAR", "NO",
            "ATL", "ARI", "LA", "SEA", "SF"] # team codes, as used on nflgamedata.com

CURRENT_WEEK = 13
# todo: scrape spreads
SPREADS = {} # all spreads are from the perspective of the home team
SPREADS[9] = [14, 2.5, 3.5, 7, 4, 2.5, -4.5, 10.5, 3, -1.5, -2.5, 14, 1]
SPREADS[10] = [2.5, -1, -7.5, -2.5, -9.5, -4, -7, 2.5, -2, -5, 5, -3, -6.5, -10.5]
SPREADS[13] = [4.5, -2, -8.5, 3, -1, 7, -3, 2.5, -5.5, 8, -3.5, 2.5, 2.5, -9, -4]

week = CURRENT_WEEK # populated manually
GAMES_LIST = [] # populated in populateGamesListUntilWeek() -> populateGamesList()
TEAMS = {} # TEAMS maps team codes to team objects
GAMES_DF = pd.DataFrame() # populated in populateGamesListUntilWeek()

In [3]:
class Team:
    def __init__(self, name, wins, losses, diff, oRank, dRank):
        self.name = name
        self.wins = wins
        self.losses = losses
        self.diff = diff
        self.oRank = oRank
        self.dRank = dRank
        self.SOS = 0
        
        self.games = {} # indexed by week
        
    def addGame(self, week, game):
        self.games[week] = game
        
    def getStrengthOfSchedule(self):
        opp_win_rates = []
        for game in self.games.values():
            if game.home == self.name:
                opp = TEAMS[game.away]
            else:
                opp = TEAMS[game.home]
            opp_win_rates.append(opp.wins / (opp.losses + 0.5)) # offset for undefeated
        self.SOS = np.mean(opp_win_rates)
        return np.mean(opp_win_rates)
        
    def getHomeAdvantageDifference(self):
        return 0 # todo
        
        
    def getRanking(self): 
        return self.ranking
        # todo
    
    def __str__(self):
        return "{} | W-L: {}-{} | PD: {} | OR: {} | DR: {}".format(self.name, self.wins, self.losses, self.diff, self.oRank, self.dRank)

In [4]:
class Game:
    def __init__(self, week, away, awayScore, home, homeScore, awayDR, awayOR, awayPD, homeDR, homeOR, homePD, result):
        self.week = week
        self.away = away
        self.awayScore = awayScore
        self.home = home
        self.homeScore = homeScore
        self.awayDR = awayDR
        self.awayOR = awayOR
        self.awayPD = awayPD
        self.homeDR = homeDR
        self.homeOR = homeOR
        self.homePD = homePD
        
        self.result = result
        
    def __str__(self):
        return "AWAY: {}: {} | HOME: {}: {} | RESULT IS HOME BY: {}".format(self.away, self.awayScore, self.home, self.homeScore, self.result)

In [5]:
def populateTeams():
    url = "https://nflgamedata.com/schedule.php?season=2022&week={}".format(CURRENT_WEEK)
    table = pd.read_html(url)
    summary = table[2][0]
    summary = " ".join(summary)
    summary = summary.split("  ")

    for i in range(10, len(summary)):
        if i % 6 == 4:
            pass # rank seems useless, i'll use w/l and pd
        elif i % 6 == 5:
            name = summary[i]
        elif i % 6 == 0:
            wins = int(summary[i].split("-")[0])
            losses = int(summary[i].split("-")[1])
        elif i % 6 == 1:
            diff = int(summary[i])
        elif i % 6 == 2:
            oRank = int(summary[i])
        else: # i % 6 == 3
            dRank = int(summary[i])
            TEAMS[name] = Team(name, wins, losses, diff, oRank, dRank)

In [6]:
def populateGamesList(week):
    time.sleep(0.2) # FOR SAKE OF NFLGAMEDATA
    url = "https://nflgamedata.com/schedule.php?season=2022&week={}".format(week)
    table = pd.read_html(url)
    game_data = table[2][1]
    game_data = " ".join(game_data)
    game_data = game_data.split("  ")

    awayNext = True
    for i in range(len(game_data)):
        if "-- BYE --" in str(game_data[i]):
            break
        if game_data[i] in ALL_TEAMS:
            if "-- BYE --" in str(game_data[i + 1]):
                break
            if awayNext:
                away = game_data[i]
                try:
                    awayScore = int(game_data[i+1])
                except ValueError:
                    awayScore = 0
                awayNext = False
            else:
                home = game_data[i]
                try:
                    homeScore = int(game_data[i-1])
                except ValueError:
                    homeScore = 0
                awayDR = TEAMS[away].dRank
                awayOR = TEAMS[away].oRank
                awayPD = TEAMS[away].diff
                homeDR = TEAMS[home].dRank
                homeOR = TEAMS[home].oRank
                homePD = TEAMS[home].diff
                GAMES_LIST.append(Game(week, away, awayScore, home, homeScore, awayDR, awayOR, awayPD, homeDR, homeOR, homePD, awayScore - homeScore))
                awayNext = True

In [7]:
def populateGamesListUntilWeek(i):
    populateTeams() # populates TEAMS
    
    for i in range(1, i + 1): # populates GAMES_LIST
        populateGamesList(i)
        print(i)
        
    GAMES_DF = pd.DataFrame()

    # populate GAMES_DF with all games from GAMES_LIST
    # todo: this can be much more efficient
    GAMES_DF['Week'] = [game.week for game in GAMES_LIST]
    GAMES_DF['Away'] = [game.away for game in GAMES_LIST]
    GAMES_DF['Away Score'] = [game.awayScore for game in GAMES_LIST]
    GAMES_DF['Home'] = [game.home for game in GAMES_LIST]
    GAMES_DF['Home Score'] = [game.homeScore for game in GAMES_LIST]
    GAMES_DF['Away DR'] = [game.awayDR for game in GAMES_LIST]
    GAMES_DF['Away OR'] = [game.awayOR for game in GAMES_LIST]
    GAMES_DF['Away PD'] = [game.awayPD for game in GAMES_LIST]
    GAMES_DF['Home DR'] = [game.homeDR for game in GAMES_LIST]
    GAMES_DF['Home OR'] = [game.homeOR for game in GAMES_LIST]
    GAMES_DF['Home PD'] = [game.homePD for game in GAMES_LIST]
    GAMES_DF['Result'] = [game.result for game in GAMES_LIST]

    # populating Team.games for team in TEAMS
    for index, row in GAMES_DF.iterrows():
        TEAMS[row["Away"]].addGame(row["Week"], GAMES_LIST[index])
        TEAMS[row["Home"]].addGame(row["Week"], GAMES_LIST[index])
        
    GAMES_DF["Away SOS"] = [TEAMS[team].getStrengthOfSchedule() for team in GAMES_DF["Away"]]
    GAMES_DF["Home SOS"] = [TEAMS[team].getStrengthOfSchedule() for team in GAMES_DF["Home"]]
        
    return GAMES_DF

In [8]:
def predictWeek(week):
    GAMES_DF = populateGamesListUntilWeek(week)
    
    X = GAMES_DF[["Week", "Away DR", "Away OR", "Away PD", "Home DR", "Home OR", "Home PD", "Away SOS", "Home SOS"]]
    y = GAMES_DF[["Week", "Result", "Away Score", "Home Score"]]
    X_train = X[X["Week"] < week].drop("Week", axis=1)
    X_test = X[X["Week"] == week].drop("Week", axis=1)
    print(X_train)
    print(X_test)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
    print(X_train)
    print(X_test)

    y_train_result = y[y["Week"] < week]["Result"]
    y_test_result = y[y["Week"] == week]["Result"]

    y_train_away = y[y["Week"] < week]["Away Score"]
    y_test_away = y[y["Week"] == week]["Away Score"]

    y_train_home = y[y["Week"] < week]["Home Score"]
    y_test_home = y[y["Week"] == week]["Home Score"]
    
    mdl_result = LinearRegression().fit(X_train, y_train_result)
    # temporary start
    features = ["Away DR", "Away OR", "Away PD", "Home DR", "Home OR", "Home PD", "Away SOS", "Home SOS"]
    coefs = list(mdl_result.coef_)
    for i in range(len(features)):
        print(features[i], coefs[i])
    # temporary end
    mdl_away = LinearRegression().fit(X_train, y_train_away)
    mdl_home = LinearRegression().fit(X_train, y_train_home)

    preds_result = mdl_result.predict(X_test)
    preds_away = mdl_away.predict(X_test)
    preds_home = mdl_home.predict(X_test)

    analysis = pd.DataFrame()
    analysis["away"] = GAMES_DF[X_train.shape[0]:]["Away"]
    analysis["true away score"] = GAMES_DF[X_train.shape[0]:]["Away Score"]
    analysis["pred away"] = [round(float(pred), 2) for pred in list(preds_away)]
    analysis["pred home"] = [round(float(pred), 2) for pred in list(preds_home)]
    analysis["true home score"] = GAMES_DF[X_train.shape[0]:]["Home Score"]
    analysis["home"] = GAMES_DF[X_train.shape[0]:]["Home"]
    analysis["true result"] = GAMES_DF[X_train.shape[0]:]["Result"]
    analysis["pred result"] = [round(float(pred), 2) for pred in list(preds_result)]
    analysis["spread pred"] = SPREADS[week]
    analysis["spread pred error"] = abs(analysis["pred result"] - analysis["spread pred"])
    return analysis


In [9]:
result = predictWeek(CURRENT_WEEK)

1
2
3
4
5
6
7
8
9
10
11
12
13
     Away DR  Away OR  Away PD  Home DR  Home OR  Home PD  Away SOS  Home SOS
0         10        2      110       15       30      -75  1.461124  1.307642
1         14       23      -31       28       14      -21  1.060781  0.796055
2         30        6      -23       13       29      -36  1.152639  0.875432
3          4        9       76       31       16      -54  0.960048  1.342167
4         26       26      -74       11        4       54  1.670906  1.314939
..       ...      ...      ...      ...      ...      ...       ...       ...
175       29       13      -11       24        8       10  0.944110  0.735186
176       15       30      -75       19        1       83  1.307642  1.034084
177       14       23      -31        4        9       76  1.060781  0.960048
178       21       21      -48        3        5       87  1.830965  1.092935
179       26       26      -74        7       31      -47  1.670906  1.575795

[180 rows x 8 columns]
     Away 

In [10]:
result

Unnamed: 0,away,true away score,pred away,pred home,true home score,home,true result,pred result,spread pred,spread pred error
180,BUF,0,24.88,17.73,0,NE,0,7.15,4.5,2.65
181,PIT,0,19.77,23.81,-2,ATL,2,-4.04,-2.0,2.04
182,DEN,0,13.35,24.39,0,BAL,0,-11.04,-8.5,2.54
183,GB,-3,22.66,23.38,0,CHI,-3,-0.72,3.0,3.72
184,JAX,0,29.3,27.1,-1,DET,1,2.2,-1.0,3.2
185,CLE,-7,26.98,24.08,0,HOU,-7,2.91,7.0,4.09
186,NYJ,0,20.97,19.52,-3,MIN,3,1.45,-3.0,4.45
187,WAS,0,19.72,18.89,0,NYG,0,0.83,2.5,1.67
188,TEN,0,14.11,22.23,0,PHI,0,-8.12,-5.5,2.62
189,SEA,-8,24.81,21.33,0,LA,-8,3.48,8.0,4.52


# 2022 code above

---------------------------------------------------------------------------------------------

# 2021 code below

# NFL GAME PREDICTOR

https://nflgamedata.com/schedule.php?season=2021&week=1

https://rbsdm.com/stats/box_scores/?_inputs_&type=%22reg%22&away=%22DAL%22&home=%22TB%22&year=%222021%22

In [None]:
# GLOBAL CONSTANTS
ALL_TEAMS = ["BUF", "NE", "MIA", "NYJ", "BAL", "CIN", "CLE", "PIT", "TEN",
            "HOU", "IND", "JAX", "LAC", "DEN", "LV", "KC", "DAL", "WAS",
            "PHI", "NYG", "GB", "CHI", "MIN", "DET", "TB", "CAR", "NO",
            "ATL", "ARI", "LA", "SEA", "SF"]

CURRENT_WEEK = 14
SPREADS = {} # spreads are from the pov of the home team
SPREADS[5] = [1.5, -2.5, -3, 3, 8.5, 4.5, -10, 1.5, -9.5, 2.5, -2.5, -5.5, -5.5, -7, -3, -7]
SPREADS[6] = [6.5, 3.5, -3, 1, 4.5, 3.5, -9.5, 10.5, 7, -2.5, -3.5, 4, -4.5, 5.5]
SPREADS[7] = [-3.5, -6, -9.5, 2.5, -7.5, 3, 5.5, -15, -3, -18.5, -12.5, -4, 4.5]
SPREADS[8] = [-6, -3, -13.5, 3, -3.5, 3, 14.5, 1.5, 9, 5.5, 0, -3, 0, 2.5, 0]
SPREADS[9] = [] # didnt do week 9 oops
SPREADS[10] = [7.5, -10, -10.5, -1, 13, -9, -3, 10, -10, -2.5, -2.5, -3.5, 2.5, 4]
SPREADS[13] = [0, 11, 7.5, -3, 7.5, 10, -6, 7, -1, -13, 4, 3.5, -10, -2.5]
SPREADS[14] = [-3.5, -3, -2.5, 7.5, -9.5, 5, -8.5, 4, -8, -10.5, -1, -3.5, -13, -3]

week = CURRENT_WEEK
GAMES_LIST = []
WEEK_6_GAMES = []
TEAMS = {}
GAMES_DF = pd.DataFrame()

In [None]:
# def pullMatchups(week):
#     time.sleep(1) # FOR SAKE OF NFLGAMEDATA
#     url = "https://nflgamedata.com/schedule.php?season=2021&week={}".format(week)
#     table = pd.read_html(url)
#     matchups = table[2][1]
#     matchups = " ".join(matchups).split(" ")

#     this_week = []
#     for i in range(len(matchups)):
#         if matchups[i] in ALL_TEAMS:
#             this_week.append(matchups[i])

#     games = []
#     for i in range(0, len(this_week), 2):
#         games.append((this_week[i], this_week[i+1]))

#     return games

# # m = pullMatchups(week = 2)

# # for matchup in m:
# #     url = "https://rbsdm.com/stats/box_scores/?_inputs_&type=%22reg%22&away=%22{}%22&home=%22{}%22&year=%222021%22"
# #     url = url.format(matchup[0], matchup[1])

# #     r = requests.get(url)
# #     bs = BeautifulSoup(r.content)
# #     bs 

# # TODO, pull stats from rbsdm