In [1]:
# probably going to have to install this if you haven't already
#from sportsreference.nfl.boxscore import Boxscores, Boxscore
from sportsipy.nfl.boxscore import Boxscores, Boxscore

# the usual imports
import pandas as pd
import numpy as np
import hvplot.pandas

pd.set_option('display.max_columns', None)

In [2]:
# Box-score statistics (without the away_/home_ prefix) that game_data keeps
# for each team, in output column order.
_GAME_STAT_COLUMNS = [
    'first_downs', 'fourth_down_attempts', 'fourth_down_conversions',
    'fumbles', 'fumbles_lost', 'interceptions', 'net_pass_yards',
    'pass_attempts', 'pass_completions', 'pass_touchdowns', 'pass_yards',
    'penalties', 'points', 'rush_attempts', 'rush_touchdowns', 'rush_yards',
    'third_down_attempts', 'third_down_conversions', 'time_of_possession',
    'times_sacked', 'total_yards', 'turnovers', 'yards_from_penalties',
    'yards_lost_from_sacks',
]


def _possession_seconds(clock):
    """Convert a 'MM:SS' (or 'M:SS') clock string into whole seconds."""
    minutes, seconds = clock.split(':')
    return int(minutes) * 60 + int(seconds)


def _game_outcome(away_score, home_score):
    """Return ((away_won, away_lost), (home_won, home_lost)) for one game.

    A tie is recorded as neither a win nor a loss for both teams. Scores that
    cannot be compared (e.g. None) give NaN flags for both teams.
    """
    try:
        if away_score > home_score:
            return (1, 0), (0, 1)
        if away_score < home_score:
            return (0, 1), (1, 0)
        return (0, 0), (0, 0)
    except TypeError:
        return (np.nan, np.nan), (np.nan, np.nan)


def _team_frame(game_df, game_stats, side, won, lost):
    """Build the one-row frame for one side ('away' or 'home') of a game."""
    prefix = side + '_'
    team_df = game_df[[prefix + 'name', prefix + 'abbr', prefix + 'score']].rename(
        columns={prefix + 'name': 'team_name', prefix + 'abbr': 'team_abbr', prefix + 'score': 'score'})
    outcome_df = pd.DataFrame({'game_won': [won], 'game_lost': [lost]})
    team_df = pd.merge(team_df, outcome_df, left_index=True, right_index=True)

    # The stats frame is re-indexed from 0 so it lines up with game_df's index.
    stats_df = game_stats.dataframe[[prefix + stat for stat in _GAME_STAT_COLUMNS]]
    stats_df = stats_df.reset_index(drop=True).rename(
        columns={prefix + stat: stat for stat in _GAME_STAT_COLUMNS})
    return pd.merge(team_df, stats_df, left_index=True, right_index=True)


def game_data(game_df,game_stats):
    """Extract per-team statistics for a single game.

    This function creates the foundation for the final dataset: it is called
    once per game and returns one row per team with identical column names.

    Parameters:
        game_df: one-row DataFrame (index 0) with the columns
            away_name/away_abbr/away_score and home_name/home_abbr/home_score.
        game_stats: object whose ``dataframe`` attribute holds the
            away_*/home_* box-score statistics listed in _GAME_STAT_COLUMNS.

    Returns:
        (away_team_df, home_team_df). Each has the columns team_name,
        team_abbr, score, game_won, game_lost followed by the statistics,
        with time_of_possession converted to seconds. Both are empty
        DataFrames when a TypeError occurs while building them (for example
        when ``game_stats.dataframe`` is None).
    """
    try:
        away_result, home_result = _game_outcome(
            game_df.loc[0, 'away_score'], game_df.loc[0, 'home_score'])
        away_team_df = _team_frame(game_df, game_stats, 'away', *away_result)
        home_team_df = _team_frame(game_df, game_stats, 'home', *home_result)

        # Convert time_of_possession from a clock string into seconds. The
        # string is split on ':' rather than sliced at fixed positions so a
        # single-digit minute count ("9:45") is handled; if either value is
        # missing or malformed both teams get NaN.
        try:
            away_seconds = _possession_seconds(away_team_df.loc[0, 'time_of_possession'])
            home_seconds = _possession_seconds(home_team_df.loc[0, 'time_of_possession'])
        except (AttributeError, TypeError, ValueError):
            away_seconds = home_seconds = np.nan
        away_team_df['time_of_possession'] = away_seconds
        home_team_df['time_of_possession'] = home_seconds
    except TypeError:
        away_team_df = pd.DataFrame()
        home_team_df = pd.DataFrame()
    return away_team_df, home_team_df

def game_data_up_to_week(weeks,year):
    """Collect the per-team game rows for every week in ``weeks``.

    For each week the Boxscores listing is fetched, every game in it is
    expanded with game_data(), and the resulting away/home rows are tagged
    with the week number.

    Parameters:
        weeks: sequence of week numbers to download.
        year: season year passed to Boxscores.

    Returns:
        DataFrame with two rows per game (away team first, then home team)
        and a ``week`` column, in the order the weeks and games were read.
    """
    all_weeks_df = pd.DataFrame()
    for week in weeks:
        # Boxscores keys its games by "<week>-<year>".
        week_key = f'{week}-{year}'
        scoreboard = Boxscores(week, year)
        single_week_df = pd.DataFrame()
        for game in scoreboard.games[week_key]:
            game_stats = Boxscore(game['boxscore'])
            game_df = pd.DataFrame(game, index = [0])
            away_team_df, home_team_df = game_data(game_df, game_stats)
            away_team_df['week'] = week
            home_team_df['week'] = week
            single_week_df = pd.concat([single_week_df, away_team_df])
            single_week_df = pd.concat([single_week_df, home_team_df])
        all_weeks_df = pd.concat([all_weeks_df, single_week_df])
    return all_weeks_df

def get_schedule(year, weeks=None):
    """Build the season schedule, one row per game.

    Parameters:
        year: season year passed to Boxscores.
        weeks: optional iterable of week numbers to fetch. Defaults to weeks
            1 through 17, which was previously hard-coded; pass e.g.
            ``range(1, 19)`` for a season with 18 weeks.

    Returns:
        DataFrame with the columns away_name, away_abbr, home_name,
        home_abbr, winning_name, winning_abbr and week, indexed 0..n-1.
    """
    if weeks is None:
        weeks = range(1, 18)
    game_columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name', 'winning_abbr']

    schedule_df = pd.DataFrame()
    for week in weeks:
        # Boxscores keys its games by "<week>-<year>".
        date_string = f'{week}-{year}'
        week_scores = Boxscores(week, year)
        week_games_df = pd.DataFrame()
        for game_info in week_scores.games[date_string]:
            game = pd.DataFrame(game_info, index=[0])[game_columns].assign(week=week)
            week_games_df = pd.concat([week_games_df, game])
        schedule_df = pd.concat([schedule_df, week_games_df]).reset_index(drop=True)
    return schedule_df

# Columns that identify a scheduled game; used as the join keys when the
# away-side and home-side frames are recombined.
_MATCHUP_KEYS = ['away_name', 'away_abbr', 'home_name', 'home_abbr',
                 'winning_name', 'winning_abbr', 'week']

# Per-team season-to-date features that become `<name>_dif` columns
# (away minus home), in output column order.
_DIFF_FEATURES = [
    'win_perc', 'first_downs', 'fumbles', 'interceptions', 'net_pass_yards',
    'pass_attempts', 'pass_completions', 'pass_touchdowns', 'pass_yards',
    'penalties', 'points', 'rush_attempts', 'rush_touchdowns', 'rush_yards',
    'time_of_possession', 'times_sacked', 'total_yards', 'turnovers',
    'yards_from_penalties', 'yards_lost_from_sacks', 'fourth_down_perc',
    'third_down_perc',
]

# 'fumbles_lost' is averaged per team but no differential is computed for it.
# NOTE(review): that looks like an omission, but adding a fumbles_lost_dif
# column would change the feature set used by the models, so it is left out.
_SIDE_FEATURES = _DIFF_FEATURES + ['fumbles_lost']


def _season_to_date_team_stats(prior_games_df):
    """Average each team's stats and compute its win percentage."""
    team_keys = ['team_name', 'team_abbr']
    means_df = (prior_games_df
                .drop(columns=['score', 'week', 'game_won', 'game_lost'])
                .groupby(by=team_keys).mean().reset_index())

    record_df = (prior_games_df[team_keys + ['game_won', 'game_lost']]
                 .groupby(by=team_keys).sum().reset_index())
    record_df['win_perc'] = record_df['game_won'] / (record_df['game_won'] + record_df['game_lost'])
    record_df = record_df.drop(columns=['game_won', 'game_lost'])

    # Series division never raises ZeroDivisionError; a 0/0 conversion rate
    # comes out as NaN and is reported as 0.
    for down in ('fourth', 'third'):
        rate = means_df[f'{down}_down_conversions'] / means_df[f'{down}_down_attempts']
        means_df[f'{down}_down_perc'] = rate.fillna(0)
    means_df = means_df.drop(columns=['fourth_down_attempts', 'fourth_down_conversions',
                                      'third_down_attempts', 'third_down_conversions'])
    return pd.merge(record_df, means_df, on=team_keys)


def _attach_team_stats(games_df, team_stats_df, side):
    """Join team stats onto the games for one side ('away' or 'home')."""
    joined_df = pd.merge(games_df, team_stats_df, how='inner',
                         left_on=[f'{side}_name', f'{side}_abbr'],
                         right_on=['team_name', 'team_abbr'])
    joined_df = joined_df.drop(columns=['team_name', 'team_abbr'])
    return joined_df.rename(columns={name: f'{side}_{name}' for name in _SIDE_FEATURES})


def agg_weekly_data(schedule_df,weeks_games_df,current_week,weeks):
    """Build one feature row per scheduled game from prior-week averages.

    For every week after the first entry of ``weeks``, each team's statistics
    from all earlier weeks are averaged and the away-minus-home difference of
    every feature is attached to that week's games.

    Parameters:
        schedule_df: schedule with the columns in _MATCHUP_KEYS.
        weeks_games_df: per-team game rows (team_name, team_abbr, score,
            game_won, game_lost, the box-score statistics, week).
        current_week: only schedule rows with week < current_week are used.
        weeks: ordered week numbers; the first one is skipped because it has
            no earlier games to aggregate.

    Returns:
        DataFrame with away_name, away_abbr, home_name, home_abbr, week, one
        ``<feature>_dif`` column per entry of _DIFF_FEATURES, and ``result``
        (1.0 when the away team won, 0.0 otherwise, NaN for an unfinished
        current week).
    """
    schedule_df = schedule_df[schedule_df.week < current_week]
    weekly_frames = []
    for week in weeks[1:]:
        games_df = schedule_df[schedule_df.week == week]
        team_stats_df = _season_to_date_team_stats(weeks_games_df[weeks_games_df.week < week])

        away_df = _attach_team_stats(games_df, team_stats_df, 'away')
        home_df = _attach_team_stats(games_df, team_stats_df, 'home')
        matchups_df = pd.merge(away_df, home_df, on=_MATCHUP_KEYS)

        for name in _DIFF_FEATURES:
            matchups_df[f'{name}_dif'] = matchups_df[f'away_{name}'] - matchups_df[f'home_{name}']
        matchups_df = matchups_df.drop(
            columns=[f'{side}_{name}' for side in ('away', 'home') for name in _SIDE_FEATURES])

        if week == current_week and week > 3 and matchups_df['winning_name'].isnull().values.any():
            matchups_df['result'] = np.nan
            print(f"Week {week} games have not finished yet.")
        else:
            # A missing winning_name compares unequal, so an unplayed game is
            # recorded as 0.0 here rather than NaN.
            matchups_df['result'] = (matchups_df['winning_name'] == matchups_df['away_name']).astype('float')
        weekly_frames.append(matchups_df.drop(columns=['winning_name', 'winning_abbr']))

    if not weekly_frames:
        return pd.DataFrame()
    agg_games_df = pd.concat(weekly_frames).reset_index(drop=True)
    # NOTE(review): the original unconditionally dropped the row labelled 20
    # with no recorded reason (its own comment asks what the drop is for).
    # The drop is kept so existing outputs do not change, but it no longer
    # raises KeyError when fewer than 21 rows are produced. Confirm whether
    # it is still wanted.
    return agg_games_df.drop(index=20, errors='ignore')

def get_elo():
    """Download the FiveThirtyEight Elo file and keep the pre-game ratings.

    Returns:
        DataFrame of games dated before the hard-coded cut-off, with the
        listed post-game/probability columns removed and team1/team2
        rewritten to the abbreviations that merge_rankings() joins against
        home_abbr/away_abbr.
    """
    unused_columns = [
        'season','neutral' ,'playoff', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
        'qbelo1_pre', 'qbelo2_pre', 'qb1', 'qb2', 'qb1_adj', 'qb2_adj', 'qbelo_prob1', 'qbelo_prob2',
        'qb1_game_value', 'qb2_game_value', 'qb1_value_post', 'qb2_value_post',
        'qbelo1_post', 'qbelo2_post', 'score1', 'score2', 'quality', 'importance', 'total_rating',
    ]
    # Abbreviations as they appear in the Elo file, and their replacements
    # (the two lists are matched position by position).
    elo_abbrs = [
        'KC', 'JAX', 'CAR', 'BAL', 'BUF', 'MIN', 'DET', 'ATL', 'NE', 'WSH',
        'CIN', 'NO', 'SF', 'LAR', 'NYG', 'DEN', 'CLE', 'IND', 'TEN', 'NYJ',
        'TB', 'MIA', 'PIT', 'PHI', 'GB', 'CHI', 'DAL', 'ARI', 'LAC', 'HOU',
        'SEA', 'OAK',
    ]
    schedule_abbrs = [
        'kan','jax','car', 'rav', 'buf', 'min', 'det', 'atl', 'nwe', 'was',
        'cin', 'nor', 'sfo', 'ram', 'nyg', 'den', 'cle', 'clt', 'oti', 'nyj',
        'tam','mia', 'pit', 'phi', 'gnb', 'chi', 'dal', 'crd', 'sdg', 'htx', 'sea', 'rai',
    ]

    ratings = pd.read_csv('https://projects.fivethirtyeight.com/nfl-api/nfl_elo_latest.csv')
    ratings = ratings.drop(columns = unused_columns)
    ratings.date = pd.to_datetime(ratings.date)
    # NOTE(review): hard-coded cut-off date; update it for other seasons.
    ratings = ratings[ratings.date < '01-05-2022']

    for team_column in ('team1', 'team2'):
        ratings[team_column] = ratings[team_column].replace(elo_abbrs, schedule_abbrs)
    return ratings

def merge_rankings(agg_games_df,elo_df):
    """Attach Elo and quarterback-value differentials to each game row.

    Rows are inner-joined with home_abbr matched to team1 and away_abbr
    matched to team2, so both differentials are away minus home.

    Parameters:
        agg_games_df: game rows containing home_abbr and away_abbr.
        elo_df: ratings with date, team1, team2, elo1_pre, elo2_pre,
            qb1_value_pre and qb2_value_pre.

    Returns:
        The joined rows with ``elo_dif`` and ``qb_dif`` appended and the raw
        rating, date and team columns removed.
    """
    rated_df = agg_games_df.merge(
        elo_df,
        how = 'inner',
        left_on = ['home_abbr', 'away_abbr'],
        right_on = ['team1', 'team2'],
    )
    rated_df = rated_df.drop(columns = ['date','team1', 'team2'])
    rated_df = rated_df.assign(
        elo_dif = rated_df['elo2_pre'] - rated_df['elo1_pre'],
        qb_dif = rated_df['qb2_value_pre'] - rated_df['qb1_value_pre'],
    )
    return rated_df.drop(columns = ['elo1_pre', 'elo2_pre', 'qb1_value_pre', 'qb2_value_pre'])

def prep_test_train(current_week,weeks,year):
    """Download, aggregate and split the season into test and train frames.

    Parameters:
        current_week: the week to predict.
        weeks: week numbers whose box scores are downloaded.
        year: season year.

    Returns:
        (test_df, train_df): the rows whose week equals ``current_week``,
        and every row whose ``result`` is not null.
    """
    # agg_weekly_data keeps schedule rows with week < its cut-off, so the
    # cut-off is one past the week being predicted.
    cutoff_week = current_week + 1

    schedule_df = get_schedule(year)
    weeks_games_df = game_data_up_to_week(weeks, year)
    agg_games_df = agg_weekly_data(schedule_df, weeks_games_df, cutoff_week, weeks)
    agg_games_df = merge_rankings(agg_games_df, get_elo())

    has_result = agg_games_df.result.notna()
    is_current_week = agg_games_df.week == current_week
    train_df = agg_games_df[has_result]
    test_df = agg_games_df[is_current_week]
    return test_df, train_df

def display(y_pred,X_test):
    """Print one sentence per game with the away team's win probability.

    Parameters:
        y_pred: indexable sequence of probabilities (0-1) that the away team
            wins, in the same order as the rows of ``X_test``.
        X_test: DataFrame with ``away_name`` and ``home_name`` columns.

    The probability is truncated (not rounded) to a whole percentage.
    NOTE(review): in a notebook this name shadows IPython's built-in
    display().
    """
    # Re-number the rows 0..n-1 so they can be looked up by position in y_pred.
    games = X_test.reset_index().drop(columns = 'index')
    for position in range(len(y_pred)):
        win_prob = int(y_pred[position] * 100)
        away_team = games.loc[position, 'away_name']
        home_team = games.loc[position, 'home_name']
        print(f'The {away_team} have a probability of {win_prob}% of beating the {home_team}.')

## Get all the data for the 2021 season, up to and including the week being predicted (`current_week`)

In [3]:
# This step takes about five minutes to run.
# current_week is the week to predict; box scores are downloaded for weeks
# 1 through current_week.
current_week = 14
weeks = list(range(1,current_week + 1))
year = 2021

# prep_test_train returns (test_df, train_df): the games of current_week and
# the games whose result is not null.
pred_games_df, comp_games_df = prep_test_train(current_week,weeks,year)

In [4]:
# Drop the duplicate rows that get picked up for the last week.
pred_games_df = pred_games_df.drop_duplicates()

In [5]:
# Stack the completed games and the games to predict into one frame,
# dropping any rows duplicated between the two.
df = pd.concat([comp_games_df, pred_games_df], axis=0).drop_duplicates()

# Write out the 2021 data so it doesn't have to be downloaded every time.
df.to_csv("2021_week_2_through_14.csv", index = False)

In [6]:
# Read the cached dataframe back in.
# NOTE(review): the original comment also mentioned dropping an unneeded
# column, but no column is dropped here.
df = pd.read_csv('2021_week_2_through_14.csv')

# Logistic Regression

### Logistic Regression Predictions

In [7]:
# for models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [8]:
# Week to predict: train on every earlier week, predict this one.
pred_week = 12
comp_games_df = df[df['week'] < pred_week]
pred_games_df = df[df['week'] == pred_week]

In [9]:
train_df = comp_games_df
test_df = pred_games_df

# Features are every column except the team identifiers, the week and the
# target; the target is the `result` column (kept as a one-column frame).
X_train = train_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_train = train_df[['result']] 
X_test = test_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_test = test_df[['result']]

In [10]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the test features
# (the test set is scaled with the training statistics).
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Logistic models: one fitted on the raw features and one on the scaled
# features, both with the same hyper-parameters.
log_reg_settings = dict(
    penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True,
    intercept_scaling=1, class_weight='balanced', random_state=None,
    solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0,
)
clf_unscaled = LogisticRegression(**log_reg_settings)
clf_scaled = LogisticRegression(**log_reg_settings)

# scikit-learn expects a 1-D label array, so flatten the one-column frame.
train_labels = np.ravel(y_train.values)
clf_unscaled.fit(X_train, train_labels)
clf_scaled.fit(X_train_scaled, train_labels)

# Column 1 of predict_proba is the probability of result == 1 (away win).
y_pred_unscaled = clf_unscaled.predict_proba(X_test)[:, 1]
y_pred_scaled = clf_scaled.predict_proba(X_test_scaled)[:, 1]

for heading, probabilities in (
    ("Logistic Regression - Unscaled\n", y_pred_unscaled),
    ("\nLogistic Regression - Scaled\n", y_pred_scaled),
):
    print(heading)
    display(probabilities, test_df)

Logistic Regression - Unscaled

The Chicago Bears have a probability of 57% of beating the Detroit Lions.
The Las Vegas Raiders have a probability of 30% of beating the Dallas Cowboys.
The Buffalo Bills have a probability of 80% of beating the New Orleans Saints.
The Philadelphia Eagles have a probability of 55% of beating the New York Giants.
The Tennessee Titans have a probability of 50% of beating the New England Patriots.
The Carolina Panthers have a probability of 44% of beating the Miami Dolphins.
The Atlanta Falcons have a probability of 46% of beating the Jacksonville Jaguars.
The New York Jets have a probability of 59% of beating the Houston Texans.
The Tampa Bay Buccaneers have a probability of 43% of beating the Indianapolis Colts.
The Pittsburgh Steelers have a probability of 29% of beating the Cincinnati Bengals.
The Los Angeles Chargers have a probability of 41% of beating the Denver Broncos.
The Minnesota Vikings have a probability of 49% of beating the San Francisco 49e

### Get Logistic Regression Accuracy

In [12]:
def accuracy_score_log_reg(df):
    """Back-test the logistic regression week by week.

    For every week in ``df`` except the first and the last one present, two
    models (raw and standard-scaled features) are trained on all earlier
    weeks and scored on that week. Accuracy is reported on all games and on
    only the games whose predicted probability is below 40% or above 60%.

    Parameters:
        df: game rows with the identifier columns, ``week``, ``result`` and
            the feature columns.

    Returns:
        DataFrame indexed by week with one accuracy column per variant.
    """
    non_feature_columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result']
    log_reg_settings = dict(
        penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True,
        intercept_scaling=1, class_weight='balanced', random_state=None,
        solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0,
    )

    accuracy = pd.DataFrame(
        columns=[
            'week',
            'Logistic Regression Accuracy - Unscaled',
            'Logistic Regeression Accuracy - Scaled',
            'Log Reg - Unscaled - drop 40% to 60%',
            'Log Reg - Scaled - drop 40% to 60%',
        ])

    # The first and last weeks present in df are not scored.
    for w in df['week'].unique()[1:-1]:
        earlier_games = df[df['week'] < w]
        week_games = df[df['week'] == w]

        X_train = earlier_games.drop(columns = non_feature_columns)
        y_train = earlier_games[['result']]
        X_test = week_games.drop(columns = non_feature_columns)
        y_test = week_games[['result']]

        # Scale both feature sets with statistics from the training weeks.
        X_scaler = StandardScaler().fit(X_train)
        X_train_scaled = X_scaler.transform(X_train)
        X_test_scaled = X_scaler.transform(X_test)

        clf_unscaled = LogisticRegression(**log_reg_settings)
        clf_scaled = LogisticRegression(**log_reg_settings)

        train_labels = np.ravel(y_train.values)
        clf_unscaled.fit(X_train, train_labels)
        clf_scaled.fit(X_train_scaled, train_labels)

        # Column 1 of predict_proba is the probability of result == 1.
        y_pred_unscaled = clf_unscaled.predict_proba(X_test)[:, 1]
        y_pred_scaled = clf_scaled.predict_proba(X_test_scaled)[:, 1]

        # Games the model is "confident" about: probability outside 40%-60%.
        confident_unscaled = (y_pred_unscaled < .4) | (y_pred_unscaled > .6)
        confident_scaled = (y_pred_scaled < .4) | (y_pred_scaled > .6)

        accuracy.loc[w, :] = [
            w,
            accuracy_score(y_test, np.round(y_pred_unscaled)),
            accuracy_score(y_test, np.round(y_pred_scaled)),
            accuracy_score(y_test[confident_unscaled], np.round(y_pred_unscaled[confident_unscaled])),
            accuracy_score(y_test[confident_scaled], np.round(y_pred_scaled[confident_scaled])),
        ]

    return accuracy

In [13]:
# Call the accuracy function on the full dataframe and show the per-week table.
logistic_regression_accuracy = accuracy_score_log_reg(df)
logistic_regression_accuracy

Unnamed: 0,week,Logistic Regression Accuracy - Unscaled,Logistic Regeression Accuracy - Scaled,Log Reg - Unscaled - drop 40% to 60%,Log Reg - Scaled - drop 40% to 60%
3,3,0.6,0.466667,0.538462,0.555556
4,4,0.6875,0.6875,0.666667,0.692308
5,5,0.5,0.625,0.466667,0.666667
6,6,0.857143,0.857143,0.833333,0.909091
7,7,0.769231,0.769231,0.75,0.8
8,8,0.466667,0.466667,0.5,0.466667
9,9,0.642857,0.714286,0.6,0.636364
10,10,0.5,0.5,0.5,0.538462
11,11,0.666667,0.6,0.777778,0.777778
12,12,0.666667,0.6,0.8,0.714286


# Tensorflow 

### Tensorflow Predictions

In [14]:
# import tensorflow libraries
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

In [15]:
# Week we want to predict.
pred_week = 14

# To build the tensorflow model we use only the three weeks prior to the week
# we are trying to predict (pred_week - 3 through pred_week - 1).
# Tensorflow tends to overfit models on the data it has.
comp_games_df = df[(df['week'] >= (pred_week - 3)) & (df['week'] < pred_week)]
pred_games_df = df[df['week'] == pred_week]

In [16]:
train_df = comp_games_df
test_df = pred_games_df

# Features are every column except the team identifiers, the week and the
# target; the target is the `result` column (kept as a one-column frame).
X_train = train_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_train = train_df[['result']] 
X_test = test_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_test = test_df[['result']]

## Fit and build the tensorflow model

In [17]:
# Define the number of inputs (features) to the model
number_input_features = X_train.shape[1]

# Define the number of neurons in the output layer
number_output_neurons = 1

# Define the number of hidden nodes for the three hidden layers.
# Each layer has half as many nodes as the one before it, rounded up.
hidden_nodes_layer1 =  (number_input_features + 1) // 2
hidden_nodes_layer2 = (hidden_nodes_layer1 + 1) // 2
hidden_nodes_layer3 = (hidden_nodes_layer2 + 1) // 2

# Create the Sequential model instance
nn = Sequential()

# Add the hidden layers
nn.add(Dense(units=hidden_nodes_layer1, activation='relu', input_dim=number_input_features))
nn.add(Dense(units=hidden_nodes_layer2, activation='relu'))
nn.add(Dense(units=hidden_nodes_layer3, activation='relu'))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=number_output_neurons, activation='sigmoid'))

# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                300       
                                                                 
 dense_1 (Dense)             (None, 6)                 78        
                                                                 
 dense_2 (Dense)             (None, 3)                 21        
                                                                 
 dense_3 (Dense)             (None, 1)                 4         
                                                                 
Total params: 403
Trainable params: 403
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Compile the Sequential model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model using 500 epochs and the (unscaled) training data
fit_model = nn.fit(X_train, y_train, epochs=500, verbose=0)

### Export the tensorflow model

In [19]:
# Set the model's file path
#file_path = "Resources/model_nn_2021_2_12_not_scaled_relu_relu_sigmoid_loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']_1000 epochs.h5"
# NOTE(review): the file name says weeks 11 through 12, but with
# pred_week = 14 the training frame covers weeks 11 through 13 - confirm
# the name is still accurate.
file_path = "Resources/model_nn_2021_weeks_11_through_12.h5"

# Export the model to an HDF5 file
nn.save(file_path)

### Import the tensorflow model

In [20]:
# Load a previously built model. By default this reloads the file written by
# the export cell (file_path); uncomment the line below to load another one.
#file_path = "Resources/model_nn_relu_relu_sigmoid_loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']_200 epochs.h5"

nn = tf.keras.models.load_model(file_path)

In [21]:
# Generate the model's predictions (sigmoid outputs) for the test data
nn_predictions_unscaled = nn.predict(X_test)

print("Tensorflow - Unscaled \n")
# squeeze() drops the length-1 output axis so each game is a single number.
# NOTE(review): `display` appears to be a helper defined earlier in the notebook
# (it is called with the predictions plus test_df and prints one sentence per
# game), not IPython's display - confirm.
display(nn_predictions_unscaled.squeeze(),test_df)

Tensorflow - Unscaled 

The Pittsburgh Steelers have a probability of 92% of beating the Minnesota Vikings.
The San Francisco 49ers have a probability of 81% of beating the Cincinnati Bengals.
The Atlanta Falcons have a probability of 92% of beating the Carolina Panthers.
The Dallas Cowboys have a probability of 1% of beating the Washington Football Team.
The Jacksonville Jaguars have a probability of 1% of beating the Tennessee Titans.
The New Orleans Saints have a probability of 95% of beating the New York Jets.
The Las Vegas Raiders have a probability of 0% of beating the Kansas City Chiefs.
The Seattle Seahawks have a probability of 0% of beating the Houston Texans.
The Baltimore Ravens have a probability of 99% of beating the Cleveland Browns.
The New York Giants have a probability of 83% of beating the Los Angeles Chargers.
The Detroit Lions have a probability of 81% of beating the Denver Broncos.
The Buffalo Bills have a probability of 0% of beating the Tampa Bay Buccaneers.
The

## Tensorflow Accuracy

In [22]:
# Score the fitted Tensorflow model separately on each week's games
def accuracy_score_tensorflow(df, model=None):
    """Evaluate an already-fitted Keras model week by week.

    Parameters
    ----------
    df : pandas.DataFrame
        Game-level data holding the identifier columns 'away_name',
        'away_abbr', 'home_name', 'home_abbr' and 'week', the target column
        'result', plus the feature columns the model expects.
    model : optional
        Any object exposing ``evaluate(X, y, verbose=0)`` whose return value
        has the accuracy at position 1.  Defaults to the notebook-level
        ``nn`` model, which keeps existing ``accuracy_score_tensorflow(df)``
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        Indexed by week, with the columns 'week' and
        'Tensorflow Accuracy - Unscaled'.

    Notes
    -----
    The first and last distinct weeks in ``df`` are skipped, mirroring
    ``accuracy_score_random_forest``.  Unlike that function, the model is NOT
    refit for each week: every week is scored with the same fitted model, so
    any week that was part of the model's training data yields an in-sample
    (optimistic) accuracy.
    """
    if model is None:
        # Fall back to the model built/loaded earlier in the notebook
        model = nn

    # Everything that is not a model feature: team identifiers, week, target
    non_feature_columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week', 'result']

    accuracy = pd.DataFrame(columns=['week', 'Tensorflow Accuracy - Unscaled'])

    for w in df['week'].unique()[1:-1]:
        test_df = df[df['week'] == w]

        X_test = test_df.drop(columns=non_feature_columns)
        y_test = test_df[['result']]

        # For the notebook's model (compiled with metrics=['accuracy']) evaluate()
        # returns [loss, accuracy]; only the accuracy is kept
        model_accuracy_unscaled = model.evaluate(X_test, y_test, verbose=0)

        # Store this week's score in a row labelled by the week number
        accuracy.loc[w, :] = [w, model_accuracy_unscaled[1]]

    return accuracy

In [23]:
# Run accuracy_score_tensorflow to get a DataFrame of per-week accuracies
tensorflow_accuracy = accuracy_score_tensorflow(df)
tensorflow_accuracy

Unnamed: 0,week,Tensorflow Accuracy - Unscaled
3,3,0.4
4,4,0.4375
5,5,0.375
6,6,0.571429
7,7,0.692308
8,8,0.666667
9,9,0.285714
10,10,0.5
11,11,1.0
12,12,1.0


In [24]:
# Place the Tensorflow scores beside the logistic regression ones; the Tensorflow
# frame's own 'week' column is dropped so the combined table has it only once
accuracy_df = pd.concat(
    [logistic_regression_accuracy, tensorflow_accuracy.drop(columns=['week'])], axis=1)
accuracy_df

Unnamed: 0,week,Logistic Regression Accuracy - Unscaled,Logistic Regeression Accuracy - Scaled,Log Reg - Unscaled - drop 40% to 60%,Log Reg - Scaled - drop 40% to 60%,Tensorflow Accuracy - Unscaled
3,3,0.6,0.466667,0.538462,0.555556,0.4
4,4,0.6875,0.6875,0.666667,0.692308,0.4375
5,5,0.5,0.625,0.466667,0.666667,0.375
6,6,0.857143,0.857143,0.833333,0.909091,0.571429
7,7,0.769231,0.769231,0.75,0.8,0.692308
8,8,0.466667,0.466667,0.5,0.466667,0.666667
9,9,0.642857,0.714286,0.6,0.636364,0.285714
10,10,0.5,0.5,0.5,0.538462,0.5
11,11,0.666667,0.6,0.777778,0.777778,1.0
12,12,0.666667,0.6,0.8,0.714286,1.0


# Random Forest Model

In [25]:
# Week whose games are to be predicted
pred_week = 11

# Games from earlier weeks form the training pool; the chosen week is held out
comp_games_df = df.loc[df['week'].lt(pred_week)]
pred_games_df = df.loc[df['week'].eq(pred_week)]

In [26]:
# Earlier weeks train the model, the prediction week tests it
train_df, test_df = comp_games_df, pred_games_df

# Features: every column except the team identifiers, the week and the target
X_train, X_test = (
    frame.drop(columns=['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week', 'result'])
    for frame in (train_df, test_df)
)

# Targets are kept as single-column DataFrames
y_train, y_test = train_df[['result']], test_df[['result']]

In [28]:
# Build, fit and score a random forest on the current train/test split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 100 trees with a fixed seed
rf_model = RandomForestClassifier(random_state=1, n_estimators=100)

# fit() hands back the estimator, so rf_fitted is the trained forest;
# the single-column target frame is flattened to 1-D first
rf_fitted = rf_model.fit(X_train, y_train.values.ravel())

# Column 1 of predict_proba (probability of the second class) for each test game,
# followed by the hard class predictions
y_pred = rf_fitted.predict_proba(X_test)[:, 1]
y = rf_fitted.predict(X_test)

# Print the per-game probabilities, then show the accuracy of the hard predictions
display(y_pred, test_df)
accuracy_score(y_test, y)

The New England Patriots have a probability of 67% of beating the Atlanta Falcons.
The Washington Football Team have a probability of 55% of beating the Carolina Panthers.
The Indianapolis Colts have a probability of 36% of beating the Buffalo Bills.
The New Orleans Saints have a probability of 45% of beating the Philadelphia Eagles.
The Houston Texans have a probability of 26% of beating the Tennessee Titans.
The Miami Dolphins have a probability of 55% of beating the New York Jets.
The Green Bay Packers have a probability of 63% of beating the Minnesota Vikings.
The San Francisco 49ers have a probability of 77% of beating the Jacksonville Jaguars.
The Detroit Lions have a probability of 12% of beating the Cleveland Browns.
The Baltimore Ravens have a probability of 67% of beating the Chicago Bears.
The Cincinnati Bengals have a probability of 72% of beating the Las Vegas Raiders.
The Arizona Cardinals have a probability of 65% of beating the Seattle Seahawks.
The Dallas Cowboys have 

0.8

In [29]:
# Walk through the weeks: fit on all earlier weeks, then score the current one
def accuracy_score_random_forest(df):
    """Return per-week random forest accuracy as a DataFrame indexed by week.

    For every distinct week in ``df`` except the first and the last, a fresh
    RandomForestClassifier (100 trees, random_state=1) is fitted on all rows
    with a smaller week number and scored on that week's rows.  Two figures
    are stored per week: the accuracy over all games, and the accuracy over
    only those games whose predicted probability (column 1 of predict_proba)
    lies below 0.4 or above 0.6.
    """
    # Columns that are not model features: team identifiers, week and target
    id_and_target_cols = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result']

    accuracy = pd.DataFrame(columns=['week', 'Random Forest Accuracy', 'Random Forest - drop 40% - 60%'])

    for w in df['week'].unique()[1:-1]:
        earlier_games = df[df['week'] < w]
        week_games = df[df['week'] == w]

        features_fit = earlier_games.drop(columns = id_and_target_cols)
        labels_fit = earlier_games[['result']]
        features_eval = week_games.drop(columns = id_and_target_cols)
        labels_eval = week_games[['result']]

        # A new forest per week so no later data leaks into the fit
        forest = RandomForestClassifier(n_estimators=100, random_state=1)
        forest.fit(features_fit, np.ravel(labels_fit.values))
        hard_calls = forest.predict(features_eval)
        class1_prob = forest.predict_proba(features_eval)[:,1]

        # Games the forest is reasonably sure about (outside the 40%-60% band)
        confident = (class1_prob < .4) | (class1_prob > .6)

        overall = accuracy_score(labels_eval, hard_calls)
        confident_only = accuracy_score(labels_eval[confident], hard_calls[confident])

        # One row per week, labelled by the week number
        accuracy.loc[w,:] = [w, overall, confident_only]

    return accuracy

In [30]:
# Run accuracy_score_random_forest to get a DataFrame of per-week accuracies
random_forest_accuracy = accuracy_score_random_forest(df)
random_forest_accuracy

Unnamed: 0,week,Random Forest Accuracy,Random Forest - drop 40% - 60%
3,3,0.6,0.75
4,4,0.625,0.7
5,5,0.625,0.8
6,6,0.785714,0.833333
7,7,0.846154,0.8
8,8,0.6,0.583333
9,9,0.642857,0.6
10,10,0.642857,0.545455
11,11,0.8,0.7
12,12,0.466667,0.583333


In [31]:
# One table with every model's weekly accuracy; only the logistic regression
# frame keeps its 'week' column so the column is not repeated
accuracy_df = pd.concat(
    [logistic_regression_accuracy]
    + [scores.drop(columns=['week']) for scores in (tensorflow_accuracy, random_forest_accuracy)],
    axis=1,
)

accuracy_df

Unnamed: 0,week,Logistic Regression Accuracy - Unscaled,Logistic Regeression Accuracy - Scaled,Log Reg - Unscaled - drop 40% to 60%,Log Reg - Scaled - drop 40% to 60%,Tensorflow Accuracy - Unscaled,Random Forest Accuracy,Random Forest - drop 40% - 60%
3,3,0.6,0.466667,0.538462,0.555556,0.4,0.6,0.75
4,4,0.6875,0.6875,0.666667,0.692308,0.4375,0.625,0.7
5,5,0.5,0.625,0.466667,0.666667,0.375,0.625,0.8
6,6,0.857143,0.857143,0.833333,0.909091,0.571429,0.785714,0.833333
7,7,0.769231,0.769231,0.75,0.8,0.692308,0.846154,0.8
8,8,0.466667,0.466667,0.5,0.466667,0.666667,0.6,0.583333
9,9,0.642857,0.714286,0.6,0.636364,0.285714,0.642857,0.6
10,10,0.5,0.5,0.5,0.538462,0.5,0.642857,0.545455
11,11,0.666667,0.6,0.777778,0.777778,1.0,0.8,0.7
12,12,0.666667,0.6,0.8,0.714286,1.0,0.466667,0.583333


In [37]:
# Save the combined accuracy table to a CSV file
accuracy_df.to_csv('Resources/accuracy_df.csv')

In [32]:
# Column totals over the scored weeks, as a rough overall comparison of the models
# (the total of the 'week' column itself carries no meaning)
accuracy_df.sum()

week                                             88
Logistic Regression Accuracy - Unscaled    6.928159
Logistic Regeression Accuracy - Scaled     6.857921
Log Reg - Unscaled - drop 40% to 60%       7.032906
Log Reg - Scaled - drop 40% to 60%         7.357176
Tensorflow Accuracy - Unscaled             6.928617
Random Forest Accuracy                     7.277106
Random Forest - drop 40% - 60%             7.673232
dtype: object

In [36]:
# Plot the accuracy dataframe: 'week' on the x-axis, the remaining columns plotted against it
accuracy_df.hvplot(x='week', width=1000, height=500)

In [41]:
%%HTML
<div class='tableauPlaceholder' id='viz1638998595666' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;NF&#47;NFLPredictionModelAccuracy&#47;NFLPredictionModelAccuracy&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='NFLPredictionModelAccuracy&#47;NFLPredictionModelAccuracy' /><param name='tabs' value='yes' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;NF&#47;NFLPredictionModelAccuracy&#47;NFLPredictionModelAccuracy&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en-US' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1638998595666');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.minWidth='800px';vizElement.style.maxWidth='100%';vizElement.style.minHeight='1250px';vizElement.style.maxHeight=(divElement.offsetWidth*0.75)+'px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.minWidth='800px';vizElement.style.maxWidth='100%';vizElement.style.minHeight='1250px';vizElement.style.maxHeight=(divElement.offsetWidth*0.75)+'px';} else { vizElement.style.width='100%';vizElement.style.minHeight='800px';vizElement.style.maxHeight=(divElement.offsetWidth*1.77)+'px';}                     var scriptElement = 
document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>