In [1]:
from sportsipy.nfl.boxscore import Boxscores, Boxscore
from sportsipy.nfl.schedule import Schedule, Game
from sportsipy.nfl.teams import Teams
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [7]:
# The game_Data function is extracting game statistics for each game. It loops through each game and each week grabbing the statistics referenced.
# This function creates the foundation for our final dataset.
def game_data(game_df,game_stats):
    try:
        # Creates a dataframe for the away_team and the home_team. Sets column names to be exact matches between the two.
        away_team_df = game_df[['away_name', 'away_abbr', 'away_score']].rename(columns = {'away_name': 'team_name', 'away_abbr': 'team_abbr', 'away_score': 'score'})
        home_team_df = game_df[['home_name','home_abbr', 'home_score']].rename(columns = {'home_name': 'team_name', 'home_abbr': 'team_abbr', 'home_score': 'score'})
        try:
            if game_df.loc[0,'away_score'] > game_df.loc[0,'home_score']:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [1], 'game_lost' : [0]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [1]}),left_index = True, right_index = True)
            elif game_df.loc[0,'away_score'] < game_df.loc[0,'home_score']:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [1]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [1], 'game_lost' : [0]}),left_index = True, right_index = True)
            else: 
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [0]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [0]}),left_index = True, right_index = True)
        except TypeError:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [np.nan], 'game_lost' : [np.nan]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [np.nan], 'game_lost' : [np.nan]}),left_index = True, right_index = True)        

        # Creating the away_team & home_team stats dataframe. Grabbing the selected stats and then renaming them to match home == away dataframe column names.
        away_stats_df = game_stats.dataframe[['away_first_downs', 'away_fourth_down_attempts',
               'away_fourth_down_conversions', 'away_fumbles', 'away_fumbles_lost',
               'away_interceptions', 'away_net_pass_yards', 'away_pass_attempts',
               'away_pass_completions', 'away_pass_touchdowns', 'away_pass_yards',
               'away_penalties', 'away_points', 'away_rush_attempts',
               'away_rush_touchdowns', 'away_rush_yards', 'away_third_down_attempts',
               'away_third_down_conversions', 'away_time_of_possession',
               'away_times_sacked', 'away_total_yards', 'away_turnovers',
               'away_yards_from_penalties', 'away_yards_lost_from_sacks']].reset_index().drop(columns ='index').rename(columns = {
               'away_first_downs': 'first_downs', 'away_fourth_down_attempts':'fourth_down_attempts',
               'away_fourth_down_conversions':'fourth_down_conversions' , 'away_fumbles': 'fumbles', 'away_fumbles_lost': 'fumbles_lost',
               'away_interceptions': 'interceptions', 'away_net_pass_yards':'net_pass_yards' , 'away_pass_attempts': 'pass_attempts',
               'away_pass_completions':'pass_completions' , 'away_pass_touchdowns': 'pass_touchdowns', 'away_pass_yards': 'pass_yards',
               'away_penalties': 'penalties', 'away_points': 'points', 'away_rush_attempts': 'rush_attempts',
               'away_rush_touchdowns': 'rush_touchdowns', 'away_rush_yards': 'rush_yards', 'away_third_down_attempts': 'third_down_attempts',
               'away_third_down_conversions': 'third_down_conversions', 'away_time_of_possession': 'time_of_possession',
               'away_times_sacked': 'times_sacked', 'away_total_yards': 'total_yards', 'away_turnovers': 'turnovers',
               'away_yards_from_penalties':'yards_from_penalties', 'away_yards_lost_from_sacks': 'yards_lost_from_sacks'})

        home_stats_df = game_stats.dataframe[['home_first_downs', 'home_fourth_down_attempts',
               'home_fourth_down_conversions', 'home_fumbles', 'home_fumbles_lost',
               'home_interceptions', 'home_net_pass_yards', 'home_pass_attempts',
               'home_pass_completions', 'home_pass_touchdowns', 'home_pass_yards',
               'home_penalties', 'home_points', 'home_rush_attempts',
               'home_rush_touchdowns', 'home_rush_yards', 'home_third_down_attempts',
               'home_third_down_conversions', 'home_time_of_possession',
               'home_times_sacked', 'home_total_yards', 'home_turnovers',
               'home_yards_from_penalties', 'home_yards_lost_from_sacks']].reset_index().drop(columns = 'index').rename(columns = {
               'home_first_downs': 'first_downs', 'home_fourth_down_attempts':'fourth_down_attempts',
               'home_fourth_down_conversions':'fourth_down_conversions' , 'home_fumbles': 'fumbles', 'home_fumbles_lost': 'fumbles_lost',
               'home_interceptions': 'interceptions', 'home_net_pass_yards':'net_pass_yards' , 'home_pass_attempts': 'pass_attempts',
               'home_pass_completions':'pass_completions' , 'home_pass_touchdowns': 'pass_touchdowns', 'home_pass_yards': 'pass_yards',
               'home_penalties': 'penalties', 'home_points': 'points', 'home_rush_attempts': 'rush_attempts',
               'home_rush_touchdowns': 'rush_touchdowns', 'home_rush_yards': 'rush_yards', 'home_third_down_attempts': 'third_down_attempts',
               'home_third_down_conversions': 'third_down_conversions', 'home_time_of_possession': 'time_of_possession',
               'home_times_sacked': 'times_sacked', 'home_total_yards': 'total_yards', 'home_turnovers': 'turnovers',
               'home_yards_from_penalties':'yards_from_penalties', 'home_yards_lost_from_sacks': 'yards_lost_from_sacks'})
        
        # Merge the team_df & stats_df for both home & away teams. Set the left_index & right_index to True so that both dataframes merge on the same indices. 
        away_team_df = pd.merge(away_team_df, away_stats_df,left_index = True, right_index = True)
        home_team_df = pd.merge(home_team_df, home_stats_df,left_index = True, right_index = True)
        try:
            # Converting time_of_possession from MM:SS format into seconds(int). 
            away_team_df['time_of_possession'] = (int(away_team_df['time_of_possession'].loc[0][0:2]) * 60) + int(away_team_df['time_of_possession'].loc[0][3:5])
            home_team_df['time_of_possession'] = (int(home_team_df['time_of_possession'].loc[0][0:2]) * 60) + int(home_team_df['time_of_possession'].loc[0][3:5])
        except TypeError:
            away_team_df['time_of_possession'] = np.nan
            home_team_df['time_of_possession'] = np.nan
    except TypeError:
        away_team_df = pd.DataFrame()
        home_team_df = pd.DataFrame()
    return away_team_df, home_team_df


def game_data_up_to_week(weeks,year):
    weeks_games_df = pd.DataFrame()
    for w in range(len(weeks)):
        date_string = str(weeks[w]) + '-' + str(year)
        week_scores = Boxscores(weeks[w],year)
        week_games_df = pd.DataFrame()
        for g in range(len(week_scores.games[date_string])):
            game_str = week_scores.games[date_string][g]['boxscore']
            game_stats = Boxscore(game_str)
            game_df = pd.DataFrame(week_scores.games[date_string][g], index = [0])
            away_team_df, home_team_df = game_data(game_df,game_stats)
            away_team_df['week'] = weeks[w]
            home_team_df['week'] = weeks[w]
            week_games_df = pd.concat([week_games_df,away_team_df])
            week_games_df = pd.concat([week_games_df,home_team_df])
        weeks_games_df = pd.concat([weeks_games_df,week_games_df])
    return weeks_games_df

def get_schedule(year):
    weeks = list(range(1,19))
    schedule_df = pd.DataFrame()
    for w in range(len(weeks)):
        date_string = str(weeks[w]) + '-' + str(year)
        week_scores = Boxscores(weeks[w],year)
        week_games_df = pd.DataFrame()
        for g in range(len(week_scores.games[date_string])):
            game = pd.DataFrame(week_scores.games[date_string][g], index = [0])[['away_name', 'away_abbr','home_name', 'home_abbr','winning_name', 'winning_abbr' ]]
            game['week'] = weeks[w]
            week_games_df = pd.concat([week_games_df,game])
        schedule_df = pd.concat([schedule_df, week_games_df]).reset_index().drop(columns = 'index') 
    return schedule_df 

def agg_weekly_data(schedule_df,weeks_games_df,current_week,weeks):
    schedule_df = schedule_df[schedule_df.week < current_week]
    agg_games_df = pd.DataFrame()
    for w in range(1,len(weeks)):
        games_df = schedule_df[schedule_df.week == weeks[w]]
        agg_weekly_df = weeks_games_df[weeks_games_df.week < weeks[w]].drop(columns = ['score','week','game_won', 'game_lost']).groupby(by=["team_name", "team_abbr"]).mean().reset_index()
        win_loss_df = weeks_games_df[weeks_games_df.week < weeks[w]][["team_name", "team_abbr",'game_won', 'game_lost']].groupby(by=["team_name", "team_abbr"]).sum().reset_index()
        win_loss_df['win_perc'] = win_loss_df['game_won'] / (win_loss_df['game_won'] + win_loss_df['game_lost'])
        win_loss_df = win_loss_df.drop(columns = ['game_won', 'game_lost'])

        try:
            agg_weekly_df['fourth_down_perc'] = agg_weekly_df['fourth_down_conversions'] / agg_weekly_df['fourth_down_attempts']  
        except ZeroDivisionError:
            agg_weekly_df['fourth_down_perc'] = 0 
        agg_weekly_df['fourth_down_perc'] = agg_weekly_df['fourth_down_perc'].fillna(0)

        try:
            agg_weekly_df['third_down_perc'] = agg_weekly_df['third_down_conversions'] / agg_weekly_df['third_down_attempts']  
        except ZeroDivisionError:
            agg_weekly_df['third_down_perc'] = 0
        agg_weekly_df['third_down_perc'] = agg_weekly_df['third_down_perc'].fillna(0)  

        agg_weekly_df = agg_weekly_df.drop(columns = ['fourth_down_attempts', 'fourth_down_conversions', 'third_down_attempts', 'third_down_conversions'])
        agg_weekly_df = pd.merge(win_loss_df,agg_weekly_df,left_on = ['team_name', 'team_abbr'], right_on = ['team_name', 'team_abbr'])

        away_df = pd.merge(games_df,agg_weekly_df,how = 'inner', left_on = ['away_name', 'away_abbr'], right_on = ['team_name', 'team_abbr']).drop(columns = ['team_name', 'team_abbr']).rename(columns = {
                'win_perc': 'away_win_perc',
               'first_downs': 'away_first_downs', 'fumbles': 'away_fumbles', 'fumbles_lost':'away_fumbles_lost', 'interceptions':'away_interceptions',
               'net_pass_yards': 'away_net_pass_yards', 'pass_attempts':'away_pass_attempts', 'pass_completions':'away_pass_completions',
               'pass_touchdowns':'away_pass_touchdowns', 'pass_yards':'away_pass_yards', 'penalties':'away_penalties', 'points':'away_points', 'rush_attempts':'away_rush_attempts',
               'rush_touchdowns':'away_rush_touchdowns', 'rush_yards':'away_rush_yards', 'time_of_possession':'away_time_of_possession', 'times_sacked':'away_times_sacked',
               'total_yards':'away_total_yards', 'turnovers':'away_turnovers', 'yards_from_penalties':'away_yards_from_penalties',
               'yards_lost_from_sacks': 'away_yards_lost_from_sacks', 'fourth_down_perc':'away_fourth_down_perc', 'third_down_perc':'away_third_down_perc'})

        home_df = pd.merge(games_df,agg_weekly_df,how = 'inner', left_on = ['home_name', 'home_abbr'], right_on = ['team_name', 'team_abbr']).drop(columns = ['team_name', 'team_abbr']).rename(columns = {
                'win_perc': 'home_win_perc',
               'first_downs': 'home_first_downs', 'fumbles': 'home_fumbles', 'fumbles_lost':'home_fumbles_lost', 'interceptions':'home_interceptions',
               'net_pass_yards': 'home_net_pass_yards', 'pass_attempts':'home_pass_attempts', 'pass_completions':'home_pass_completions',
               'pass_touchdowns':'home_pass_touchdowns', 'pass_yards':'home_pass_yards', 'penalties':'home_penalties', 'points':'home_points', 'rush_attempts':'home_rush_attempts',
               'rush_touchdowns':'home_rush_touchdowns', 'rush_yards':'home_rush_yards', 'time_of_possession':'home_time_of_possession', 'times_sacked':'home_times_sacked',
               'total_yards':'home_total_yards', 'turnovers':'home_turnovers', 'yards_from_penalties':'home_yards_from_penalties',
               'yards_lost_from_sacks': 'home_yards_lost_from_sacks', 'fourth_down_perc':'home_fourth_down_perc', 'third_down_perc':'home_third_down_perc'})

        agg_weekly_df = pd.merge(away_df,home_df,left_on = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name',
               'winning_abbr', 'week'], right_on = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name',
               'winning_abbr', 'week'])

        agg_weekly_df['win_perc_dif'] = agg_weekly_df['away_win_perc'] - agg_weekly_df['home_win_perc']
        agg_weekly_df['first_downs_dif'] = agg_weekly_df['away_first_downs'] - agg_weekly_df['home_first_downs']
        agg_weekly_df['fumbles_dif'] = agg_weekly_df['away_fumbles'] - agg_weekly_df['home_fumbles']
        agg_weekly_df['interceptions_dif'] = agg_weekly_df['away_interceptions'] - agg_weekly_df['home_interceptions']
        agg_weekly_df['net_pass_yards_dif'] = agg_weekly_df['away_net_pass_yards'] - agg_weekly_df['home_net_pass_yards']
        agg_weekly_df['pass_attempts_dif'] = agg_weekly_df['away_pass_attempts'] - agg_weekly_df['home_pass_attempts']
        agg_weekly_df['pass_completions_dif'] = agg_weekly_df['away_pass_completions'] - agg_weekly_df['home_pass_completions']
        agg_weekly_df['pass_touchdowns_dif'] = agg_weekly_df['away_pass_touchdowns'] - agg_weekly_df['home_pass_touchdowns']
        agg_weekly_df['pass_yards_dif'] = agg_weekly_df['away_pass_yards'] - agg_weekly_df['home_pass_yards']
        agg_weekly_df['penalties_dif'] = agg_weekly_df['away_penalties'] - agg_weekly_df['home_penalties']
        agg_weekly_df['points_dif'] = agg_weekly_df['away_points'] - agg_weekly_df['home_points']
        agg_weekly_df['rush_attempts_dif'] = agg_weekly_df['away_rush_attempts'] - agg_weekly_df['home_rush_attempts']
        agg_weekly_df['rush_touchdowns_dif'] = agg_weekly_df['away_rush_touchdowns'] - agg_weekly_df['home_rush_touchdowns']
        agg_weekly_df['rush_yards_dif'] = agg_weekly_df['away_rush_yards'] - agg_weekly_df['home_rush_yards']
        agg_weekly_df['time_of_possession_dif'] = agg_weekly_df['away_time_of_possession'] - agg_weekly_df['home_time_of_possession']
        agg_weekly_df['times_sacked_dif'] = agg_weekly_df['away_times_sacked'] - agg_weekly_df['home_times_sacked']
        agg_weekly_df['total_yards_dif'] = agg_weekly_df['away_total_yards'] - agg_weekly_df['home_total_yards']
        agg_weekly_df['turnovers_dif'] = agg_weekly_df['away_turnovers'] - agg_weekly_df['home_turnovers']
        agg_weekly_df['yards_from_penalties_dif'] = agg_weekly_df['away_yards_from_penalties'] - agg_weekly_df['home_yards_from_penalties']
        agg_weekly_df['yards_lost_from_sacks_dif'] = agg_weekly_df['away_yards_lost_from_sacks'] - agg_weekly_df['home_yards_lost_from_sacks']
        agg_weekly_df['fourth_down_perc_dif'] = agg_weekly_df['away_fourth_down_perc'] - agg_weekly_df['home_fourth_down_perc']
        agg_weekly_df['third_down_perc_dif'] = agg_weekly_df['away_third_down_perc'] - agg_weekly_df['home_third_down_perc']

        agg_weekly_df = agg_weekly_df.drop(columns = ['away_win_perc',
               'away_first_downs', 'away_fumbles', 'away_fumbles_lost', 'away_interceptions',
               'away_net_pass_yards', 'away_pass_attempts','away_pass_completions',
               'away_pass_touchdowns', 'away_pass_yards', 'away_penalties', 'away_points', 'away_rush_attempts',
               'away_rush_touchdowns', 'away_rush_yards', 'away_time_of_possession', 'away_times_sacked',
               'away_total_yards', 'away_turnovers', 'away_yards_from_penalties',
               'away_yards_lost_from_sacks','away_fourth_down_perc', 'away_third_down_perc','home_win_perc',
               'home_first_downs', 'home_fumbles', 'home_fumbles_lost', 'home_interceptions',
               'home_net_pass_yards', 'home_pass_attempts','home_pass_completions',
               'home_pass_touchdowns', 'home_pass_yards', 'home_penalties', 'home_points', 'home_rush_attempts',
               'home_rush_touchdowns', 'home_rush_yards', 'home_time_of_possession', 'home_times_sacked',
               'home_total_yards', 'home_turnovers', 'home_yards_from_penalties',
               'home_yards_lost_from_sacks','home_fourth_down_perc', 'home_third_down_perc'])
        
        if (agg_weekly_df['winning_name'].isnull().values.any() and weeks[w] > 3):
            agg_weekly_df['result'] = np.nan
            print(f"Week {weeks[w]} games have not finished yet.")
        else:
            agg_weekly_df['result'] = agg_weekly_df['winning_name'] == agg_weekly_df['away_name']
            agg_weekly_df['result'] = agg_weekly_df['result'].astype('float')
        agg_weekly_df = agg_weekly_df.drop(columns = ['winning_name', 'winning_abbr'])
        agg_games_df = pd.concat([agg_games_df, agg_weekly_df])
    agg_games_df = agg_games_df.reset_index().drop(columns = 'index')
    # What is .drop(index = 20) doing?
    agg_games_df = agg_games_df.drop(index = 20, axis=0)
    return agg_games_df

def get_elo():
    elo_df = pd.read_csv('Resources/nfl_elo_2020.csv')
    elo_df = elo_df.drop(columns = ['season','neutral' ,'playoff', 'elo_prob1', 'score1', 'score2','result1'])
    elo_df.date = pd.to_datetime(elo_df.date)
    elo_df = elo_df[elo_df.date < '01-05-2022']

    elo_df['team1'] = elo_df['team1'].replace(['KC', 'JAX', 'CAR', 'BAL', 'BUF', 'MIN', 'DET', 'ATL', 'NE', 'WSH',
           'CIN', 'NO', 'SF', 'LAR', 'NYG', 'DEN', 'CLE', 'IND', 'TEN', 'NYJ',
           'TB', 'MIA', 'PIT', 'PHI', 'GB', 'CHI', 'DAL', 'ARI', 'LAC', 'HOU',
           'SEA', 'OAK'],
            ['kan','jax','car', 'rav', 'buf', 'min', 'det', 'atl', 'nwe', 'was', 
            'cin', 'nor', 'sfo', 'ram', 'nyg', 'den', 'cle', 'clt', 'oti', 'nyj', 
             'tam','mia', 'pit', 'phi', 'gnb', 'chi', 'dal', 'crd', 'sdg', 'htx', 'sea', 'rai' ])
    elo_df['team2'] = elo_df['team2'].replace(['KC', 'JAX', 'CAR', 'BAL', 'BUF', 'MIN', 'DET', 'ATL', 'NE', 'WSH',
           'CIN', 'NO', 'SF', 'LAR', 'NYG', 'DEN', 'CLE', 'IND', 'TEN', 'NYJ',
           'TB', 'MIA', 'PIT', 'PHI', 'GB', 'CHI', 'DAL', 'ARI', 'LAC', 'HOU',
           'SEA', 'OAK'],
            ['kan','jax','car', 'rav', 'buf', 'min', 'det', 'atl', 'nwe', 'was', 
            'cin', 'nor', 'sfo', 'ram', 'nyg', 'den', 'cle', 'clt', 'oti', 'nyj', 
             'tam','mia', 'pit', 'phi', 'gnb', 'chi', 'dal', 'crd', 'sdg', 'htx', 'sea', 'rai' ])
    return elo_df

def merge_rankings(agg_games_df,elo_df):
    agg_games_df = pd.merge(agg_games_df, elo_df, how = 'inner', left_on = ['home_abbr', 'away_abbr'], right_on = ['team1', 'team2']).drop(columns = ['date','team1', 'team2'])
    agg_games_df['elo_dif'] = agg_games_df['elo2'] - agg_games_df['elo1']
    agg_games_df = agg_games_df.drop(columns = ['elo1', 'elo2'])
    return agg_games_df


def prep_test_train(current_week,weeks,year):
    current_week = current_week + 1
    schedule_df  = get_schedule(year)
    weeks_games_df = game_data_up_to_week(weeks,year)
    agg_games_df = agg_weekly_data(schedule_df,weeks_games_df,current_week,weeks)
    elo_df = get_elo()
    agg_games_df = merge_rankings(agg_games_df, elo_df)
    train_df = agg_games_df[agg_games_df.result.notna()]
    current_week = current_week - 1
    test_df = agg_games_df[agg_games_df.week == current_week]
    return test_df, train_df

def display(y_pred,X_test):
    for g in range(len(y_pred)):
        win_prob = round(y_pred[g],2)
        away_team = X_test.reset_index().drop(columns = 'index').loc[g,'away_name']
        home_team = X_test.reset_index().drop(columns = 'index').loc[g,'home_name']
        print(f'The {away_team} have a probability of {win_prob} of beating the {home_team}.')
    

In [8]:
#game = game_data_up_to_week(range(1,4),2021)
#game


In [31]:
current_week = 16
weeks = list(range(1,current_week + 1))
year = 2020

pred_games_df, comp_games_df = prep_test_train(current_week,weeks,year)

In [32]:
pred_games_df

Unnamed: 0,away_name,away_abbr,home_name,home_abbr,week,win_perc_dif,first_downs_dif,fumbles_dif,interceptions_dif,net_pass_yards_dif,pass_attempts_dif,pass_completions_dif,pass_touchdowns_dif,pass_yards_dif,penalties_dif,points_dif,rush_attempts_dif,rush_touchdowns_dif,rush_yards_dif,time_of_possession_dif,times_sacked_dif,total_yards_dif,turnovers_dif,yards_from_penalties_dif,yards_lost_from_sacks_dif,fourth_down_perc_dif,third_down_perc_dif,result,elo_dif
209,Minnesota Vikings,min,New Orleans Saints,nor,16,-0.285714,1.142857,-0.5,0.5,7.428571,-1.928571,-2.357143,0.285714,10.285714,-1.071429,-2.642857,0.071429,-0.428571,15.928571,-107.285714,0.5,23.357143,0.571429,-24.357143,2.857143,-0.116541,-0.02186,0.0,-178.963705
210,Tampa Bay Buccaneers,tam,Detroit Lions,det,16,0.285714,-0.785714,0.0,0.0,3.928571,1.642857,1.428571,0.571429,-4.714286,-0.285714,4.714286,-0.642857,-0.071429,1.214286,9.928571,-1.285714,5.142857,-0.071429,-4.642857,-8.642857,0.083333,0.004993,1.0,238.516153
211,San Francisco 49ers,sfo,Arizona Cardinals,crd,16,-0.214286,-2.5,-0.071429,0.428571,7.142857,0.928571,-0.285714,-0.357143,14.428571,-1.714286,-4.142857,-3.642857,-0.214286,-35.214286,135.0,0.785714,-28.071429,0.785714,-9.357143,7.285714,-0.327068,-0.019235,1.0,-58.793097
212,Miami Dolphins,mia,Las Vegas Raiders,rai,16,0.142857,-1.0,-0.428571,0.142857,-31.785714,-1.0,-1.142857,-0.285714,-29.142857,-0.928571,-1.785714,-1.928571,-0.285714,-15.571429,-37.0,0.571429,-47.357143,-0.357143,-11.857143,2.642857,0.068627,-0.086804,1.0,101.109481
213,New York Giants,nyg,Baltimore Ravens,rav,16,-0.285714,-1.928571,-0.428571,0.0,11.785714,6.428571,3.642857,-0.928571,17.142857,-1.285714,-11.357143,-7.285714,-0.642857,-59.214286,-91.857143,0.714286,-47.428571,0.285714,-20.928571,5.357143,0.037152,-0.065835,0.0,-255.776182
214,Indianapolis Colts,clt,Pittsburgh Steelers,pit,16,-0.071429,3.0,-1.071429,-0.071429,20.642857,-5.142857,-2.5,-0.571429,20.071429,0.785714,2.357143,3.928571,0.357143,25.142857,-25.857143,0.285714,45.785714,-0.357143,12.428571,-0.571429,0.14,-0.018493,0.0,38.596584
215,Cleveland Browns,cle,New York Jets,nyj,16,0.642857,5.785714,0.857143,-0.285714,54.571429,-0.785714,1.285714,1.0,42.071429,-0.071429,11.571429,7.071429,0.642857,49.714286,227.071429,-1.428571,104.285714,-0.214286,-4.571429,-12.5,0.028011,0.112625,0.0,241.029053
216,Atlanta Falcons,atl,Kansas City Chiefs,kan,16,-0.642857,-2.714286,-0.571429,0.428571,-37.928571,-0.285714,-1.5,-0.928571,-32.428571,-1.571429,-5.714286,-0.357143,0.0,-22.571429,-49.785714,0.928571,-60.5,0.142857,-11.857143,5.5,-0.050167,-0.057603,0.0,-285.037103
217,Chicago Bears,chi,Jacksonville Jaguars,jax,16,0.428571,0.571429,0.071429,0.0,-8.714286,-0.071429,0.571429,0.214286,-5.357143,-0.785714,2.857143,1.785714,-0.071429,1.5,114.5,-0.214286,-7.214286,-0.214286,-12.928571,3.357143,0.208333,-0.07965,1.0,227.302318
218,Cincinnati Bengals,cin,Houston Texans,htx,16,-0.054945,0.285714,0.214286,0.214286,-59.0,3.5,0.5,-0.714286,-54.285714,0.5,-3.142857,3.5,0.214286,10.571429,251.642857,0.214286,-48.428571,0.428571,5.857143,4.714286,0.186667,-0.06374,1.0,-81.564347


In [33]:
comp_games_df

Unnamed: 0,away_name,away_abbr,home_name,home_abbr,week,win_perc_dif,first_downs_dif,fumbles_dif,interceptions_dif,net_pass_yards_dif,pass_attempts_dif,pass_completions_dif,pass_touchdowns_dif,pass_yards_dif,penalties_dif,points_dif,rush_attempts_dif,rush_touchdowns_dif,rush_yards_dif,time_of_possession_dif,times_sacked_dif,total_yards_dif,turnovers_dif,yards_from_penalties_dif,yards_lost_from_sacks_dif,fourth_down_perc_dif,third_down_perc_dif,result,elo_dif
0,Cincinnati Bengals,cin,Cleveland Browns,cle,2,0.000000,-1.000000,-1.000000,0.000000,5.000000,-3.000000,2.000000,-1.000000,4.000000,-1.000000,7.000000,1.000000,1.000000,-16.000000,-12.000000,1.000000,-11.000000,-1.000000,-36.000000,-1.000000,0.000000,0.178571,0.0,-59.457910
1,Minnesota Vikings,min,Indianapolis Colts,clt,2,0.000000,-2.000000,0.000000,-1.000000,-109.000000,-21.000000,-17.000000,1.000000,-104.000000,0.000000,14.000000,0.000000,1.000000,46.000000,-879.000000,1.000000,-63.000000,-1.000000,-60.000000,5.000000,-0.333333,0.166667,0.0,80.630236
2,New York Giants,nyg,Chicago Bears,chi,2,-1.000000,-6.000000,-1.000000,2.000000,48.000000,5.000000,6.000000,-1.000000,37.000000,-2.000000,-11.000000,-8.000000,0.000000,-120.000000,-49.000000,2.000000,-72.000000,2.000000,-23.000000,-11.000000,0.500000,0.351515,0.0,-166.404690
3,Carolina Panthers,car,Tampa Bay Buccaneers,tam,2,0.000000,-1.000000,-2.000000,-2.000000,35.000000,-2.000000,-1.000000,-1.000000,30.000000,-4.000000,7.000000,4.000000,1.000000,43.000000,11.000000,-2.000000,78.000000,-3.000000,-38.000000,-5.000000,-0.500000,0.153846,0.0,-87.215651
4,Denver Broncos,den,Pittsburgh Steelers,pit,2,-1.000000,0.000000,1.000000,0.000000,8.000000,1.000000,1.000000,-2.000000,-13.000000,2.000000,-12.000000,-4.000000,1.000000,-34.000000,-264.000000,-2.000000,-26.000000,0.000000,35.000000,-21.000000,-1.000000,-0.200000,0.0,-45.328094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,Los Angeles Rams,ram,Seattle Seahawks,sea,16,-0.071429,-0.642857,0.214286,-0.071429,-0.714286,1.000000,-0.142857,-1.214286,-8.642857,-1.000000,-4.857143,3.857143,0.428571,2.500000,62.000000,-1.500000,1.785714,0.285714,-1.714286,-7.928571,-0.038462,0.032292,0.0,-4.964735
222,Los Angeles Rams,ram,Seattle Seahawks,sea,16,-0.071429,-0.642857,0.214286,-0.071429,-0.714286,1.000000,-0.142857,-1.214286,-8.642857,-1.000000,-4.857143,3.857143,0.428571,2.500000,62.000000,-1.500000,1.785714,0.285714,-1.714286,-7.928571,-0.038462,0.032292,0.0,-40.911725
223,Philadelphia Eagles,phi,Dallas Cowboys,dal,16,-0.049451,-1.928571,0.500000,0.357143,-49.785714,-2.571429,-4.428571,-0.071429,-44.500000,0.285714,-2.571429,-1.714286,0.071429,16.357143,81.642857,1.642857,-33.428571,-0.071429,-1.500000,5.285714,-0.113978,-0.035035,0.0,18.185917
224,Tennessee Titans,oti,Green Bay Packers,gnb,16,-0.071429,1.714286,0.214286,0.071429,-22.142857,-3.214286,-3.214286,-0.642857,-24.214286,-0.214286,0.142857,4.714286,0.571429,31.500000,-236.428571,0.142857,9.357143,0.000000,4.571429,-2.071429,0.082707,-0.043704,0.0,-45.157679


In [12]:
train_df = comp_games_df
test_df = pred_games_df

X_train = train_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_train = train_df[['result']] 
X_test = test_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_test = test_df[['result']]

In [13]:
from sklearn.linear_model import LogisticRegression

# Figure out what these Logistic Regression settings do.
# penalty='l1' - Imposes a penalty to the logistic model for having too many variables. Shrinks the coefficients of less contributive variables. Results in more sparse data.
# dual=False - Duality vs Primal
# tol=0.001 - Tolerance for stopping criteria. This tells scikit to stop searching for a minimum (or maximum) once some tolerance is achieved, i.e. once you're close enough.
# fit_intercept=True - Sets te y-intercept to the line of best fit (False would keep it as a y-intercept of 0.)
# intercept_scaling=1 - This is the default. description is over my head.
# class_weight = 'balanced' - “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data
clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

clf.fit(X_train, np.ravel(y_train.values))
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]

display(y_pred,test_df)

The Minnesota Vikings have a probability of 0.31 of beating the New Orleans Saints.
The Tampa Bay Buccaneers have a probability of 0.81 of beating the Detroit Lions.
The San Francisco 49ers have a probability of 0.4 of beating the Arizona Cardinals.
The Miami Dolphins have a probability of 0.49 of beating the Las Vegas Raiders.
The New York Giants have a probability of 0.21 of beating the Baltimore Ravens.
The Indianapolis Colts have a probability of 0.39 of beating the Pittsburgh Steelers.
The Cleveland Browns have a probability of 0.85 of beating the New York Jets.
The Atlanta Falcons have a probability of 0.14 of beating the Kansas City Chiefs.
The Chicago Bears have a probability of 0.73 of beating the Jacksonville Jaguars.
The Cincinnati Bengals have a probability of 0.5 of beating the Houston Texans.
The Carolina Panthers have a probability of 0.62 of beating the Washington Football Team.
The Denver Broncos have a probability of 0.51 of beating the Los Angeles Chargers.
The Los A

In [14]:
y_pred

array([0.31150194, 0.8115242 , 0.40206155, 0.48965883, 0.20737741,
       0.39465738, 0.84516588, 0.1353775 , 0.72861784, 0.4966973 ,
       0.62109314, 0.50820165, 0.50940529, 0.48667397, 0.45526414,
       0.26197437, 0.8468588 ])

In [27]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,np.round(y_pred))

0.6470588235294118

In [26]:
# Create dataframe to hold prediction results
results_df = test_df[['week','away_name','home_name']]
results_df['away_team_win_%'] = y_pred
results_df['home_team_win_%'] = 1 - results_df['away_team_win_%']
results_df['predicted_winner'] = 0

# Loop through results dataframe and add predicted_winner name to column
for index, row in results_df.iterrows():
    if results_df.loc[index,'away_team_win_%'] > results_df.loc[index,'home_team_win_%']:
        results_df.loc[index,'predicted_winner'] = results_df.loc[index,'away_name']
    else:
        results_df.loc[index,'predicted_winner'] = results_df.loc[index,'home_name']

# Set Index to the 'week' column
results_df=results_df.set_index('week')      

# Use get_schedule function to pull dataframe that shows actual weekly winners. Includes all weeks.
weekly_winner = get_schedule(year)
weekly_winner = weekly_winner.drop(columns=['away_abbr','home_abbr','winning_abbr'])
weekly_winner = weekly_winner.rename(columns={'away_name':'ww_away_name','home_name':'ww_home_name'})

# Slices weekly_winner dataframe based on current week model is predicting.
current_week_wins = weekly_winner.loc[weekly_winner['week'] == current_week]
current_week_wins = current_week_wins.set_index('week')

# Sort both dataframes by away_name
results_df = results_df.sort_values(by='away_name')
current_week_wins = current_week_wins.sort_values(by='ww_away_name')

# Combine results & weekly_winner df's. Drop dupe columns and rename winning_name to 'actual_winner'
# The below code was working in prior workbook but now is popping an error. Need to troubleshoot.
actual_results_df = pd.concat([results_df,current_week_wins])
#.drop(columns=['ww_away_name','ww_home_name']).rename(columns={'winning_name':'actual_winner'})
actual_results_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

Unnamed: 0_level_0,away_name,home_name,away_team_win_%,home_team_win_%,predicted_winner,ww_away_name,ww_home_name,winning_name
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16,Atlanta Falcons,Kansas City Chiefs,0.135377,0.864623,Kansas City Chiefs,,,
16,Buffalo Bills,New England Patriots,0.846859,0.153141,Buffalo Bills,,,
16,Carolina Panthers,Washington Football Team,0.621093,0.378907,Carolina Panthers,,,
16,Chicago Bears,Jacksonville Jaguars,0.728618,0.271382,Chicago Bears,,,
16,Cincinnati Bengals,Houston Texans,0.496697,0.503303,Houston Texans,,,
16,Cleveland Browns,New York Jets,0.845166,0.154834,Cleveland Browns,,,
16,Denver Broncos,Los Angeles Chargers,0.508202,0.491798,Denver Broncos,,,
16,Indianapolis Colts,Pittsburgh Steelers,0.394657,0.605343,Pittsburgh Steelers,,,
16,Los Angeles Rams,Seattle Seahawks,0.509405,0.490595,Los Angeles Rams,,,
16,Los Angeles Rams,Seattle Seahawks,0.486674,0.513326,Seattle Seahawks,,,


In [11]:
# Export actual_results_df to csv based on the current_week predicted.
file_path = str(f'Exports/week_{current_week}_base_results.csv')
actual_results_df.to_csv(file_path)

In [11]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,np.round(y_pred))

0.6666666666666666

In [10]:
import xgboost as xgb

dtest = xgb.DMatrix(X_test, y_test, feature_names=X_test.columns)
dtrain = xgb.DMatrix(X_train, y_train,feature_names=X_train.columns)

param = {'verbosity':1, 
         'objective':'binary:hinge',
         'feature_selector': 'shuffle',
         'booster':'gblinear',
         'eval_metric' :'error',
         'learning_rate': 0.05}

evallist = [(dtrain, 'train'), (dtest, 'test')]

ModuleNotFoundError: No module named 'xgboost'

In [None]:
num_round = 300
bst = xgb.train(param, dtrain, num_round, evallist)

In [None]:
X_test = pred_games_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]

display(y_pred,pred_games_df)