[link text](https:// [link text](https://))

Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

Scraping/Merging Data

In [None]:
import pandas as pd

# Load data
nfl_2024 = pd.read_csv('/content/nfl_2024_new.csv')
nfl_2024_scores = pd.read_csv('/content/nfl_2024_scores.csv')

# Merge play-by-play and scores data
merged_2024 = nfl_2024.merge(nfl_2024_scores, left_on=['Date', 'OffenseTeam', 'DefenseTeam'], right_on=['Date', 'Visitor', 'Home'], how='left') \
    .merge(nfl_2024_scores, left_on=['Date', 'OffenseTeam', 'DefenseTeam'], right_on=['Date', 'Home', 'Visitor'], how='left', suffixes=('', '_reverse')) \
    .assign(**{column: lambda df, column=column: df[column].combine_first(df[column + '_reverse']) for column in ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']}) \
    .drop(columns=[col + '_reverse' for col in ['Week', 'Visitor', 'VisitorScore', 'Home', 'HomeScore', 'OT']])

# Add HomeWon column
merged_2024['HomeWon'] = merged_2024['HomeScore'] > merged_2024['VisitorScore']

# Display first few rows of updated data for verification
merged_2024[['Date', 'Home', 'Visitor', 'HomeScore', 'VisitorScore', 'HomeWon']].head(50)

Unnamed: 0,Date,Home,Visitor,HomeScore,VisitorScore,HomeWon
0,12/29/2024,MIN,GB,27.0,25.0,True
1,12/29/2024,MIN,GB,27.0,25.0,True
2,12/29/2024,MIN,GB,27.0,25.0,True
3,12/29/2024,MIN,GB,27.0,25.0,True
4,12/29/2024,MIN,GB,27.0,25.0,True
5,12/29/2024,MIN,GB,27.0,25.0,True
6,12/29/2024,TB,CAR,48.0,14.0,True
7,12/29/2024,TB,CAR,48.0,14.0,True
8,12/29/2024,TB,CAR,48.0,14.0,True
9,12/29/2024,TB,CAR,48.0,14.0,True


Team Feature Extraction

In [43]:
import pandas as pd
import numpy as np

# Assume merged_2024 already contains the required data for 2024
# merged_2024 contains columns such as 'Home', 'Visitor', 'HomeScore', 'VisitorScore', 'HomeWon'
# Load the dataset containing the upcoming games schedule.
upcoming_games = pd.read_csv('/content/upcoming_games_2025_week1.csv')

# 1. Average Points Scored
# Calculate the average points scored by the home and visitor teams.
avg_points_scored_home = merged_2024.groupby('Home')['HomeScore'].mean()
avg_points_scored_visitor = merged_2024.groupby('Visitor')['VisitorScore'].mean()

# 2. Average Points Allowed
# Calculate the average points allowed by the home and visitor teams.
avg_points_allowed_home = merged_2024.groupby('Home')['VisitorScore'].mean()
avg_points_allowed_visitor = merged_2024.groupby('Visitor')['HomeScore'].mean()

# Calculate the overall average points scored and allowed by combining the home and visitor averages.
overall_avg_points_scored = (avg_points_scored_home + avg_points_scored_visitor) / 2
overall_avg_points_allowed = (avg_points_allowed_home + avg_points_allowed_visitor) / 2

# 3. Win Rate
# Calculate the total number of wins for home and visitor teams.
home_wins = merged_2024.groupby('Home')['HomeWon'].sum()
visitor_wins = merged_2024.groupby('Visitor').apply(lambda x: len(x) - x['HomeWon'].sum())

# Calculate the total number of games played by each team as home and visitor.
total_games_home = merged_2024['Home'].value_counts()
total_games_visitor = merged_2024['Visitor'].value_counts()

# Calculate the overall number of wins and total games played by each team.
overall_wins = home_wins + visitor_wins
total_games = total_games_home + total_games_visitor

# Calculate the win rate for each team.
win_rate = overall_wins / total_games

# Create a new data frame to store the features for each team.
team_features = pd.DataFrame({
    'AvgPointsScored': overall_avg_points_scored,
    'AvgPointsAllowed': overall_avg_points_allowed,
    'WinRate': win_rate
})

# Reset the index of the team_features DataFrame and rename the index column to "Team".
team_features.reset_index(inplace=True)
team_features.rename(columns={'Home': 'Team'}, inplace=True)

# Display the first few rows of the team_features DataFrame.
team_features.head(32)

  visitor_wins = merged_2024.groupby('Visitor').apply(lambda x: len(x) - x['HomeWon'].sum())


Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate
0,ARI,21.93905,22.417655,0.419609
1,ATL,22.178923,23.802673,0.504973
2,BAL,30.215613,22.331169,0.679041
3,BUF,32.132682,21.771034,0.810774
4,CAR,18.581203,31.259658,0.260912
5,CHI,17.595422,22.197156,0.244108
6,CIN,28.780905,26.494829,0.501461
7,CLE,15.724711,25.285987,0.185644
8,DAL,20.87366,27.726174,0.443215
9,DEN,24.825513,19.431284,0.565964


In [44]:
upcoming_games.head(16)

Unnamed: 0,Home,Visitor
0,PHI,DAL
1,LAC,KC
2,NO,ARI
3,NYJ,PIT
4,IND,MIA
5,ATL,TB
6,WAS,NYG
7,JAX,CAR
8,CLE,CIN
9,NE,LV


In [45]:
# Calculate defensive features for each NFL team.

# 1. Average points defended:
# This metric is essentially the same as AvgPointsAllowed, which we already computed in previous steps so we won't recompute it here.

# 2. Average conceded plays:
# A play is considered successful for the offense if it results in a touchdown or doesn't result in a turnover.
# Create a new column 'SuccessfulPlay' in the all_data DataFrame to represent this.
merged_2024['SuccessfulPlay'] = merged_2024['IsTouchdown'] | (~merged_2024['IsInterception'] & ~merged_2024['IsFumble'])

# Calculate the average rate of successful plays conceded when playing at home.
avg_conceded_plays_home = merged_2024.groupby('Home')['SuccessfulPlay'].mean()

# Calculate the average rate of successful plays conceded when playing as a visitor.
avg_conceded_plays_visitor = merged_2024.groupby('Visitor')['SuccessfulPlay'].mean()

# Calculate the overall average rate of successful plays conceded for each team.
overall_avg_conceded_plays = (avg_conceded_plays_home + avg_conceded_plays_visitor) / 2

# 3. Average forced turnovers:
# Create a new column 'Turnover' that indicates if a play resulted in a turnover (either interception or fumble).
merged_2024['Turnover'] = merged_2024['IsInterception'] | merged_2024['IsFumble']

# Calculate the average rate of turnovers forced when playing at home.
avg_forced_turnovers_home = merged_2024.groupby('Home')['Turnover'].mean()

# Calculate the average rate of turnovers forced when playing as a visitor.
avg_forced_turnovers_visitor = merged_2024.groupby('Visitor')['Turnover'].mean()

# Calculate the overall average rate of turnovers forced for each team.
overall_avg_forced_turnovers = (avg_forced_turnovers_home + avg_forced_turnovers_visitor) / 2

# Create a new DataFrame to store the defensive features for each team.
team_features_defensive = pd.DataFrame({
    'Team': team_features['Team'].values,
    'AvgPointsDefended': team_features['AvgPointsAllowed'].values,
    'AvgConcededPlays': overall_avg_conceded_plays.values,
    'AvgForcedTurnovers': overall_avg_forced_turnovers.values
})

# Merge the defensive features with the original team features to create a combined DataFrame.
team_features_combined = team_features.merge(team_features_defensive, on='Team', how='left')

# Display the first few rows of the combined team features DataFrame.
team_features_combined.head(32)

Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate,AvgPointsDefended,AvgConcededPlays,AvgForcedTurnovers
0,ARI,21.93905,22.417655,0.419609,22.417655,-1.019879,0.021251
1,ATL,22.178923,23.802673,0.504973,23.802673,-1.02221,0.023877
2,BAL,30.215613,22.331169,0.679041,22.331169,-1.015552,0.017282
3,BUF,32.132682,21.771034,0.810774,21.771034,-1.018904,0.020579
4,CAR,18.581203,31.259658,0.260912,31.259658,-1.017999,0.018779
5,CHI,17.595422,22.197156,0.244108,22.197156,-1.017277,0.01789
6,CIN,28.780905,26.494829,0.501461,26.494829,-1.019098,0.022027
7,CLE,15.724711,25.285987,0.185644,25.285987,-1.019512,0.021505
8,DAL,20.87366,27.726174,0.443215,27.726174,-1.02603,0.027007
9,DEN,24.825513,19.431284,0.565964,19.431284,-1.017416,0.020217


In [46]:
# Calculate additional offensive features

# 1. Average yards per play
avg_yards_per_play_home = merged_2024.groupby('Home')['Yards'].mean()
avg_yards_per_play_visitor = merged_2024.groupby('Visitor')['Yards'].mean()
overall_avg_yards_per_play = (avg_yards_per_play_home + avg_yards_per_play_visitor) / 2

# 2. Average total yards per game
total_yards_per_game_home = merged_2024.groupby(['SeasonYear', 'Home'])['Yards'].sum() / merged_2024.groupby(['SeasonYear', 'Home']).size()
total_yards_per_game_visitor = merged_2024.groupby(['SeasonYear', 'Visitor'])['Yards'].sum() / merged_2024.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_yards_per_game = (total_yards_per_game_home + total_yards_per_game_visitor).groupby(level=1).mean()

# 3. Average pass completion rate
avg_pass_completion_rate_home = merged_2024.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
avg_pass_completion_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
overall_avg_pass_completion_rate = (avg_pass_completion_rate_home + avg_pass_completion_rate_visitor) / 2

# 4. Average touchdowns per game
avg_touchdowns_per_game_home = merged_2024.groupby(['SeasonYear', 'Home'])['IsTouchdown'].sum() / merged_2024.groupby(['SeasonYear', 'Home']).size()
avg_touchdowns_per_game_visitor = merged_2024.groupby(['SeasonYear', 'Visitor'])['IsTouchdown'].sum() / merged_2024.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_touchdowns_per_game = (avg_touchdowns_per_game_home + avg_touchdowns_per_game_visitor).groupby(level=1).mean()

# 5. Average rush success rate
avg_rush_success_rate_home = merged_2024.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
avg_rush_success_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
overall_avg_rush_success_rate = (avg_rush_success_rate_home + avg_rush_success_rate_visitor) / 2

# Creating a dataframe for the new offensive features
new_offensive_features = pd.DataFrame({
    'Team': team_features_combined['Team'],
    'AvgYardsPerPlay': overall_avg_yards_per_play.values,
    'AvgYardsPerGame': overall_avg_yards_per_game.values,
    'AvgPassCompletionRate': overall_avg_pass_completion_rate.values,
    'AvgTouchdownsPerGame': overall_avg_touchdowns_per_game.values,
    'AvgRushSuccessRate': overall_avg_rush_success_rate.values
})

# Merging with the existing combined features
team_features_expanded = team_features_combined.merge(new_offensive_features, on='Team')

team_features_expanded.head(32)

  avg_pass_completion_rate_home = merged_2024.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_pass_completion_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_rush_success_rate_home = merged_2024.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
  avg_rush_success_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())


Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate,AvgPointsDefended,AvgConcededPlays,AvgForcedTurnovers,AvgYardsPerPlay,AvgYardsPerGame,AvgPassCompletionRate,AvgTouchdownsPerGame,AvgRushSuccessRate
0,ARI,21.93905,22.417655,0.419609,22.417655,-1.019879,0.021251,3.982764,7.965529,0.888556,0.052776,5.232888
1,ATL,22.178923,23.802673,0.504973,23.802673,-1.02221,0.023877,4.055938,8.111875,0.889247,0.057662,4.730858
2,BAL,30.215613,22.331169,0.679041,22.331169,-1.015552,0.017282,4.111842,8.223684,0.874797,0.068648,5.048079
3,BUF,32.132682,21.771034,0.810774,21.771034,-1.018904,0.020579,4.05299,8.105981,0.873419,0.076164,4.897583
4,CAR,18.581203,31.259658,0.260912,31.259658,-1.017999,0.018779,3.82352,7.64704,0.879671,0.063857,5.078586
5,CHI,17.595422,22.197156,0.244108,22.197156,-1.017277,0.01789,3.623948,7.247896,0.869852,0.052716,4.569097
6,CIN,28.780905,26.494829,0.501461,26.494829,-1.019098,0.022027,4.020278,8.040556,0.872226,0.078036,4.638758
7,CLE,15.724711,25.285987,0.185644,25.285987,-1.019512,0.021505,3.640863,7.281725,0.859458,0.054924,4.629079
8,DAL,20.87366,27.726174,0.443215,27.726174,-1.02603,0.027007,3.886588,7.773176,0.870121,0.06119,4.717464
9,DEN,24.825513,19.431284,0.565964,19.431284,-1.017416,0.020217,3.686734,7.373468,0.866159,0.053687,4.332698


In [47]:
# Calculate additional defensive features

# 1. Average yards allowed per play
avg_yards_allowed_per_play_home = merged_2024.groupby('Home')['Yards'].mean()
avg_yards_allowed_per_play_visitor = merged_2024.groupby('Visitor')['Yards'].mean()
overall_avg_yards_allowed_per_play = (avg_yards_allowed_per_play_home + avg_yards_allowed_per_play_visitor) / 2

# 2. Average total yards allowed per game
total_yards_allowed_per_game_home = merged_2024.groupby(['SeasonYear', 'Home'])['Yards'].sum() / merged_2024.groupby(['SeasonYear', 'Home']).size()
total_yards_allowed_per_game_visitor = merged_2024.groupby(['SeasonYear', 'Visitor'])['Yards'].sum() / merged_2024.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_yards_allowed_per_game = (total_yards_allowed_per_game_home + total_yards_allowed_per_game_visitor).groupby(level=1).mean()

# 3. Average pass completion allowed rate
avg_pass_completion_allowed_rate_home = merged_2024.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
avg_pass_completion_allowed_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
overall_avg_pass_completion_allowed_rate = (avg_pass_completion_allowed_rate_home + avg_pass_completion_allowed_rate_visitor) / 2

# 4. Average touchdowns allowed per game
avg_touchdowns_allowed_per_game_home = merged_2024.groupby(['SeasonYear', 'Home'])['IsTouchdown'].sum() / merged_2024.groupby(['SeasonYear', 'Home']).size()
avg_touchdowns_allowed_per_game_visitor = merged_2024.groupby(['SeasonYear', 'Visitor'])['IsTouchdown'].sum() / merged_2024.groupby(['SeasonYear', 'Visitor']).size()
overall_avg_touchdowns_allowed_per_game = (avg_touchdowns_allowed_per_game_home + avg_touchdowns_allowed_per_game_visitor).groupby(level=1).mean()

# 5. Average rush success allowed rate
avg_rush_success_allowed_rate_home = merged_2024.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
avg_rush_success_allowed_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
overall_avg_rush_success_allowed_rate = (avg_rush_success_allowed_rate_home + avg_rush_success_allowed_rate_visitor) / 2

# Creating a dataframe for the new defensive features
new_defensive_features = pd.DataFrame({
    'Team': team_features_expanded['Team'],
    'AvgYardsAllowedPerPlay': overall_avg_yards_allowed_per_play.values,
    'AvgYardsAllowedPerGame': overall_avg_yards_allowed_per_game.values,
    'AvgPassCompletionAllowedRate': overall_avg_pass_completion_allowed_rate.values,
    'AvgTouchdownsAllowedPerGame': overall_avg_touchdowns_allowed_per_game.values,
    'AvgRushSuccessAllowedRate': overall_avg_rush_success_allowed_rate.values
})

# Merging with the existing combined features
team_features_complete = team_features_expanded.merge(new_defensive_features, on='Team')

team_features_complete

  avg_pass_completion_allowed_rate_home = merged_2024.groupby('Home').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_pass_completion_allowed_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: 1 - x['IsIncomplete'].mean())
  avg_rush_success_allowed_rate_home = merged_2024.groupby('Home').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())
  avg_rush_success_allowed_rate_visitor = merged_2024.groupby('Visitor').apply(lambda x: x['Yards'][x['IsRush'] == 1].mean())


Unnamed: 0,Team,AvgPointsScored,AvgPointsAllowed,WinRate,AvgPointsDefended,AvgConcededPlays,AvgForcedTurnovers,AvgYardsPerPlay,AvgYardsPerGame,AvgPassCompletionRate,AvgTouchdownsPerGame,AvgRushSuccessRate,AvgYardsAllowedPerPlay,AvgYardsAllowedPerGame,AvgPassCompletionAllowedRate,AvgTouchdownsAllowedPerGame,AvgRushSuccessAllowedRate
0,ARI,21.93905,22.417655,0.419609,22.417655,-1.019879,0.021251,3.982764,7.965529,0.888556,0.052776,5.232888,3.982764,7.965529,0.888556,0.052776,5.232888
1,ATL,22.178923,23.802673,0.504973,23.802673,-1.02221,0.023877,4.055938,8.111875,0.889247,0.057662,4.730858,4.055938,8.111875,0.889247,0.057662,4.730858
2,BAL,30.215613,22.331169,0.679041,22.331169,-1.015552,0.017282,4.111842,8.223684,0.874797,0.068648,5.048079,4.111842,8.223684,0.874797,0.068648,5.048079
3,BUF,32.132682,21.771034,0.810774,21.771034,-1.018904,0.020579,4.05299,8.105981,0.873419,0.076164,4.897583,4.05299,8.105981,0.873419,0.076164,4.897583
4,CAR,18.581203,31.259658,0.260912,31.259658,-1.017999,0.018779,3.82352,7.64704,0.879671,0.063857,5.078586,3.82352,7.64704,0.879671,0.063857,5.078586
5,CHI,17.595422,22.197156,0.244108,22.197156,-1.017277,0.01789,3.623948,7.247896,0.869852,0.052716,4.569097,3.623948,7.247896,0.869852,0.052716,4.569097
6,CIN,28.780905,26.494829,0.501461,26.494829,-1.019098,0.022027,4.020278,8.040556,0.872226,0.078036,4.638758,4.020278,8.040556,0.872226,0.078036,4.638758
7,CLE,15.724711,25.285987,0.185644,25.285987,-1.019512,0.021505,3.640863,7.281725,0.859458,0.054924,4.629079,3.640863,7.281725,0.859458,0.054924,4.629079
8,DAL,20.87366,27.726174,0.443215,27.726174,-1.02603,0.027007,3.886588,7.773176,0.870121,0.06119,4.717464,3.886588,7.773176,0.870121,0.06119,4.717464
9,DEN,24.825513,19.431284,0.565964,19.431284,-1.017416,0.020217,3.686734,7.373468,0.866159,0.053687,4.332698,3.686734,7.373468,0.866159,0.053687,4.332698


In [48]:
# Reload the upcoming games data
upcoming_games = pd.read_csv('/content/upcoming_games_2025_week1.csv')

# Feature encoding: merging the upcoming games data with the team features data
upcoming_encoded_home = upcoming_games.merge(team_features_complete, left_on='Home', right_on='Team', how='left')
upcoming_encoded_both = upcoming_encoded_home.merge(team_features_complete, left_on='Visitor', right_on='Team', suffixes=('_Home', '_Visitor'), how='left')

upcoming_encoded_both

Unnamed: 0,Home,Visitor,Team_Home,AvgPointsScored_Home,AvgPointsAllowed_Home,WinRate_Home,AvgPointsDefended_Home,AvgConcededPlays_Home,AvgForcedTurnovers_Home,AvgYardsPerPlay_Home,...,AvgYardsPerPlay_Visitor,AvgYardsPerGame_Visitor,AvgPassCompletionRate_Visitor,AvgTouchdownsPerGame_Visitor,AvgRushSuccessRate_Visitor,AvgYardsAllowedPerPlay_Visitor,AvgYardsAllowedPerGame_Visitor,AvgPassCompletionAllowedRate_Visitor,AvgTouchdownsAllowedPerGame_Visitor,AvgRushSuccessAllowedRate_Visitor
0,PHI,DAL,PHI,27.784119,18.390556,0.807188,18.390556,-1.021505,0.022849,3.713618,...,3.886588,7.773176,0.870121,0.06119,4.717464,3.886588,7.773176,0.870121,0.06119,4.717464
1,LAC,KC,LAC,23.27507,17.615053,0.633606,17.615053,-1.016398,0.017415,3.737752,...,3.710325,7.420649,0.8723,0.052486,4.402571,3.710325,7.420649,0.8723,0.052486,4.402571
2,NO,ARI,NO,19.658743,23.417726,0.311367,23.417726,-1.01653,0.018674,3.958671,...,3.982764,7.965529,0.888556,0.052776,5.232888,3.982764,7.965529,0.888556,0.052776,5.232888
3,NYJ,PIT,NYJ,19.241102,23.26158,0.247626,23.26158,-1.016311,0.018481,3.623839,...,3.763171,7.526341,0.880841,0.053082,4.335458,3.763171,7.526341,0.880841,0.053082,4.335458
4,IND,MIA,IND,22.046055,25.088614,0.438019,25.088614,-1.023501,0.025656,3.978577,...,3.698139,7.396277,0.880802,0.044901,4.45224,3.698139,7.396277,0.880802,0.044901,4.45224
5,ATL,TB,ATL,22.178923,23.802673,0.504973,23.802673,-1.02221,0.023877,4.055938,...,4.110274,8.220547,0.880633,0.06872,5.119207,4.110274,8.220547,0.880633,0.06872,5.119207
6,WAS,NYG,WAS,28.506914,23.731939,0.680583,23.731939,-1.017854,0.018141,3.907699,...,3.713084,7.426168,0.874942,0.056458,4.874058,3.713084,7.426168,0.874942,0.056458,4.874058
7,JAX,CAR,JAX,18.676046,25.760421,0.241252,25.760421,-1.015473,0.016482,3.959279,...,3.82352,7.64704,0.879671,0.063857,5.078586,3.82352,7.64704,0.879671,0.063857,5.078586
8,CLE,CIN,CLE,15.724711,25.285987,0.185644,25.285987,-1.019512,0.021505,3.640863,...,4.020278,8.040556,0.872226,0.078036,4.638758,4.020278,8.040556,0.872226,0.078036,4.638758
9,NE,LV,NE,16.903046,25.333822,0.181087,25.333822,-1.021808,0.02302,3.655997,...,3.590382,7.180764,0.867423,0.050676,4.314221,3.590382,7.180764,0.867423,0.050676,4.314221


Data Training Preparation

In [None]:
# Prepare training data leak free using prior only rolling features and low ram

import pandas as pd
import numpy as np

# Check required columns in merged data
required_cols = {'Week', 'Home', 'Visitor'}
missing = [c for c in required_cols if c not in merged_2024.columns]
if missing:
    raise ValueError(f"Merged_2024 is missing required columns: {missing}")

# Validate or derive HomeWon column
homewon_exists = 'HomeWon' in merged_2024.columns

home_pts_candidates = ['HomePoints','HomeScore','home_points','Home_Score','HomePTS']
away_pts_candidates = ['VisitorPoints','AwayPoints','VisitorScore','away_points','Visitor_Score','VisitorPTS']
home_points_col = next((c for c in home_pts_candidates if c in merged_2024.columns), None)
visitor_points_col = next((c for c in away_pts_candidates if c in merged_2024.columns), None)

if not homewon_exists:
    if home_points_col and visitor_points_col:
        merged_2024 = merged_2024.copy()
        merged_2024['HomeWon'] = (pd.to_numeric(merged_2024[home_points_col], errors='coerce')
                                  > pd.to_numeric(merged_2024[visitor_points_col], errors='coerce')).astype(int)
    else:
        raise ValueError("Need either HomeWon or score columns to derive it")

# Ensure one row per game to reduce duplicates
merged_2024_games = (
    merged_2024
    .drop_duplicates(subset=['Week','Home','Visitor'])
    .reset_index(drop=True)
    .copy()
)

# Keep only minimal columns to save ram
keep_cols = ['Week','Home','Visitor','HomeWon']
if home_points_col and visitor_points_col:
    keep_cols += [home_points_col, visitor_points_col]
merged_2024_games = merged_2024_games[keep_cols]

# Build per team per game table for home and visitor perspectives
home_df = merged_2024_games[['Week','Home','Visitor','HomeWon']].rename(columns={'Home':'Team','Visitor':'Opp'}).copy()
home_df['IsHome'] = 1
home_df['Won'] = home_df['HomeWon'].astype(int)
if home_points_col and visitor_points_col:
    home_df['PointsFor'] = pd.to_numeric(merged_2024_games[home_points_col], errors='coerce')
    home_df['PointsAgainst'] = pd.to_numeric(merged_2024_games[visitor_points_col], errors='coerce')

vis_df = merged_2024_games[['Week','Home','Visitor','HomeWon']].rename(columns={'Visitor':'Team','Home':'Opp'}).copy()
vis_df['IsHome'] = 0
vis_df['Won'] = (1 - vis_df['HomeWon'].astype(int))
if home_points_col and visitor_points_col:
    vis_df['PointsFor'] = pd.to_numeric(merged_2024_games[visitor_points_col], errors='coerce')
    vis_df['PointsAgainst'] = pd.to_numeric(merged_2024_games[home_points_col], errors='coerce')

team_games = (
    pd.concat([home_df, vis_df], ignore_index=True)
    .sort_values(['Team','Week'])
    .reset_index(drop=True)
)

# Define function for prior only aligned rolling mean
def prior_mean_aligned(df, group_col, value_col):
    g = df.groupby(group_col)[value_col]
    prior_sum = g.cumsum().shift(1)
    prior_cnt = g.cumcount()
    return prior_sum / prior_cnt.replace(0, np.nan)

# Compute leak free rolling features
team_games['WinRate'] = prior_mean_aligned(team_games, 'Team', 'Won')
if ('PointsFor' in team_games.columns) and ('PointsAgainst' in team_games.columns):
    team_games['AvgPointsScored'] = prior_mean_aligned(team_games, 'Team', 'PointsFor')
    team_games['AvgPointsAllowed'] = prior_mean_aligned(team_games, 'Team', 'PointsAgainst')

# Prepare snapshot of pre game features
feat_cols = ['WinRate','AvgPointsScored','AvgPointsAllowed']
feat_cols = [c for c in feat_cols if c in team_games.columns]

pre_game_feats = team_games[['Team','Week'] + feat_cols].copy()

# Merge home features
home_merge = merged_2024_games.merge(pre_game_feats, left_on=['Home','Week'], right_on=['Team','Week'], how='left')
home_merge = home_merge.rename(columns={c: f"{c}_Home" for c in feat_cols}).drop(columns=['Team'])

# Merge visitor features
both_merge = home_merge.merge(pre_game_feats, left_on=['Visitor','Week'], right_on=['Team','Week'], how='left')
both_merge = both_merge.rename(columns={c: f"{c}_Visitor" for c in feat_cols}).drop(columns=['Team'])

# Compute difference features home minus visitor
for base in feat_cols:
    h = f"{base}_Home"
    v = f"{base}_Visitor"
    both_merge[f"Diff_{base}"] = both_merge[h] - both_merge[v]

# Final training matrices
training_labels = both_merge['HomeWon'].astype(int).copy()
training_data = both_merge[[c for c in both_merge.columns if c.startswith('Diff_')]].copy()

# Handle first week NaNs by filling with column means
if training_data.isna().any().any():
    training_data = training_data.fillna(training_data.mean(numeric_only=True))

# Optional preview
training_data.head(50)


Unnamed: 0,Diff_AvgPointsScored,Diff_AvgPointsAllowed,Diff_WinRate,Diff_AvgPointsDefended,Diff_AvgConcededPlays,Diff_AvgForcedTurnovers,Diff_AvgYardsPerPlay,Diff_AvgYardsPerGame,Diff_AvgPassCompletionRate,Diff_AvgTouchdownsPerGame,Diff_AvgRushSuccessRate,Diff_AvgYardsAllowedPerPlay,Diff_AvgYardsAllowedPerGame,Diff_AvgPassCompletionAllowedRate,Diff_AvgTouchdownsAllowedPerGame,Diff_AvgRushSuccessAllowedRate
0,-1.080438,-0.905936,0.211162,-0.905936,-0.001482,0.00205,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163
1,-1.080438,-0.905936,0.211162,-0.905936,-0.001482,0.00205,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163
2,-1.080438,-0.905936,0.211162,-0.905936,-0.001482,0.00205,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163
3,-1.080438,-0.905936,0.211162,-0.905936,-0.001482,0.00205,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163
4,-1.080438,-0.905936,0.211162,-0.905936,-0.001482,0.00205,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163
5,-1.080438,-0.905936,0.211162,-0.905936,-0.001482,0.00205,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163,-0.167556,-0.335112,-0.012156,-0.004077,-0.49163
6,11.221799,-8.009988,0.298008,-8.009988,-0.00598,0.0062,0.286753,0.573507,0.000962,0.004863,0.040622,0.286753,0.573507,0.000962,0.004863,0.040622
7,11.221799,-8.009988,0.298008,-8.009988,-0.00598,0.0062,0.286753,0.573507,0.000962,0.004863,0.040622,0.286753,0.573507,0.000962,0.004863,0.040622
8,11.221799,-8.009988,0.298008,-8.009988,-0.00598,0.0062,0.286753,0.573507,0.000962,0.004863,0.040622,0.286753,0.573507,0.000962,0.004863,0.040622
9,11.221799,-8.009988,0.298008,-8.009988,-0.00598,0.0062,0.286753,0.573507,0.000962,0.004863,0.040622,0.286753,0.573507,0.000962,0.004863,0.040622


In [50]:
training_data.shape

(48063, 16)

In [51]:
training_labels.head()

Unnamed: 0,HomeWon
0,True
1,True
2,True
3,True
4,True


AI Model Training

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Initialize the logistic regression model
logreg = LogisticRegression(max_iter=1000)

# Evaluate the model's performance using cross-validation
cross_val_scores = cross_val_score(logreg, training_data, training_labels, cv=10)

cross_val_scores_mean = cross_val_scores.mean()

cross_val_scores_mean

np.float64(0.7876323940127195)

In [53]:
# Checking the shape of the training data
training_data.shape

(48063, 16)

In [54]:
# Train the logistic regression model on the entire cleaned training dataset
logreg.fit(training_data, training_labels)

Upcoming Game Predictions

In [55]:
# Step 1: Extract the 'Diff_' columns from the training data
# Create a list of the 'Diff_' columns used in training data
diff_columns = [col for col in training_encoded_both.columns if 'Diff_' in col]

# Step 2: Create the upcoming_encoded_final by adding the differences to the upcoming_encoded_both dataframe
upcoming_encoded_final = upcoming_encoded_both.copy()

# Add the 'Diff_' columns to the upcoming_encoded_both from training data
# We use the training data to fill in the Diff_ columns by matching on the 'Home' and 'Visitor' teams

for col in diff_columns:
    # Calculate the difference for each column for the upcoming games
    # Merging based on both 'Home' and 'Visitor' teams
    upcoming_encoded_final[col] = upcoming_encoded_final[f'{col.split("_")[1]}_Home'] - upcoming_encoded_final[f'{col.split("_")[1]}_Visitor']

upcoming_encoded_final.head(16)

Unnamed: 0,Home,Visitor,Team_Home,AvgPointsScored_Home,AvgPointsAllowed_Home,WinRate_Home,AvgPointsDefended_Home,AvgConcededPlays_Home,AvgForcedTurnovers_Home,AvgYardsPerPlay_Home,...,Diff_AvgYardsPerPlay,Diff_AvgYardsPerGame,Diff_AvgPassCompletionRate,Diff_AvgTouchdownsPerGame,Diff_AvgRushSuccessRate,Diff_AvgYardsAllowedPerPlay,Diff_AvgYardsAllowedPerGame,Diff_AvgPassCompletionAllowedRate,Diff_AvgTouchdownsAllowedPerGame,Diff_AvgRushSuccessAllowedRate
0,PHI,DAL,PHI,27.784119,18.390556,0.807188,18.390556,-1.021505,0.022849,3.713618,...,-0.17297,-0.34594,0.015664,0.00129,0.326966,-0.17297,-0.34594,0.015664,0.00129,0.326966
1,LAC,KC,LAC,23.27507,17.615053,0.633606,17.615053,-1.016398,0.017415,3.737752,...,0.027427,0.054854,0.000619,-0.004242,0.340779,0.027427,0.054854,0.000619,-0.004242,0.340779
2,NO,ARI,NO,19.658743,23.417726,0.311367,23.417726,-1.01653,0.018674,3.958671,...,-0.024093,-0.048187,-0.024769,0.005201,-0.416487,-0.024093,-0.048187,-0.024769,0.005201,-0.416487
3,NYJ,PIT,NYJ,19.241102,23.26158,0.247626,23.26158,-1.016311,0.018481,3.623839,...,-0.139332,-0.278664,-0.026069,0.003102,0.14792,-0.139332,-0.278664,-0.026069,0.003102,0.14792
4,IND,MIA,IND,22.046055,25.088614,0.438019,25.088614,-1.023501,0.025656,3.978577,...,0.280438,0.560876,0.001954,0.017634,0.326394,0.280438,0.560876,0.001954,0.017634,0.326394
5,ATL,TB,ATL,22.178923,23.802673,0.504973,23.802673,-1.02221,0.023877,4.055938,...,-0.054336,-0.108672,0.008614,-0.011058,-0.38835,-0.054336,-0.108672,0.008614,-0.011058,-0.38835
6,WAS,NYG,WAS,28.506914,23.731939,0.680583,23.731939,-1.017854,0.018141,3.907699,...,0.194615,0.389231,0.013771,0.012323,0.157608,0.194615,0.389231,0.013771,0.012323,0.157608
7,JAX,CAR,JAX,18.676046,25.760421,0.241252,25.760421,-1.015473,0.016482,3.959279,...,0.135758,0.271517,-0.008217,-0.005974,-0.514691,0.135758,0.271517,-0.008217,-0.005974,-0.514691
8,CLE,CIN,CLE,15.724711,25.285987,0.185644,25.285987,-1.019512,0.021505,3.640863,...,-0.379415,-0.75883,-0.012768,-0.023112,-0.00968,-0.379415,-0.75883,-0.012768,-0.023112,-0.00968
9,NE,LV,NE,16.903046,25.333822,0.181087,25.333822,-1.021808,0.02302,3.655997,...,0.065615,0.131231,0.006267,0.004816,0.445274,0.065615,0.131231,0.006267,0.004816,0.445274


In [56]:
# Predict the probability of the home team winning for the upcoming games
upcoming_game_probabilities = logreg.predict_proba(upcoming_encoded_final[[col for col in upcoming_encoded_final.columns if 'Diff_' in col]])

In [57]:
upcoming_game_probabilities

array([[0.09610039, 0.90389961],
       [0.81748473, 0.18251527],
       [0.61606328, 0.38393672],
       [0.888612  , 0.111388  ],
       [0.53133179, 0.46866821],
       [0.52188351, 0.47811649],
       [0.05061517, 0.94938483],
       [0.45150769, 0.54849231],
       [0.83684696, 0.16315304],
       [0.52705652, 0.47294348],
       [0.25792967, 0.74207033],
       [0.08685275, 0.91314725],
       [0.33481118, 0.66518882],
       [0.71622359, 0.28377641],
       [0.2957618 , 0.7042382 ],
       [0.97062569, 0.02937431]])

In [58]:
# Extract the probability that the home team will win (second column of the result)
upcoming_game_prob_home_win = upcoming_game_probabilities[:, 1]

# Add the predictions to the upcoming games dataframe
upcoming_encoded_final['Home Win Probability'] = upcoming_game_prob_home_win

# Determine the predicted winner based on HomeWinProbability
upcoming_encoded_final['Predicted Winner'] = upcoming_encoded_final.apply(
    lambda row: row['Home'] if row['Home Win Probability'] >= 0.5 else row['Visitor'], axis=1
)

# Sort by the probability of the home team winning for better visualization
upcoming_predictions = upcoming_encoded_final[['Home', 'Visitor', 'Home Win Probability', 'Predicted Winner']].sort_values(by='Home Win Probability', ascending=False)

upcoming_predictions

Unnamed: 0,Home,Visitor,Home Win Probability,Predicted Winner
6,WAS,NYG,0.949385,WAS
11,DEN,TEN,0.913147,DEN
0,PHI,DAL,0.9039,PHI
10,SEA,SF,0.74207,SEA
14,BUF,BAL,0.704238,BUF
12,LA,HOU,0.665189,LA
7,JAX,CAR,0.548492,JAX
5,ATL,TB,0.478116,TB
9,NE,LV,0.472943,LV
4,IND,MIA,0.468668,MIA
