# Details

In [122]:
import pandas as pd
import numpy as np

In [123]:
# Read all cleaned tables written by phase 2.
# Every CSV carries an "Unnamed: 0" column that is dropped right after loading
# (presumably a saved index column -- confirm against the phase 2 export).
box_scores = pd.read_csv("../phase2/cleaned-data/box_scores_2010_2017.csv").drop(columns=["Unnamed: 0"])
game_data = pd.read_csv("../phase2/cleaned-data/nfl_game_data_2010_2023.csv").drop(columns=["Unnamed: 0"])
team_stats = pd.read_csv("../phase2/cleaned-data/nfl_team_stats_2010_2021.csv").drop(columns=["Unnamed: 0"])
nfl_teams = pd.read_csv("../phase2/cleaned-data/nfl_teams_info.csv").drop(columns=["Unnamed: 0"])

## Setting Up Data

In [124]:
# The other datasets mention the St. Louis Rams and the Las Vegas Raiders,
# so both get their own row in the team lookup table.
rams = pd.DataFrame([{
    "team_name": "St. Louis Rams",
    "team_name_short": "Rams",
    "team_id": "LAR",
    "team_conference": "NFC",
    "team_division": "NFC West",
}])
raiders = pd.DataFrame([{
    "team_name": "Las Vegas Raiders",
    "team_name_short": "Raiders",
    "team_id": "LVR",
    "team_conference": "AFC",
    "team_division": "AFC West",
}])
# Append the two rows, then order the table by division with a fresh index.
nfl_teams = (
    pd.concat([nfl_teams, rams, raiders], ignore_index=True)
    .sort_values(by=["team_division"])
    .reset_index(drop=True)
)
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_conference,team_division
0,New England Patriots,Patriots,NE,AFC,AFC East
1,Buffalo Bills,Bills,BUF,AFC,AFC East
2,Miami Dolphins,Dolphins,MIA,AFC,AFC East
3,New York Jets,Jets,NYJ,AFC,AFC East
4,Baltimore Ravens,Ravens,BAL,AFC,AFC North


In [125]:
# function to get team id from city/team name
def get_team_id(city):
    """Return the ``team_id`` for a city or team label.

    The label is first checked against a small table of box-score
    abbreviations ("NY Giants", "NY Jets", "LA Rams", "LA Chargers").
    Otherwise the first row of ``nfl_teams`` whose ``team_name`` contains
    ``city`` as a substring supplies the id.

    Parameters
    ----------
    city : str
        City or full team name, e.g. "Pittsburgh" or "St. Louis Rams".

    Returns
    -------
    str or None
        The matching ``team_id``; ``None`` when nothing matches or when
        ``city`` is not a string (e.g. a missing value).

    Notes
    -----
    A label contained in several team names resolves to whichever of those
    rows comes first in ``nfl_teams``.
    """
    # Missing / non-text labels cannot be matched against team names.
    if not isinstance(city, str):
        return None

    # Abbreviations are resolved once up front rather than on every pass
    # through the team table, so they also work when the table is empty.
    aliases = {
        "NY Giants": "NYG",
        "NY Jets": "NYJ",
        "LA Rams": "LAR",
        "LA Chargers": "LAC",
    }
    if city in aliases:
        return aliases[city]

    # Literal (non-regex) substring test, so the "." in "St. Louis" is not a wildcard.
    is_match = nfl_teams["team_name"].str.contains(city, regex=False, na=False)
    matching_ids = nfl_teams.loc[is_match, "team_id"]
    if matching_ids.empty:
        return None
    return matching_ids.iloc[0]

In [126]:
# adding team ids to the box scores dataset
# The "home" / "visitor" labels are resolved through get_team_id.
box_scores["home_id"] = box_scores["home"].apply(get_team_id)
box_scores["away_id"] = box_scores["visitor"].apply(get_team_id)

box_scores.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,home_time_of_possession,home_id,away_id
0,2014-09-07,Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,24,503,67,7.5,32:27,PIT,CLE
1,2014-09-07,Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,24,420,82,5.1,30:46,PHI,JAX
2,2014-09-04,Green Bay,Seattle,16,36,19,255,57,4.5,26:40,25,398,66,6.0,33:20,SEA,GB
3,2014-09-07,Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,15,318,63,5.0,31:43,LAR,MIN
4,2014-09-07,Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,26,423,85,5.0,29:30,BAL,CIN


In [127]:
# adding team ids to the game data dataset
# "team_home" / "team_away" are resolved through the same get_team_id lookup,
# so both tables share the id columns used as join keys below.
game_data["home_id"] = game_data["team_home"].apply(get_team_id)
game_data["away_id"] = game_data["team_away"].apply(get_team_id)

game_data.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph,home_id,away_id
0,2010-09-09,2010,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0,NO,MIN
1,2010-09-12,2010,1,Buffalo Bills,10.0,15.0,Miami Dolphins,MIA,-3.0,Ralph Wilson Stadium,64.0,7.0,BUF,MIA
2,2010-09-12,2010,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0,CHI,DET
3,2010-09-12,2010,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0,HOU,IND
4,2010-09-12,2010,1,Jacksonville Jaguars,24.0,17.0,Denver Broncos,JAX,-3.0,EverBank Field,91.0,1.0,JAX,DEN


## Joining Data

In [128]:
# merge the box scores and game data using inner join on date and home/away id
# Inner join: only games present in both tables (same date, home id and away id) are kept.
box_game_data_merged = pd.merge(box_scores, game_data, how="inner", left_on=["date", "home_id", "away_id"], right_on=["schedule_date", "home_id", "away_id"])
box_game_data_merged.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,...,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,2014-09-07,Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,...,1,Pittsburgh Steelers,30.0,27.0,Cleveland Browns,PIT,-5.5,Heinz Field,72.0,6.0
1,2014-09-07,Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,...,1,Philadelphia Eagles,34.0,17.0,Jacksonville Jaguars,PHI,-10.0,Lincoln Financial Field,80.0,6.0
2,2014-09-04,Green Bay,Seattle,16,36,19,255,57,4.5,26:40,...,1,Seattle Seahawks,36.0,16.0,Green Bay Packers,SEA,-4.5,CenturyLink Field,70.0,5.0
3,2014-09-07,Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,...,1,St. Louis Rams,6.0,34.0,Minnesota Vikings,LAR,-3.0,Edward Jones Dome,72.0,0.0
4,2014-09-07,Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,...,1,Baltimore Ravens,16.0,23.0,Cincinnati Bengals,BAL,-1.0,M&T Bank Stadium,78.0,0.0


In [129]:
# sort the merged dataset by date
# get_last_n_games_stats takes the tail of this frame, so it relies on this ordering.
box_game_data_merged = box_game_data_merged.sort_values(by=["date"]).reset_index(drop=True)
box_game_data_merged.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,...,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,2010-09-09,Minnesota,New Orleans,9,14,12,253,51,5.0,26:17,...,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0
1,2010-09-12,Indianapolis,Houston,24,34,25,463,69,6.7,29:07,...,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0
2,2010-09-12,Detroit,Chicago,14,19,13,168,57,2.9,25:18,...,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0
3,2010-09-12,Arizona,St. Louis,17,13,21,378,64,5.9,27:09,...,1,St. Louis Rams,13.0,17.0,Arizona Cardinals,ARI,-3.0,Edward Jones Dome,72.0,0.0
4,2010-09-12,Carolina,NY Giants,18,31,14,237,63,3.8,25:21,...,1,New York Giants,31.0,18.0,Carolina Panthers,NYG,-6.0,MetLife Stadium,65.0,1.0


In [130]:
# drop unnecessary columns
# The frame keeps "date", "team_home"/"team_away" and "score_home"/"score_away" alongside the dropped columns.
box_game_data_merged = box_game_data_merged.drop(columns=["schedule_date", "visitor", "home", "visitor_score", "home_score"])
box_game_data_merged.head()

Unnamed: 0,date,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,...,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,2010-09-09,12,253,51,5.0,26:17,18,308,62,5.0,...,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0
1,2010-09-12,25,463,69,6.7,29:07,23,355,61,5.8,...,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0
2,2010-09-12,13,168,57,2.9,25:18,23,463,70,6.6,...,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0
3,2010-09-12,21,378,64,5.9,27:09,20,325,81,4.0,...,1,St. Louis Rams,13.0,17.0,Arizona Cardinals,ARI,-3.0,Edward Jones Dome,72.0,0.0
4,2010-09-12,14,237,63,3.8,25:21,21,376,67,5.6,...,1,New York Giants,31.0,18.0,Carolina Panthers,NYG,-6.0,MetLife Stadium,65.0,1.0


In [131]:
# convert time of possession to a float for minutes
def convert_time_to_float(time):
    """Convert an "MM:SS" time-of-possession string to minutes as a float.

    Parameters
    ----------
    time : str, float, int or None
        "MM:SS" text such as "26:17". The literal string "None", ``None``
        and NaN are treated as missing. A value that is already numeric is
        returned as a float, so re-running the conversion cell on an
        already-converted column is harmless.

    Returns
    -------
    float or int
        Minutes plus seconds / 60; ``0`` for missing values.
    """
    if time is None or isinstance(time, str) and time == "None":
        return 0
    if isinstance(time, (int, float)):
        # NaN is the only value that differs from itself; treat it as missing.
        return 0 if time != time else float(time)
    time_split = time.split(":")
    return float(time_split[0]) + float(time_split[1]) / 60

In [132]:
# Replace both "MM:SS" possession columns with minutes as floats (overwrites the columns in place).
box_game_data_merged["visitor_time_of_possession"] = box_game_data_merged["visitor_time_of_possession"].apply(convert_time_to_float)
box_game_data_merged["home_time_of_possession"] = box_game_data_merged["home_time_of_possession"].apply(convert_time_to_float)

In [133]:
def get_winner_id(row):
    """Encode the result of one game from its final scores.

    Parameters
    ----------
    row : mapping
        Must provide "score_home" and "score_away".

    Returns
    -------
    int
        0 when the home score is higher, 1 when the away score is higher,
        2 otherwise.
    """
    home_points = row["score_home"]
    away_points = row["score_away"]
    if home_points > away_points:
        return 0
    if home_points < away_points:
        return 1
    return 2

In [134]:
# apply the get_winner_id function to the merged dataset
# Row-wise: winner_id is 0 (home score higher), 1 (away score higher) or 2 (otherwise).
box_game_data_merged["winner_id"] = box_game_data_merged.apply(get_winner_id, axis=1)
box_game_data_merged.head()

Unnamed: 0,date,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,...,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph,winner_id
0,2010-09-09,12,253,51,5.0,26.283333,18,308,62,5.0,...,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0,0
1,2010-09-12,25,463,69,6.7,29.116667,23,355,61,5.8,...,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0,0
2,2010-09-12,13,168,57,2.9,25.3,23,463,70,6.6,...,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0,0
3,2010-09-12,21,378,64,5.9,27.15,20,325,81,4.0,...,St. Louis Rams,13.0,17.0,Arizona Cardinals,ARI,-3.0,Edward Jones Dome,72.0,0.0,1
4,2010-09-12,14,237,63,3.8,25.35,21,376,67,5.6,...,New York Giants,31.0,18.0,Carolina Panthers,NYG,-6.0,MetLife Stadium,65.0,1.0,0


In [135]:
# Cast the score and box-score columns in one pass: counts become int, per-play averages float.
box_game_data_merged = box_game_data_merged.astype({
    "score_home": int,
    "score_away": int,
    "home_first_downs": int,
    "home_net_yards": int,
    "home_total_plays": int,
    "home_avg_gain": float,
    "visitor_first_downs": int,
    "visitor_net_yards": int,
    "visitor_total_plays": int,
    "visitor_avg_gain": float,
})

In [136]:
# List the columns available after merging and cleaning.
box_game_data_merged.columns

Index(['date', 'visitor_first_downs', 'visitor_net_yards',
       'visitor_total_plays', 'visitor_avg_gain', 'visitor_time_of_possession',
       'home_first_downs', 'home_net_yards', 'home_total_plays',
       'home_avg_gain', 'home_time_of_possession', 'home_id', 'away_id',
       'schedule_season', 'schedule_week', 'team_home', 'score_home',
       'score_away', 'team_away', 'team_favorite_id', 'spread_favorite',
       'stadium', 'weather_temperature', 'weather_wind_mph', 'winner_id'],
      dtype='object')

Most Important Columns: 'home_id', 'away_id', 'visitor_net_yards', 'visitor_time_of_possession', 'home_net_yards', 'home_time_of_possession', 'score_home', 'score_away', 'stadium'

In [137]:
def get_last_n_games_stats(team_id, date, n):
    """Average a team's stats over its ``n`` most recent games before ``date``.

    Games are read from the module-level ``box_game_data_merged`` frame,
    which must already be sorted by date (the tail of the filtered frame is
    taken as "most recent") and have numeric time-of-possession columns.

    Parameters
    ----------
    team_id : str
        Team id matched against "home_id" / "away_id".
    date : str
        Only games with a strictly earlier "date" are considered.
    n : int
        Number of previous games to average over; must be at least 1.

    Returns
    -------
    list or None
        ``[points_scored, points_allowed, time_of_possession, first_downs,
        total_plays, avg_gain]``, each averaged over the ``n`` games. This
        is the order in which ``create_last_n_games_stats_df`` labels the
        values. ``None`` when the team has fewer than ``n`` earlier games.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    games = box_game_data_merged
    involves_team = (games["home_id"] == team_id) | (games["away_id"] == team_id)
    last_n_games = games[involves_team & (games["date"] < date)].tail(n)
    if len(last_n_games) < n:
        return None

    # True for the games in which the team was the home side.
    was_home = last_n_games["home_id"] == team_id

    def team_mean(home_col, visitor_col):
        # Take the home column where the team was at home, the visitor column otherwise.
        own_values = last_n_games[home_col].where(was_home, last_n_games[visitor_col])
        return own_values.sum() / n

    return [
        team_mean("score_home", "score_away"),    # points scored
        team_mean("score_away", "score_home"),    # points allowed
        team_mean("home_time_of_possession", "visitor_time_of_possession"),
        team_mean("home_first_downs", "visitor_first_downs"),
        team_mean("home_total_plays", "visitor_total_plays"),
        team_mean("home_avg_gain", "visitor_avg_gain"),
    ]

In [None]:
# create a new data frame that has date, home_id, away_id, home_last_n_games_stats, away_last_n_games_stats
def create_last_n_games_stats_df(n):
    """Build a per-game table of both teams' averages over their last ``n`` games.

    For every row of ``box_game_data_merged`` the function calls
    ``get_last_n_games_stats`` once for the home team and once for the away
    team. When either call returns ``None``, all twelve stat columns of that
    row are left empty.

    Rows are collected in a list and turned into a DataFrame once:
    ``DataFrame.append`` no longer exists in pandas >= 2.0, and growing a
    frame row by row is quadratic.

    Parameters
    ----------
    n : int
        Number of previous games passed through to ``get_last_n_games_stats``.

    Returns
    -------
    pandas.DataFrame
        Columns "date", "home_id", "away_id", followed by
        ``home_last_n_<stat>`` and ``away_last_n_<stat>`` for each stat in
        score, allowed, time_of_possession, first_downs, total_plays, avg_gain.
    """
    # Positional order in which the list from get_last_n_games_stats is read.
    stat_names = ["score", "allowed", "time_of_possession", "first_downs", "total_plays", "avg_gain"]
    home_columns = [f"home_last_n_{stat}" for stat in stat_names]
    away_columns = [f"away_last_n_{stat}" for stat in stat_names]
    columns = ["date", "home_id", "away_id"] + home_columns + away_columns

    records = []
    games = box_game_data_merged
    for game_date, home_id, away_id in zip(games["date"], games["home_id"], games["away_id"]):
        home_stats = get_last_n_games_stats(home_id, game_date, n)
        away_stats = get_last_n_games_stats(away_id, game_date, n)

        record = {"date": game_date, "home_id": home_id, "away_id": away_id}
        if home_stats is not None and away_stats is not None:
            record.update(zip(home_columns, home_stats[:len(stat_names)]))
            record.update(zip(away_columns, away_stats[:len(stat_names)]))
        else:
            # Not enough history for at least one side: leave every stat empty.
            record.update(dict.fromkeys(home_columns + away_columns))
        records.append(record)

    return pd.DataFrame(records, columns=columns)


In [None]:
# Rolling 5-game averages for every game; create_last_n_games_stats_df scans the
# merged frame twice per row (home and away), so this cell is comparatively slow.
last_5_games_stats = create_last_n_games_stats_df(5)
last_5_games_stats.tail()

In [None]:
# Join the rolling 5-game stats back onto the per-game frame; the key columns
# (date, home_id, away_id) have the same names on both sides.
box_game_data_merged_last_5 = box_game_data_merged.merge(
    last_5_games_stats,
    how="inner",
    on=["date", "home_id", "away_id"],
)
box_game_data_merged_last_5.head()

## Train Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

### Explained Model

In [None]:
# "Explained" model: train on the first 1500 rows of the date-sorted merged frame.
df_box_game = box_game_data_merged[:1500].copy()

# The features include the final scores themselves, so this model describes
# finished games rather than forecasting upcoming ones.
X_train = df_box_game[['home_id', 'away_id', 'visitor_net_yards', 'visitor_time_of_possession', 'home_net_yards', 'home_time_of_possession', 'score_home', 'score_away']]
# Target is the 0/1/2 winner code, fitted with a regressor.
y_train = df_box_game["winner_id"]

# One-hot encode the two team-id columns; all other columns pass through unchanged.
ct = make_column_transformer(
    (OneHotEncoder(), ['home_id', 'away_id']),
    remainder='passthrough'
)

# with_mean=False: scale each column without centering it.
pipeline = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    KNeighborsRegressor(n_neighbors=5)
)

# Search k = 1..19 and two distance metrics with 10-fold CV, scored by negated RMSE.
grid_search = GridSearchCV(
    pipeline,
    param_grid={
        "kneighborsregressor__n_neighbors": range(1, 20),
        "kneighborsregressor__metric": ["euclidean", "manhattan"]
    },
    scoring="neg_root_mean_squared_error",
    cv=10
)

grid_search.fit(X_train, y_train)
grid_search.best_estimator_


In [None]:
# One row per (k, metric) combination tried by the grid search.
df_cv_results = pd.DataFrame(grid_search.cv_results_)
df_cv_results.head()

In [None]:
# Cast k to int and make it the index so it becomes the x-axis of the error plot.
df_cv_results = (
    df_cv_results
    .astype({"param_kneighborsregressor__n_neighbors": "int"})
    .set_index("param_kneighborsregressor__n_neighbors")
)

In [None]:
# The scorer is negated RMSE, so flip the sign to get a positive error.
df_cv_results["pos_mean_test_score"] = -df_cv_results["mean_test_score"]

# One line per distance metric: CV error as a function of k (the index).
(df_cv_results.groupby("param_kneighborsregressor__metric")["pos_mean_test_score"]).plot.line(xlabel = "k", ylabel = "test error", 
                                                                                                title = "KNN Model",
                                                                                                legend = True)

In [None]:
# 10-fold cross-validated RMSE of the tuned pipeline on the training split.
# numeric_only=True: X_train also holds the string columns home_id / away_id,
# and DataFrame.mean() raises on non-numeric columns in pandas >= 2.0.
cv_errs = -cross_val_score(grid_search.best_estimator_, X=X_train.fillna(X_train.mean(numeric_only=True)),
                            y=y_train,
                            scoring="neg_root_mean_squared_error", cv=10)

cv_errs.mean()

In [None]:
# Hold-out set: every row after the first 1500 used for training.
df_box_game_test = box_game_data_merged[1500:].copy()

In [None]:
# Predict the winner code for the hold-out games, keeping their original index
# so the predictions line up with df_box_game_test.
y_new = pd.Series(
    grid_search.best_estimator_.predict(X=df_box_game_test[['home_id', 'away_id', 'visitor_net_yards', 'visitor_time_of_possession', 'home_net_yards', 'home_time_of_possession', 'score_home', 'score_away']]),
    index=df_box_game_test.index
)

y_new

In [None]:
# Predicted vs. actual winner codes for the hold-out games.
pred_vs_actual = pd.DataFrame({
    "Winner_pred": y_new,
    "Winner_actual": df_box_game_test["winner_id"],
    "Home_Team": df_box_game_test["home_id"],
    "Away_Team": df_box_game_test["away_id"]
})

# Threshold the regression output at 0.5 into 0 or 1.
# NOTE(review): winner_id can also be 2 (see get_winner_id); this threshold never
# produces 2 -- confirm that is intended.
pred_vs_actual["Winner_pred"] = pred_vs_actual["Winner_pred"].apply(lambda x: 0 if x < 0.5 else 1)
pred_vs_actual.count()

In [None]:
# Number of hold-out games whose predicted winner code equals the actual one.
pred_vs_actual[pred_vs_actual["Winner_pred"] == pred_vs_actual["Winner_actual"]].count()

### Predict Model

In [None]:
# Keep only games with a complete set of values (rows with any null are removed);
# dropna() returns a new frame, so the merged source frame is left untouched.
df_prediction = box_game_data_merged_last_5.dropna()
df_prediction.shape

##### KNeighborsRegressor for Multi Output

In [None]:
# Predictive model: train on the first 1500 complete rows.
df_last_5 = df_prediction[:1500].copy()

# Features: the two team ids plus each side's rolling 5-game averages.
X_train = df_last_5[['home_id', 'away_id', 'away_last_n_score', 'away_last_n_allowed', 'away_last_n_time_of_possession', 'away_last_n_first_downs', 
                     'away_last_n_total_plays', 'away_last_n_avg_gain', 'home_last_n_score', 'home_last_n_allowed', 
                     'home_last_n_time_of_possession', 'home_last_n_first_downs', 'home_last_n_total_plays', 'home_last_n_avg_gain']]
# Two targets at once: home score and away score.
y_train = df_last_5[['score_home', 'score_away']]

# One-hot encode the two team-id columns; all other columns pass through unchanged.
ct = make_column_transformer(
    (OneHotEncoder(), ['home_id', 'away_id']),
    remainder='passthrough'
)

# with_mean=False: scale each column without centering it.
pipeline = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    KNeighborsRegressor(n_neighbors=5)
)

# Search k = 1..59 and two distance metrics with 10-fold CV, scored by negated RMSE.
grid_search = GridSearchCV(
    pipeline,
    param_grid={
        "kneighborsregressor__n_neighbors": range(1, 60),
        "kneighborsregressor__metric": ["euclidean", "manhattan"]
    },
    scoring="neg_root_mean_squared_error",
    cv=10
)

grid_search.fit(X_train, y_train)
grid_search.best_estimator_

In [None]:
# Grid-search results indexed by k (cast to int) so k becomes the x-axis below.
df_cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .astype({"param_kneighborsregressor__n_neighbors": "int"})
    .set_index("param_kneighborsregressor__n_neighbors")
)

# The scorer is negated RMSE; flip the sign to plot a positive error.
df_cv_results["pos_mean_test_score"] = -df_cv_results["mean_test_score"]

# One line per distance metric.
(df_cv_results.groupby("param_kneighborsregressor__metric")["pos_mean_test_score"]).plot.line(
    xlabel = "k",
    ylabel = "test error",
    title = "KNN Model",
    legend = True,
)

In [None]:
# 10-fold cross-validated RMSE of the tuned multi-output KNN pipeline on the training split.
cv_errs = -cross_val_score(grid_search.best_estimator_, X=X_train,
                            y=y_train,
                            scoring="neg_root_mean_squared_error", cv=10)

cv_errs.mean()

In [None]:
# Hold-out set: every complete row after the first 1500 used for training.
df_last_5_test = df_prediction[1500:].copy()

# Predicted scores; column 0 corresponds to score_home and column 1 to score_away
# (the column order of y_train). The index is kept aligned with the hold-out frame.
y_new = pd.DataFrame(
    grid_search.best_estimator_.predict(X=df_last_5_test[['home_id', 'away_id', 'away_last_n_score', 'away_last_n_allowed', 'away_last_n_time_of_possession', 'away_last_n_first_downs', 
                                                          'away_last_n_total_plays', 'away_last_n_avg_gain', 'home_last_n_score', 'home_last_n_allowed', 
                                                          'home_last_n_time_of_possession', 'home_last_n_first_downs', 'home_last_n_total_plays', 'home_last_n_avg_gain']]),
    index=df_last_5_test.index
)

y_new

In [None]:
# Put the actual scores and the predicted scores side by side, together with the
# date, both team ids and the actual winner code.
pred_vs_actual_knr = pd.DataFrame({
    "Date": df_last_5_test["date"],
    "Home_Team": df_last_5_test["home_id"],
    "Away_Team": df_last_5_test["away_id"],
    "Score_Home": df_last_5_test["score_home"],
    "Score_Away": df_last_5_test["score_away"],
    "Score_Home_Pred": y_new[0],
    "Score_Away_Pred": y_new[1],
    "Winner_Id": df_last_5_test["winner_id"]
})

pred_vs_actual_knr.head()

In [None]:
# Derive the predicted winner code from the predicted scores:
# 0 if the home prediction is higher, 1 if the away prediction is higher, 2 otherwise.
pred_vs_actual_knr["Winner_Pred"] = np.select(
    [
        pred_vs_actual_knr["Score_Home_Pred"] > pred_vs_actual_knr["Score_Away_Pred"],
        pred_vs_actual_knr["Score_Home_Pred"] < pred_vs_actual_knr["Score_Away_Pred"],
    ],
    [0, 1],
    default=2,
)
pred_vs_actual_knr.head()

In [None]:
# Count the games whose predicted winner code matches the actual one.
print(
    (pred_vs_actual_knr["Winner_Pred"] == pred_vs_actual_knr["Winner_Id"]).sum(),
    "out of",
    len(pred_vs_actual_knr),
    "predictions were correct.",
)

##### KNeighbors Regressor: Home and Away Models

In [None]:
# home team model: team ids plus the home side's rolling averages -> home score
X_home_train = df_last_5[['home_id', 'away_id', 'home_last_n_score', 'home_last_n_allowed', 'home_last_n_time_of_possession', 'home_last_n_first_downs', 
                        'home_last_n_total_plays', 'home_last_n_avg_gain']]
y_home_train = df_last_5['score_home']

# away team model: team ids plus the away side's rolling averages -> away score
X_away_train = df_last_5[['home_id', 'away_id', 'away_last_n_score', 'away_last_n_allowed', 'away_last_n_time_of_possession', 'away_last_n_first_downs',
                        'away_last_n_total_plays', 'away_last_n_avg_gain']]
y_away_train = df_last_5['score_away']

# One-hot encode the two team-id columns; all other columns pass through unchanged.
ct = make_column_transformer(
    (OneHotEncoder(), ['home_id', 'away_id']),
    remainder='passthrough'
)

# with_mean=False: scale each column without centering it.
pipeline = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    KNeighborsRegressor(n_neighbors=5)
)

# Same search space for both sides: k = 1..59 and two distance metrics.
knn_param_grid = {
    "kneighborsregressor__n_neighbors": range(1, 60),
    "kneighborsregressor__metric": ["euclidean", "manhattan"]
}

# Each side gets its own GridSearchCV object. Refitting the shared `grid_search`
# here would overwrite the multi-output search fitted earlier, and the ensemble
# cells further down read `grid_search.best_estimator_`.
grid_search_home_cv = GridSearchCV(
    pipeline,
    param_grid=knn_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=10
)
grid_search_home_cv.fit(X_home_train, y_home_train)
grid_search_home = grid_search_home_cv.best_estimator_

grid_search_away_cv = GridSearchCV(
    pipeline,
    param_grid=knn_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=10
)
grid_search_away_cv.fit(X_away_train, y_away_train)
grid_search_away = grid_search_away_cv.best_estimator_

In [None]:
# Best pipeline found for the home-score model.
grid_search_home

In [None]:
# Best pipeline found for the away-score model.
grid_search_away

In [None]:
# 10-fold cross-validated RMSE of the home-score model on its training data.
cv_errs_home = -cross_val_score(grid_search_home, X=X_home_train,
                            y=y_home_train,
                            scoring="neg_root_mean_squared_error", cv=10)

cv_errs_home.mean()

In [None]:
# 10-fold cross-validated RMSE of the away-score model on its training data.
# Stored under its own name so the home-model errors are not overwritten.
cv_errs_away = -cross_val_score(grid_search_away, X=X_away_train,
                            y=y_away_train,
                            scoring="neg_root_mean_squared_error", cv=10)

cv_errs_away.mean()

In [None]:
# Hold-out set: every complete row after the first 1500 used for training.
df_last_5_test = df_prediction[1500:].copy()

# Home-score predictions (single column 0), index aligned with the hold-out frame.
y_home_new = pd.DataFrame(
    grid_search_home.predict(X=df_last_5_test[['home_id', 'away_id', 'home_last_n_score', 'home_last_n_allowed', 'home_last_n_time_of_possession', 
                                               'home_last_n_first_downs', 'home_last_n_total_plays', 'home_last_n_avg_gain']]),
    index=df_last_5_test.index
)

y_home_new

In [None]:
# Away-score predictions (single column 0), index aligned with the hold-out frame.
y_away_new = pd.DataFrame(
    grid_search_away.predict(X=df_last_5_test[['home_id', 'away_id', 'away_last_n_score', 'away_last_n_allowed', 'away_last_n_time_of_possession', 'away_last_n_first_downs', 
                                                          'away_last_n_total_plays', 'away_last_n_avg_gain']]),
    index=df_last_5_test.index
)

y_away_new

In [None]:
# Put the actual scores next to the predictions of the separate home and away
# models, together with the date, both team ids and the actual winner code.
pred_vs_actual_knr_individual = pd.DataFrame({
    "Date": df_last_5_test["date"],
    "Home_Team": df_last_5_test["home_id"],
    "Away_Team": df_last_5_test["away_id"],
    "Score_Home": df_last_5_test["score_home"],
    "Score_Away": df_last_5_test["score_away"],
    "Score_Home_Pred": y_home_new[0],
    "Score_Away_Pred": y_away_new[0],
    "Winner_Id": df_last_5_test["winner_id"]
})

pred_vs_actual_knr_individual.head()

In [None]:
# Derive the predicted winner code from the predicted scores:
# 0 if the home prediction is higher, 1 if the away prediction is higher, 2 otherwise.
pred_vs_actual_knr_individual["Winner_Pred"] = np.select(
    [
        pred_vs_actual_knr_individual["Score_Home_Pred"] > pred_vs_actual_knr_individual["Score_Away_Pred"],
        pred_vs_actual_knr_individual["Score_Home_Pred"] < pred_vs_actual_knr_individual["Score_Away_Pred"],
    ],
    [0, 1],
    default=2,
)
pred_vs_actual_knr_individual.head()

In [None]:
# Count the games whose predicted winner code matches the actual one.
print(
    (pred_vs_actual_knr_individual["Winner_Pred"] == pred_vs_actual_knr_individual["Winner_Id"]).sum(),
    "out of",
    len(pred_vs_actual_knr_individual),
    "predictions were correct.",
)

##### Linear Regression Model

In [None]:
# linear regression model
# Reuses the `ct` column transformer defined in an earlier cell, followed by the
# same non-centering scaler as the KNN pipelines.

from sklearn.linear_model import LinearRegression

pipeline_lr = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    LinearRegression()
)

In [None]:
# Fit on whatever X_train / y_train currently hold in the kernel.
# NOTE(review): presumably the multi-output training data defined above -- confirm the cell order.
pipeline_lr.fit(X_train, y_train)

In [None]:
# cross validation: 10-fold RMSE of the linear-regression pipeline.
# numeric_only=True: X_train also holds the string columns home_id / away_id,
# and DataFrame.mean() raises on non-numeric columns in pandas >= 2.0.
cv_errs = -cross_val_score(pipeline_lr, X=X_train.fillna(X_train.mean(numeric_only=True)),
                            y=y_train,
                            scoring="neg_root_mean_squared_error", cv=10)

cv_errs.mean()

In [None]:
# Linear-regression score predictions for the hold-out games, index aligned with
# df_last_5_test. Columns 0 and 1 are read as home and away score in the next cell.
y_new = pd.DataFrame(
    pipeline_lr.predict(X=df_last_5_test[['home_id', 'away_id', 'away_last_n_score', 'away_last_n_allowed', 'away_last_n_time_of_possession', 'away_last_n_first_downs', 
                                                          'away_last_n_total_plays', 'away_last_n_avg_gain', 'home_last_n_score', 'home_last_n_allowed', 
                                                          'home_last_n_time_of_possession', 'home_last_n_first_downs', 'home_last_n_total_plays', 'home_last_n_avg_gain']]),
    index=df_last_5_test.index
)

y_new

In [None]:
# Actual scores next to the linear-regression predictions, together with the
# date, both team ids and the actual winner code.
pred_vs_actual_lr = pd.DataFrame({
    "Date": df_last_5_test["date"],
    "Home_Team": df_last_5_test["home_id"],
    "Away_Team": df_last_5_test["away_id"],
    "Score_Home": df_last_5_test["score_home"],
    "Score_Away": df_last_5_test["score_away"],
    "Score_Home_Pred": y_new[0],
    "Score_Away_Pred": y_new[1],
    "Winner_Id": df_last_5_test["winner_id"]
})

pred_vs_actual_lr.head()

In [None]:
# Derive the predicted winner code from the predicted scores:
# 0 if the home prediction is higher, 1 if the away prediction is higher, 2 otherwise.
pred_vs_actual_lr["Winner_Pred"] = np.select(
    [
        pred_vs_actual_lr["Score_Home_Pred"] > pred_vs_actual_lr["Score_Away_Pred"],
        pred_vs_actual_lr["Score_Home_Pred"] < pred_vs_actual_lr["Score_Away_Pred"],
    ],
    [0, 1],
    default=2,
)
pred_vs_actual_lr.head()

In [None]:
# Count the games whose predicted winner code matches the actual one.
print(
    (pred_vs_actual_lr["Winner_Pred"] == pred_vs_actual_lr["Winner_Id"]).sum(),
    "out of",
    len(pred_vs_actual_lr),
    "predictions were correct.",
)

##### Ensemble Model: Voting Regressor

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.multioutput import MultiOutputRegressor

# Candidate [linear, knn] weight pairs, from 0.1/0.9 to 0.9/0.1 in steps of 0.1.
all_weights = [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1]]
# Mean 10-fold CV RMSE per weight pair, keyed by the pair's string form.
errors = {}

for w in all_weights:
  # MultiOutputRegressor fits one VotingRegressor per target column.
  ensemble_model = MultiOutputRegressor(VotingRegressor([
      ("linear", pipeline_lr),
      ("knn", grid_search.best_estimator_)],
      weights=w)
  )
  # numeric_only=True: X_train also holds the string columns home_id / away_id,
  # and DataFrame.mean() raises on non-numeric columns in pandas >= 2.0.
  cv_errs = -cross_val_score(ensemble_model, X=X_train.fillna(X_train.mean(numeric_only=True)),
                                y=y_train,
                                scoring="neg_root_mean_squared_error", cv=10)
  errors[str(w)] = cv_errs.mean()

errors

In [None]:
# Final ensemble: linear regression and KNN combined with equal weights,
# one VotingRegressor per target column.
# NOTE(review): the weights [0.5, 0.5] are hard-coded here rather than derived
# from a weight search -- confirm they are the intended choice.
ensemble_model_final = MultiOutputRegressor(VotingRegressor([
        ("linear", pipeline_lr),
        ("knn", grid_search.best_estimator_)],
        weights=[0.5, 0.5])
    )

ensemble_model_final.fit(X_train, y_train)

In [None]:
# Ensemble score predictions for the hold-out games, index aligned with
# df_last_5_test. Columns 0 and 1 are read as home and away score in the next cell.
y_new = pd.DataFrame(
    ensemble_model_final.predict(X=df_last_5_test[['home_id', 'away_id', 'away_last_n_score', 'away_last_n_allowed', 'away_last_n_time_of_possession', 'away_last_n_first_downs', 
                                                          'away_last_n_total_plays', 'away_last_n_avg_gain', 'home_last_n_score', 'home_last_n_allowed', 
                                                          'home_last_n_time_of_possession', 'home_last_n_first_downs', 'home_last_n_total_plays', 'home_last_n_avg_gain']]),
    index=df_last_5_test.index
)

y_new

In [None]:
# Actual scores next to the ensemble predictions, together with the date,
# both team ids and the actual winner code.
pred_vs_actual_ensemble = pd.DataFrame({
    "Date": df_last_5_test["date"],
    "Home_Team": df_last_5_test["home_id"],
    "Away_Team": df_last_5_test["away_id"],
    "Score_Home": df_last_5_test["score_home"],
    "Score_Away": df_last_5_test["score_away"],
    "Score_Home_Pred": y_new[0],
    "Score_Away_Pred": y_new[1],
    "Winner_Id": df_last_5_test["winner_id"]
})

pred_vs_actual_ensemble

In [None]:
# Derive the predicted winner code from the predicted scores:
# 0 if the home prediction is higher, 1 if the away prediction is higher, 2 otherwise.
pred_vs_actual_ensemble["Winner_Pred"] = np.select(
    [
        pred_vs_actual_ensemble["Score_Home_Pred"] > pred_vs_actual_ensemble["Score_Away_Pred"],
        pred_vs_actual_ensemble["Score_Home_Pred"] < pred_vs_actual_ensemble["Score_Away_Pred"],
    ],
    [0, 1],
    default=2,
)
pred_vs_actual_ensemble.head()

In [None]:
# Count the games whose predicted winner code matches the actual one.
print(
    (pred_vs_actual_ensemble["Winner_Pred"] == pred_vs_actual_ensemble["Winner_Id"]).sum(),
    "out of",
    len(pred_vs_actual_ensemble),
    "predictions were correct.",
)

## Last-N Games Models Comparative Analysis

In [None]:
# Rebuild the rolling 5-game stats and join them onto the per-game frame.
# NOTE(review): both objects are already computed with the same calls earlier in
# the notebook; this cell repeats that (slow) work.
last_5_games_stats = create_last_n_games_stats_df(5)
box_game_data_merged_last_5 = pd.merge(box_game_data_merged, last_5_games_stats, how="inner", left_on=["date", "home_id", "away_id"], right_on=["date", "home_id", "away_id"])
box_game_data_merged_last_5.head()

### KNeighbors Regressor: Multi Output Model Function

### KNeighbors Regressor: Home and Away Models Function

### Linear Regression Function

### Ensemble Model: Voting Regressor Function