### Description:

This jupyter notebook will be testing to see how well models can predict a different calculated EPA. EPA in this file is calculated by subtracting the EP before the play and the EP after the play has finished. I will also **NOT** be using the field position and time intervals. These columns will be evaluated as numerical.

In [1]:
import pandas as pd
import numpy as np

dataframe = pd.read_csv("big_ten_pbp.csv")

dataframe["soccer_time"] = dataframe["period"] * (
    16 - (dataframe["minutes"] + (dataframe["seconds"] / 60)).round(0)
).astype("int32")

dataframe["score_differential"] = (
    dataframe["offense_score"] - dataframe["defense_score"]
)
big_ten = dataframe[dataframe["offense_conference"] == "Big Ten"]

In [2]:
conditions = [
    (big_ten["play_type"] == "Field Goal Missed"),
    (big_ten["play_type"] == "Field Goal Good"),
    (big_ten["play_type"] == "Passing Touchdown")
    | (big_ten["play_type"] == "Rusing Touchdown"),
    (big_ten["play_type"] == "Interception Return Touchdown")
    | (big_ten["play_type"] == "Fumble Return Touchdown")
    | (big_ten["play_type"] == "Punt Return Touchdown")
    | (big_ten["play_type"] == "Blocked Punt Touchdown")
    | (big_ten["play_type"] == "Missed Field Goal Return Touchdown")
    | (big_ten["play_type"] == "Blocked Field Goal Touchdown"),
    (big_ten["play_type"] == "Safety"),
]

values = [0, 3, 7, -7, 2]

big_ten["points_scored"] = np.select(conditions, values)

big_ten = big_ten[(big_ten["down"] > 0)]
big_ten = big_ten[(big_ten["distance"] > 0)]

big_ten["points_scored"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_ten["points_scored"] = np.select(conditions, values)


array([ 0,  7,  3, -7,  2])

In [3]:
grouped = (
    big_ten.groupby(
        ["down", "distance", "soccer_time", "yard_line", "score_differential"]
    )["points_scored"]
    .agg(["sum", "count", "mean", "median"])
    .reset_index()
)
grouped["mean"].unique()

array([ 0.        ,  7.        ,  3.        ,  3.5       ,  4.66666667,
        2.33333333, -1.4       ,  1.4       , -7.        ,  2.8       ,
        2.        ,  0.875     ,  1.75      , -3.5       ,  1.16666667,
        1.55555556, -1.27272727,  1.27272727,  0.75      ,  1.        ,
        1.5       ,  5.25      ])

In [4]:
intervals = []
conditions = []

big_ten[
    [
        "next_down",
        "next_distance",
        "next_soccer_time",
        "next_yard_line",
        "next_score_differential",
        "next_home",
        "next_away",
    ]
] = big_ten[
    [
        "down",
        "distance",
        "soccer_time",
        "yard_line",
        "score_differential",
        "home",
        "away",
    ]
].shift(
    -1
)

big_ten = big_ten[
    (big_ten["home"] == big_ten["next_home"])
    & (big_ten["away"] == big_ten["next_away"])
]

big_ten = big_ten[:-1]

i = 0
for row in grouped.to_dict("records"):
    if i % 10000 == 0:
        print(f"{i} completed")
    intervals.append(row["mean"])
    conditions.append(
        (big_ten["next_down"] == row["down"])
        & (big_ten["next_distance"] == row["distance"])
        & (big_ten["next_soccer_time"] == row["soccer_time"])
        & (big_ten["next_yard_line"] == row["yard_line"])
        & (big_ten["next_score_differential"] == row["score_differential"])
    )
    i += 1

big_ten["xP After"] = np.select(conditions, intervals)
big_ten["xP After"].unique()

0 completed
10000 completed
20000 completed
30000 completed
40000 completed
50000 completed
60000 completed


array([ 0.        ,  7.        ,  3.        ,  2.33333333,  3.5       ,
       -7.        , -3.5       ,  1.        , -1.4       ,  2.        ,
        1.27272727,  1.4       ,  2.8       ,  4.66666667, -1.27272727,
        1.75      ,  1.55555556,  1.5       ,  0.875     ,  1.16666667,
        5.25      ,  0.75      ])

In [5]:
# this cell is placing xP values in the right rows and then subtracting the points scored from the xp to get the xP added for the play.
# I wanted to see the average difference between my xPa model compared to the College football data's version of it
intervals = []
conditions = []

i = 0
for row in grouped.to_dict("records"):
    if i % 10000 == 0:
        print(f"{i} completed")
    intervals.append(row["mean"])
    conditions.append(
        (big_ten["down"] == row["down"])
        & (big_ten["distance"] == row["distance"])
        & (big_ten["soccer_time"] == row["soccer_time"])
        & (big_ten["yard_line"] == row["yard_line"])
        & (big_ten["score_differential"] == row["score_differential"])
    )
    i += 1

big_ten["xP Before"] = np.select(conditions, intervals)
big_ten["xP Before"].unique()

0 completed
10000 completed
20000 completed
30000 completed
40000 completed
50000 completed
60000 completed


array([ 0.        ,  7.        ,  3.        ,  2.33333333,  3.5       ,
       -7.        , -3.5       , -1.4       ,  2.        ,  1.27272727,
        1.4       ,  2.8       ,  4.66666667, -1.27272727,  1.75      ,
        1.55555556,  1.5       ,  0.875     ,  1.        ,  1.16666667,
        5.25      ,  0.75      ])

In [6]:
big_ten.head()

Unnamed: 0.1,Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,...,points_scored,next_down,next_distance,next_soccer_time,next_yard_line,next_score_differential,next_home,next_away,xP After,xP Before
0,0,400935230101849903,Minnesota,Big Ten,Buffalo,Mid-American,Minnesota,Buffalo,0,0,...,0,1.0,10.0,2.0,33.0,0.0,Minnesota,Buffalo,0.0,0.0
6,6,400935230101865601,Minnesota,Big Ten,Buffalo,Mid-American,Minnesota,Buffalo,0,0,...,0,1.0,10.0,2.0,46.0,0.0,Minnesota,Buffalo,0.0,0.0
7,7,400935230101865602,Minnesota,Big Ten,Buffalo,Mid-American,Minnesota,Buffalo,0,0,...,0,2.0,5.0,2.0,51.0,0.0,Minnesota,Buffalo,0.0,0.0
8,8,400935230101865603,Minnesota,Big Ten,Buffalo,Mid-American,Minnesota,Buffalo,0,0,...,0,3.0,2.0,2.0,54.0,0.0,Minnesota,Buffalo,0.0,0.0
9,9,400935230101865604,Minnesota,Big Ten,Buffalo,Mid-American,Minnesota,Buffalo,0,0,...,0,4.0,3.0,2.0,53.0,0.0,Minnesota,Buffalo,0.0,0.0


In [7]:
# This cell is pulling and calculating the probability a first down is acheived by down and distance
# This will be used in the model making process to gauge how probable it is to make a first down

big_ten["xPa"] = big_ten["xP Before"] - big_ten["xP After"]

epa_diff = big_ten["xPa"] - big_ten["ppa"]
print(epa_diff.mean())

big_ten["got_first_down"] = big_ten["yards_gained"] >= big_ten["distance"]

first_down_prob = (
    big_ten.groupby(["down", "distance"])["got_first_down"]
    .agg(["sum", "count"])
    .reset_index()
)
first_down_prob["first_down_prob"] = first_down_prob["sum"] / first_down_prob["count"]

intervals = []
conditions = []

for i in range(len(first_down_prob)):
    intervals.append(first_down_prob["first_down_prob"][i])
    conditions.append(
        (big_ten["down"] == first_down_prob["down"][i])
        & (big_ten["distance"] == first_down_prob["distance"][i])
    )

big_ten["first_down_prob"] = np.select(conditions, intervals)

-0.1659673656526226


In [8]:
big_ten["xPa"].unique()

array([ 0.        , -7.        ,  7.        , -3.        ,  3.        ,
       -2.33333333,  2.33333333, -3.5       ,  3.5       , -1.        ,
        1.4       , -1.4       , -2.        ,  2.        , -1.27272727,
       -5.72727273, -2.8       ,  2.8       , -4.66666667,  4.66666667,
        1.27272727, -1.75      ,  1.75      , -4.2       , -1.55555556,
        1.55555556,  6.5       , -1.5       ,  1.5       , -5.5       ,
       -0.875     ,  0.875     ,  1.        , -1.16666667,  1.16666667,
        6.        , -5.25      ,  5.25      ,  4.2       ,  0.5       ,
        6.25      ,  0.75      , -5.83333333, -2.1       , -0.5       ,
       -0.75      ,  2.25      , -4.66666667])

In [10]:
# Now let's create the models for the usual 4th down decision making. I will make 3 different models (Field Goals, Go For It, and Punting)

subset = big_ten[
    [
        "play_type",
        "down",
        "distance",
        "soccer_time",
        "yard_line",
        "score_differential",
        "first_down_prob",
        "xPa",
    ]
]
subset = subset[~subset["play_type"].isin(["Kickoff", "Uncategorized"])]

fourth_down = subset[subset["down"] == 4]
predictiors = fourth_down.columns.drop(["xPa", "play_type"])
prediction_set = fourth_down[predictiors]

The models will be split up by the 3 decisions that could be made. One model for Field Goals, one for punting, and one for going for it.
I will then use each model to predict the xPa value for each 4th down situation for each deicison.

In [13]:
from sklearn.model_selection import train_test_split
from utils import rf_regress_params_tuner


field_goals_dummies = fourth_down[
    fourth_down["play_type"].isin(
        [
            "Field Goal Good",
            "Missed Field Goal Return Touchdown",
            "Missed Field Goal Return",
            "Blocked Field Goal",
            "Field Goal Missed",
            "Blocked Field Goal Touchdown",
        ]
    )
]

predictors = field_goals_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = field_goals_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
fg_train_data, fg_test_data, fg_train_sln, fg_test_sln = train_test_split(
    field_goals_dummies[predictors], target, test_size=0.2, random_state=0
)

field_goals_params = rf_regress_params_tuner(
    fg_train_data, fg_test_data, fg_train_sln, fg_test_sln, field_goals_dummies
)
print(field_goals_params)

INFO:root:Best r2 after tuning max depth is: 0.6592652688220796
INFO:root:Best r2 after tuning n_estimators is: 0.6592652688220796
INFO:root:Best r2 after tuning min samples split is: 0.6592652688220796
INFO:root:Best r2 after tuning min samples leaf is: 0.6592652688220796
INFO:root:Best r2 after tuning max features is: 0.668256774621893
INFO:root:Final Model Accuracy 0.668256774621893
INFO:root:Tuned parameters: {'max_depth': 19, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 3}
INFO:root:Random Forest Feature Importance [0.         0.112022   0.23594367 0.28353048 0.26532355 0.10318031]


{'max_depth': 19, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 3}


In [14]:
punts_dummies = fourth_down[
    fourth_down["play_type"].isin(
        ["Punt", "Blocked Punt", "Punt Return Touchdown", "Blocked Punt Touchdown"]
    )
]


predictors = punts_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = punts_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
punts_train_data, punts_test_data, punts_train_sln, punts_test_sln = train_test_split(
    punts_dummies[predictors], target, test_size=0.2, random_state=0
)

punts_params = rf_regress_params_tuner(
    punts_train_data, punts_test_data, punts_train_sln, punts_test_sln, punts_dummies
)

INFO:root:Best r2 after tuning max depth is: 0.43809200058934317
INFO:root:Best r2 after tuning n_estimators is: 0.45328336851004947
INFO:root:Best r2 after tuning min samples split is: 0.45328336851004947
INFO:root:Best r2 after tuning min samples leaf is: 0.45328336851004947
INFO:root:Best r2 after tuning max features is: 0.458897749691072
INFO:root:Final Model Accuracy 0.458897749691072
INFO:root:Tuned parameters: {'max_depth': 25, 'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}
INFO:root:Random Forest Feature Importance [0.         0.12686394 0.18906139 0.30806678 0.22413285 0.15187505]


In [15]:
go_for_it_dummies = fourth_down[
    ~fourth_down["play_type"].isin(
        [
            "Field Goal Good",
            "Missed Field Goal Return Touchdown",
            "Missed Field Goal Return",
            "Blocked Field Goal",
            "Field Goal Missed",
            "Blocked Field Goal Touchdown",
            "Punt",
            "Blocked Punt",
            "Punt Return Touchdown",
            "Blocked Punt Touchdown",
            "Penalty",
            "Timeout",
            "Kickoff Return (Offense)",
            "Kickoff Return Touchdown",
        ]
    )
]


predictors = go_for_it_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = go_for_it_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
(
    go_for_it_train_data,
    go_for_it_test_data,
    go_for_it_train_sln,
    go_for_it_test_sln,
) = train_test_split(
    go_for_it_dummies[predictors], target, test_size=0.2, random_state=0
)

go_for_it_params = rf_regress_params_tuner(
    go_for_it_train_data,
    go_for_it_test_data,
    go_for_it_train_sln,
    go_for_it_test_sln,
    field_goals_dummies,
)

INFO:root:Best r2 after tuning max depth is: 0.3441703534050119
INFO:root:Best r2 after tuning n_estimators is: 0.3488796405486472
INFO:root:Best r2 after tuning min samples split is: 0.3488796405486472
INFO:root:Best r2 after tuning min samples leaf is: 0.3488796405486472
INFO:root:Best r2 after tuning max features is: 0.4019145734003988
INFO:root:Final Model Accuracy 0.4019145734003988
INFO:root:Tuned parameters: {'max_depth': 17, 'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}
INFO:root:Random Forest Feature Importance [0.         0.09193569 0.24418779 0.29077146 0.27616324 0.09694181]


In [16]:
from sklearn.ensemble import RandomForestRegressor


fg_forest = RandomForestRegressor(
    max_depth=field_goals_params["max_depth"],
    n_estimators=field_goals_params["n_estimators"],
    min_samples_split=field_goals_params["min_samples_split"],
    min_samples_leaf=field_goals_params["min_samples_leaf"],
    max_features=field_goals_params["max_features"],
    random_state=0,
)

fg_forest.fit(fg_train_data, fg_train_sln)
predictions = fg_forest.predict(prediction_set)

fourth_down["field_goal_xPa"] = predictions
fourth_down.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["field_goal_xPa"] = predictions


Unnamed: 0,play_type,down,distance,soccer_time,yard_line,score_differential,first_down_prob,xPa,field_goal_xPa
10,Punt,4,3,2,53,0,0.407783,0.0,1.62
33,Punt,4,6,9,28,7,0.341637,0.0,0.602773
64,Field Goal Missed,4,4,10,68,7,0.353952,0.0,0.349773
105,Punt,4,7,3,22,7,0.314798,0.0,1.585
116,Punt,4,5,12,42,7,0.355392,0.0,0.876023


In [17]:
punt_forest = RandomForestRegressor(
    max_depth=punts_params["max_depth"],
    n_estimators=punts_params["n_estimators"],
    min_samples_split=punts_params["min_samples_split"],
    min_samples_leaf=punts_params["min_samples_leaf"],
    max_features=punts_params["max_features"],
    random_state=0,
)

punt_forest.fit(punts_train_data, punts_train_sln)
predictions = punt_forest.predict(prediction_set)

fourth_down["punt_xPa"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["punt_xPa"] = predictions


In [18]:
gfi_forest = RandomForestRegressor(
    max_depth=go_for_it_params["max_depth"],
    n_estimators=go_for_it_params["n_estimators"],
    min_samples_split=go_for_it_params["min_samples_split"],
    min_samples_leaf=go_for_it_params["min_samples_leaf"],
    max_features=go_for_it_params["max_features"],
    random_state=0,
)

gfi_forest.fit(go_for_it_train_data, go_for_it_train_sln)
predictions = gfi_forest.predict(prediction_set)

fourth_down["go_for_it_xPa"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["go_for_it_xPa"] = predictions


In [19]:
fourth_down.head()

Unnamed: 0,play_type,down,distance,soccer_time,yard_line,score_differential,first_down_prob,xPa,field_goal_xPa,punt_xPa,go_for_it_xPa
10,Punt,4,3,2,53,0,0.407783,0.0,1.62,0.0,-0.06643
33,Punt,4,6,9,28,7,0.341637,0.0,0.602773,0.0,0.289545
64,Field Goal Missed,4,4,10,68,7,0.353952,0.0,0.349773,0.0,-0.437062
105,Punt,4,7,3,22,7,0.314798,0.0,1.585,0.0,-0.065
116,Punt,4,5,12,42,7,0.355392,0.0,0.876023,0.0,-0.090682


In [None]:
conditions = [
    (fourth_down["field_goal_xPa"] > fourth_down["punt_xPa"])
    & (fourth_down["field_goal_xPa"] > fourth_down["go_for_it_xPa"]),
    (fourth_down["punt_xPa"] > fourth_down["field_goal_xPa"])
    & (fourth_down["punt_xPa"] > fourth_down["go_for_it_xPa"]),
    (fourth_down["go_for_it_xPa"] > fourth_down["field_goal_xPa"])
    & (fourth_down["go_for_it_xPa"] > fourth_down["punt_xPa"]),
]

values = ["Attempt FG", "Punt", "Go For It"]

fourth_down["suggested_decision"] = np.select(conditions, values)

fourth_down.to_excel("fourth_down_decision_making.xlsx", index=False)