In [15]:
import pandas as pd
import numpy as np

dataframe = pd.read_csv("big_ten_pbp.csv")

dataframe["soccer_time"] = dataframe["period"] * (
    16 - (dataframe["minutes"] + (dataframe["seconds"] / 60)).round(0)
).astype("int32")

soccer_time = range(1, 64, 3)
intervals = []
conditions = []

for i in soccer_time:
    if i > 60:
        intervals.append(f"{i}")
        conditions.append((dataframe["soccer_time"] == i))
    if i == 58:
        intervals.append(f"{i}-{i+2}")
        conditions.append(
            (dataframe["soccer_time"] >= i) & (dataframe["soccer_time"] <= (i + 2))
        )
    else:
        intervals.append(f"{i}-{i+3}")
        conditions.append(
            (dataframe["soccer_time"] >= i) & (dataframe["soccer_time"] <= (i + 3))
        )


dataframe["time_intervals"] = np.select(conditions, intervals)
dataframe["score_differential"] = (
    dataframe["offense_score"] - dataframe["defense_score"]
)
big_ten = dataframe[dataframe["offense_conference"] == "Big Ten"]

In [16]:
field_pos_range = range(1, 100, 5)

intervals = []
conditions = []

for i in field_pos_range:
    intervals.append(f"{i}-{i+4}")
    conditions.append((big_ten["yard_line"] >= i) & (big_ten["yard_line"] <= (i + 4)))

big_ten["field_position_intervals"] = np.select(conditions, intervals)
big_ten["field_position_intervals"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_ten["field_position_intervals"] = np.select(conditions, intervals)


array(['31-35', '46-50', '51-55', '36-40', '21-25', '26-30', '41-45',
       '61-65', '66-70', '71-75', '81-85', '86-90', '91-95', '96-100',
       '16-20', '11-15', '0', '6-10', '56-60', '76-80', '1-5'],
      dtype=object)

In [17]:
conditions = [
    (big_ten["play_type"] == "Field Goal Missed"),
    (big_ten["play_type"] == "Field Goal Good"),
    (big_ten["play_type"] == "Passing Touchdown")
    | (big_ten["play_type"] == "Rusing Touchdown"),
    (big_ten["play_type"] == "Interception Return Touchdown")
    | (big_ten["play_type"] == "Fumble Return Touchdown")
    | (big_ten["play_type"] == "Punt Return Touchdown")
    | (big_ten["play_type"] == "Blocked Punt Touchdown")
    | (big_ten["play_type"] == "Missed Field Goal Return Touchdown")
    | (big_ten["play_type"] == "Blocked Field Goal Touchdown"),
    (big_ten["play_type"] == "Safety"),
]

values = [0, 3, 7, -7, 2]

big_ten["points_scored"] = np.select(conditions, values)

big_ten = big_ten[(big_ten["down"] > 0)]
big_ten = big_ten[(big_ten["distance"] > 0)]
big_ten = big_ten[(big_ten["time_intervals"] != "0")]
big_ten = big_ten[(big_ten["field_position_intervals"] != "0")]
print(big_ten["field_position_intervals"].unique())

big_ten[["time_lower", "time_upper"]] = big_ten.time_intervals.str.split(
    "-", expand=True
).astype("int")
big_ten[["field_position_lower", "field_position_upper"]] = (
    big_ten.field_position_intervals.str.split("-", expand=True).astype("int")
)

big_ten["points_scored"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_ten["points_scored"] = np.select(conditions, values)


['31-35' '46-50' '51-55' '36-40' '21-25' '26-30' '41-45' '61-65' '66-70'
 '71-75' '81-85' '86-90' '91-95' '96-100' '16-20' '11-15' '6-10' '56-60'
 '76-80' '1-5']


array([ 0,  7,  3, -7,  2])

In [18]:
grouped = (
    big_ten.groupby(
        [
            "down",
            "distance",
            "time_intervals",
            "field_position_intervals",
            "score_differential",
        ]
    )["points_scored"]
    .agg(["sum", "count", "mean", "median"])
    .reset_index()
)
grouped["mean"].count()

49758

In [19]:
# this cell is placing xP values in the right rows and then subtracting the points scored from the xp to get the xP added for the play.
# I wanted to see the average difference between my xPa model compared to the College football data's version of it
intervals = []
conditions = []
i = 0
for row in grouped.to_dict("records"):
    if i % 10000 == 0:
        print(f"{i} completed")
    intervals.append(row["mean"])
    conditions.append(
        (big_ten["down"] == row["down"])
        & (big_ten["distance"] == row["distance"])
        & (big_ten["time_intervals"] == row["time_intervals"])
        & (big_ten["field_position_intervals"] == row["field_position_intervals"])
        & (big_ten["score_differential"] == row["score_differential"])
    )
    i += 1

big_ten["xP"] = np.select(conditions, intervals)
big_ten["xPa"] = big_ten["points_scored"] - big_ten["xP"]

epa_diff = big_ten["xPa"] - big_ten["ppa"]
epa_diff.mean()

0 completed
10000 completed
20000 completed
30000 completed
40000 completed


-0.1566256556736183

In [20]:
# This cell is pulling and calculating the probability a first down is acheived by down and distance
# This will be used in the model making process to gauge how probable it is to make a first down

big_ten["got_first_down"] = big_ten["yards_gained"] >= big_ten["distance"]

first_down_prob = (
    big_ten.groupby(["down", "distance"])["got_first_down"]
    .agg(["sum", "count"])
    .reset_index()
)
first_down_prob["first_down_prob"] = first_down_prob["sum"] / first_down_prob["count"]

intervals = []
conditions = []

for i in range(len(first_down_prob)):
    intervals.append(first_down_prob["first_down_prob"][i])
    conditions.append(
        (big_ten["down"] == first_down_prob["down"][i])
        & (big_ten["distance"] == first_down_prob["distance"][i])
    )

big_ten["first_down_prob"] = np.select(conditions, intervals)

In [21]:
# Now let's create the models for the usual 4th down decision making. I will make 3 different models (Field Goals, Go For It, and Punting)

subset = big_ten[
    [
        "play_type",
        "down",
        "distance",
        "time_intervals",
        "field_position_intervals",
        "score_differential",
        "first_down_prob",
        "xPa",
    ]
]
subset = subset[~subset["play_type"].isin(["Kickoff", "Uncategorized"])]

fourth_down = subset[subset["down"] == 4]
fourth_down_dummies = pd.get_dummies(
    fourth_down, columns=["time_intervals", "field_position_intervals"]
)
predictiors = fourth_down_dummies.columns.drop(["xPa", "play_type"])
prediction_set = fourth_down_dummies[predictiors]

The models will be split up by the 3 decisions that could be made. One model for Field Goals, one for punting, and one for going for it.
I will then use each model to predict the xPa value for each 4th down situation for each deicison.

In [22]:
from sklearn.model_selection import train_test_split
from utils import rf_regress_params_tuner


field_goals_dummies = fourth_down_dummies[
    fourth_down_dummies["play_type"].isin(
        [
            "Field Goal Good",
            "Missed Field Goal Return Touchdown",
            "Missed Field Goal Return",
            "Blocked Field Goal",
            "Field Goal Missed",
            "Blocked Field Goal Touchdown",
        ]
    )
]

predictors = field_goals_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = field_goals_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
fg_train_data, fg_test_data, fg_train_sln, fg_test_sln = train_test_split(
    field_goals_dummies[predictors], target, test_size=0.2, random_state=0
)

field_goals_params = rf_regress_params_tuner(
    fg_train_data, fg_test_data, fg_train_sln, fg_test_sln, field_goals_dummies
)
print(field_goals_params)

INFO:root:Best r2 after tuning max depth is: 0.4691997349066829
INFO:root:Best r2 after tuning n_estimators is: 0.4803799724824406
INFO:root:Best r2 after tuning min samples split is: 0.4803799724824406
INFO:root:Best r2 after tuning min samples leaf is: 0.4803799724824406
INFO:root:Best r2 after tuning max features is: 0.7021652995984351
INFO:root:Final Model Accuracy 0.7021652995984351
INFO:root:Tuned parameters: {'max_depth': 23, 'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 5}
INFO:root:Random Forest Feature Importance [0.00000000e+00 1.57967035e-01 1.87195214e-01 1.95967986e-01
 4.24533117e-02 2.44579213e-02 3.82462204e-02 5.75786875e-02
 1.93302861e-04 0.00000000e+00 6.67162143e-03 0.00000000e+00
 8.75361488e-04 2.04806125e-02 4.64403039e-04 2.95747434e-02
 0.00000000e+00 1.62375983e-04 5.75131413e-03 0.00000000e+00
 3.62398716e-03 0.00000000e+00 0.00000000e+00 3.95917476e-02
 3.94411722e-04 2.59749050e-02 1.36438572e-02 1.59103996e-02
 1.905

{'max_depth': 23, 'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 5}


In [23]:
punts_dummies = fourth_down_dummies[
    fourth_down_dummies["play_type"].isin(
        ["Punt", "Blocked Punt", "Punt Return Touchdown", "Blocked Punt Touchdown"]
    )
]


predictors = punts_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = punts_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
punts_train_data, punts_test_data, punts_train_sln, punts_test_sln = train_test_split(
    punts_dummies[predictors], target, test_size=0.2, random_state=0
)

punts_params = rf_regress_params_tuner(
    punts_train_data, punts_test_data, punts_train_sln, punts_test_sln, punts_dummies
)

INFO:root:Best r2 after tuning max depth is: 0.0027442533117317813
INFO:root:Best r2 after tuning n_estimators is: 0.0027442533117317813
INFO:root:Best r2 after tuning min samples split is: 0.010531376336296083
INFO:root:Best r2 after tuning min samples leaf is: 0.0151613940588875
INFO:root:Best r2 after tuning max features is: 0.025811912808242354
INFO:root:Final Model Accuracy 0.025811912808242354
INFO:root:Tuned parameters: {'max_depth': 9, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 13}
INFO:root:Random Forest Feature Importance [0.         0.18501133 0.41108216 0.11105338 0.01470847 0.06370977
 0.0449716  0.05909482 0.         0.         0.         0.
 0.         0.04613651 0.         0.01215118 0.         0.
 0.         0.         0.         0.         0.         0.01841929
 0.         0.         0.00957911 0.         0.01801586 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.    

In [29]:
go_for_it_dummies = fourth_down_dummies[
    ~fourth_down_dummies["play_type"].isin(
        [
            "Field Goal Good",
            "Missed Field Goal Return Touchdown",
            "Missed Field Goal Return",
            "Blocked Field Goal",
            "Field Goal Missed",
            "Blocked Field Goal Touchdown",
            "Punt",
            "Blocked Punt",
            "Punt Return Touchdown",
            "Blocked Punt Touchdown",
            "Penalty",
            "Timeout",
            "Kickoff Return (Offense)",
            "Kickoff Return Touchdown",
        ]
    )
]


predictors = go_for_it_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = go_for_it_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
(
    go_for_it_train_data,
    go_for_it_test_data,
    go_for_it_train_sln,
    go_for_it_test_sln,
) = train_test_split(
    go_for_it_dummies[predictors], target, test_size=0.2, random_state=0
)

go_for_it_params = rf_regress_params_tuner(
    go_for_it_train_data,
    go_for_it_test_data,
    go_for_it_train_sln,
    go_for_it_test_sln,
    field_goals_dummies,
)

INFO:root:Best r2 after tuning max depth is: -0.01210399989238753
INFO:root:Best r2 after tuning n_estimators is: -0.010968486474240091
INFO:root:Best r2 after tuning min samples split is: -0.010968486474240091
INFO:root:Best r2 after tuning min samples leaf is: -0.010968486474240091
INFO:root:Best r2 after tuning max features is: -0.00014295168461808672
INFO:root:Final Model Accuracy -0.00014295168461808672
INFO:root:Tuned parameters: {'max_depth': 1, 'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}
INFO:root:Random Forest Feature Importance [0.    0.    0.01  0.    0.    0.    0.    0.    0.12  0.09  0.    0.
 0.    0.    0.    0.    0.    0.115 0.    0.    0.    0.    0.    0.
 0.02  0.07  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.22
 0.    0.    0.    0.015 0.01  0.    0.    0.33 ]


In [25]:
from sklearn.ensemble import RandomForestRegressor


fg_forest = RandomForestRegressor(
    max_depth=field_goals_params["max_depth"],
    n_estimators=field_goals_params["n_estimators"],
    min_samples_split=field_goals_params["min_samples_split"],
    min_samples_leaf=field_goals_params["min_samples_leaf"],
    max_features=field_goals_params["max_features"],
    random_state=0,
)

fg_forest.fit(fg_train_data, fg_train_sln)
predictions = fg_forest.predict(prediction_set)

fourth_down["field_goal_xPa"] = predictions
fourth_down.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["field_goal_xPa"] = predictions


Unnamed: 0,play_type,down,distance,time_intervals,field_position_intervals,score_differential,first_down_prob,xPa,field_goal_xPa
10,Punt,4,3,1-4,51-55,0,0.407906,0.0,0.329731
33,Punt,4,6,7-10,26-30,7,0.342806,0.0,0.0
64,Field Goal Missed,4,4,7-10,66-70,7,0.35269,0.0,0.000364
105,Punt,4,7,1-4,21-25,7,0.315556,0.0,0.000149
116,Punt,4,5,10-13,41-45,7,0.354812,0.0,0.001616


In [26]:
punt_forest = RandomForestRegressor(
    max_depth=punts_params["max_depth"],
    n_estimators=punts_params["n_estimators"],
    min_samples_split=punts_params["min_samples_split"],
    min_samples_leaf=punts_params["min_samples_leaf"],
    max_features=punts_params["max_features"],
    random_state=0,
)

punt_forest.fit(punts_train_data, punts_train_sln)
predictions = punt_forest.predict(prediction_set)

fourth_down["punt_xPa"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["punt_xPa"] = predictions


In [27]:
gfi_forest = RandomForestRegressor(
    max_depth=go_for_it_params["max_depth"],
    n_estimators=go_for_it_params["n_estimators"],
    min_samples_split=go_for_it_params["min_samples_split"],
    min_samples_leaf=go_for_it_params["min_samples_leaf"],
    max_features=go_for_it_params["max_features"],
    random_state=0,
)

gfi_forest.fit(go_for_it_train_data, go_for_it_train_sln)
predictions = gfi_forest.predict(prediction_set)

fourth_down["go_for_it_xPa"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["go_for_it_xPa"] = predictions


In [28]:
fourth_down.head()

Unnamed: 0,play_type,down,distance,time_intervals,field_position_intervals,score_differential,first_down_prob,xPa,field_goal_xPa,punt_xPa,go_for_it_xPa
10,Punt,4,3,1-4,51-55,0,0.407906,0.0,0.329731,0.0,-0.001656
33,Punt,4,6,7-10,26-30,7,0.342806,0.0,0.0,-0.000524,-0.001558
64,Field Goal Missed,4,4,7-10,66-70,7,0.35269,0.0,0.000364,-0.000169,-0.00149
105,Punt,4,7,1-4,21-25,7,0.315556,0.0,0.000149,-0.0016,-0.001629
116,Punt,4,5,10-13,41-45,7,0.354812,0.0,0.001616,0.0,-0.00246


In [None]:
conditions = [
    (fourth_down["field_goal_xPa"] > fourth_down["punt_xPa"])
    & (fourth_down["field_goal_xPa"] > fourth_down["go_for_it_xPa"]),
    (fourth_down["punt_xPa"] > fourth_down["field_goal_xPa"])
    & (fourth_down["punt_xPa"] > fourth_down["go_for_it_xPa"]),
    (fourth_down["go_for_it_xPa"] > fourth_down["field_goal_xPa"])
    & (fourth_down["go_for_it_xPa"] > fourth_down["punt_xPa"]),
]

values = ["Attempt FG", "Punt", "Go For It"]

fourth_down["suggested_decision"] = np.select(conditions, values)

fourth_down.to_excel("fourth_down_decision_making.xlsx", index=False)

: 

: 