### Description:
This file shows the results for not using categorical intervals for time and field position.

In [5]:
import pandas as pd
import numpy as np

dataframe = pd.read_csv("big_ten_pbp.csv")

dataframe["soccer_time"] = dataframe["period"] * (
    16 - (dataframe["minutes"] + (dataframe["seconds"] / 60)).round(0)
).astype("int32")

dataframe["score_differential"] = (
    dataframe["offense_score"] - dataframe["defense_score"]
)
big_ten = dataframe[dataframe["offense_conference"] == "Big Ten"]

In [6]:
conditions = [
    (big_ten["play_type"] == "Field Goal Missed"),
    (big_ten["play_type"] == "Field Goal Good"),
    (big_ten["play_type"] == "Passing Touchdown")
    | (big_ten["play_type"] == "Rusing Touchdown"),
    (big_ten["play_type"] == "Interception Return Touchdown")
    | (big_ten["play_type"] == "Fumble Return Touchdown")
    | (big_ten["play_type"] == "Punt Return Touchdown")
    | (big_ten["play_type"] == "Blocked Punt Touchdown")
    | (big_ten["play_type"] == "Missed Field Goal Return Touchdown")
    | (big_ten["play_type"] == "Blocked Field Goal Touchdown"),
    (big_ten["play_type"] == "Safety"),
]

values = [0, 3, 7, -7, 2]

big_ten["points_scored"] = np.select(conditions, values)

big_ten = big_ten[(big_ten["down"] > 0)]
big_ten = big_ten[(big_ten["distance"] > 0)]


big_ten["points_scored"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_ten["points_scored"] = np.select(conditions, values)


array([ 0,  7,  3, -7,  2])

In [7]:
grouped = (
    big_ten.groupby(
        ["down", "distance", "soccer_time", "yard_line", "score_differential"]
    )["points_scored"]
    .agg(["sum", "count", "mean", "median"])
    .reset_index()
)
grouped["mean"].unique()

array([ 0.        ,  7.        ,  3.        ,  3.5       ,  4.66666667,
        2.33333333, -1.4       ,  1.4       , -7.        ,  2.8       ,
        2.        ,  0.875     ,  1.75      , -3.5       ,  1.16666667,
        1.55555556, -1.27272727,  1.27272727,  0.75      ,  1.        ,
        1.5       ,  5.25      ])

In [8]:
# this cell is placing xP values in the right rows and then subtracting the points scored from the xp to get the xP added for the play.
# I wanted to see the average difference between my xPa model compared to the College football data's version of it
intervals = []
conditions = []

i = 0
for row in grouped.to_dict("records"):
    if i % 10000 == 0:
        print(f"{i} completed")
    intervals.append(row["mean"])
    conditions.append(
        (big_ten["down"] == row["down"])
        & (big_ten["distance"] == row["distance"])
        & (big_ten["soccer_time"] == row["soccer_time"])
        & (big_ten["yard_line"] == row["yard_line"])
        & (big_ten["score_differential"] == row["score_differential"])
    )
    i += 1

big_ten["xP"] = np.select(conditions, intervals)
big_ten["xPa"] = big_ten["points_scored"] - big_ten["xP"]

epa_diff = big_ten["xPa"] - big_ten["ppa"]
epa_diff.mean()

0 completed
10000 completed
20000 completed
30000 completed
40000 completed
50000 completed
60000 completed


: 

: 

In [None]:
# This cell is pulling and calculating the probability a first down is acheived by down and distance
# This will be used in the model making process to gauge how probable it is to make a first down

big_ten["got_first_down"] = big_ten["yards_gained"] >= big_ten["distance"]

first_down_prob = (
    big_ten.groupby(["down", "distance"])["got_first_down"]
    .agg(["sum", "count"])
    .reset_index()
)
first_down_prob["first_down_prob"] = first_down_prob["sum"] / first_down_prob["count"]

intervals = []
conditions = []

for i in range(len(first_down_prob)):
    intervals.append(first_down_prob["first_down_prob"][i])
    conditions.append(
        (big_ten["down"] == first_down_prob["down"][i])
        & (big_ten["distance"] == first_down_prob["distance"][i])
    )

big_ten["first_down_prob"] = np.select(conditions, intervals)

In [None]:
# Now let's create the models for the usual 4th down decision making. I will make 3 different models (Field Goals, Go For It, and Punting)

subset = big_ten[
    [
        "play_type",
        "down",
        "distance",
        "soccer_time",
        "yard_line",
        "score_differential",
        "first_down_prob",
        "xPa",
    ]
]
subset = subset[~subset["play_type"].isin(["Kickoff", "Uncategorized"])]

fourth_down = subset[subset["down"] == 4]
predictiors = fourth_down.columns.drop(["xPa", "play_type"])
prediction_set = fourth_down[predictiors]

The models will be split up by the 3 decisions that could be made. One model for Field Goals, one for punting, and one for going for it.
I will then use each model to predict the xPa value for each 4th down situation for each deicison.

In [None]:
from sklearn.model_selection import train_test_split
from utils.utils import rf_regress_params_tuner


field_goals_dummies = fourth_down[
    fourth_down["play_type"].isin(
        [
            "Field Goal Good",
            "Missed Field Goal Return Touchdown",
            "Missed Field Goal Return",
            "Blocked Field Goal",
            "Field Goal Missed",
            "Blocked Field Goal Touchdown",
        ]
    )
]

predictors = field_goals_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = field_goals_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
fg_train_data, fg_test_data, fg_train_sln, fg_test_sln = train_test_split(
    field_goals_dummies[predictors], target, test_size=0.2, random_state=0
)

field_goals_params = rf_regress_params_tuner(
    fg_train_data, fg_test_data, fg_train_sln, fg_test_sln, field_goals_dummies
)
print(field_goals_params)

INFO:root:Best r2 after tuning max depth is: 0.5332176374510551
INFO:root:Best r2 after tuning n_estimators is: 0.5368776261798488
INFO:root:Best r2 after tuning min samples split is: 0.5368776261798488
INFO:root:Best r2 after tuning min samples leaf is: 0.5368776261798488
INFO:root:Best r2 after tuning max features is: 0.5694320688930051
INFO:root:Final Model Accuracy 0.5694320688930051
INFO:root:Tuned parameters: {'max_depth': 21, 'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}
INFO:root:Random Forest Feature Importance [0.         0.0908242  0.21889899 0.3840234  0.20340367 0.10284974]


{'max_depth': 21, 'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}


In [None]:
punts_dummies = fourth_down[
    fourth_down["play_type"].isin(
        ["Punt", "Blocked Punt", "Punt Return Touchdown", "Blocked Punt Touchdown"]
    )
]


predictors = punts_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = punts_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
punts_train_data, punts_test_data, punts_train_sln, punts_test_sln = train_test_split(
    punts_dummies[predictors], target, test_size=0.2, random_state=0
)

punts_params = rf_regress_params_tuner(
    punts_train_data, punts_test_data, punts_train_sln, punts_test_sln, punts_dummies
)

INFO:root:Best r2 after tuning max depth is: 0.5941965164797006
INFO:root:Best r2 after tuning n_estimators is: 0.5976912009036544
INFO:root:Best r2 after tuning min samples split is: 0.5976912009036544
INFO:root:Best r2 after tuning min samples leaf is: 0.5976912009036544
INFO:root:Best r2 after tuning max features is: 0.6179211505967168
INFO:root:Final Model Accuracy 0.6179211505967168
INFO:root:Tuned parameters: {'max_depth': 23, 'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}
INFO:root:Random Forest Feature Importance [0.         0.10791966 0.23842779 0.32399791 0.23678583 0.0928688 ]


In [None]:
go_for_it_dummies = fourth_down[
    ~fourth_down["play_type"].isin(
        [
            "Field Goal Good",
            "Missed Field Goal Return Touchdown",
            "Missed Field Goal Return",
            "Blocked Field Goal",
            "Field Goal Missed",
            "Blocked Field Goal Touchdown",
            "Punt",
            "Blocked Punt",
            "Punt Return Touchdown",
            "Blocked Punt Touchdown",
            "Penalty",
            "Timeout",
            "Kickoff Return (Offense)",
            "Kickoff Return Touchdown",
        ]
    )
]


predictors = go_for_it_dummies.columns.drop(
    ["xPa", "play_type"]
)  # predictor variables used: all variables besides the target variable
target = go_for_it_dummies["xPa"].values  # target variable


# splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
(
    go_for_it_train_data,
    go_for_it_test_data,
    go_for_it_train_sln,
    go_for_it_test_sln,
) = train_test_split(
    go_for_it_dummies[predictors], target, test_size=0.2, random_state=0
)

go_for_it_params = rf_regress_params_tuner(
    go_for_it_train_data,
    go_for_it_test_data,
    go_for_it_train_sln,
    go_for_it_test_sln,
    field_goals_dummies,
)

INFO:root:Best r2 after tuning max depth is: 0.3619353969636485
INFO:root:Best r2 after tuning n_estimators is: 0.4501139148598493
INFO:root:Best r2 after tuning min samples split is: 0.4501139148598493
INFO:root:Best r2 after tuning min samples leaf is: 0.4501139148598493
INFO:root:Best r2 after tuning max features is: 0.6286659983253435
INFO:root:Final Model Accuracy 0.6286659983253435
INFO:root:Tuned parameters: {'max_depth': 17, 'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1}
INFO:root:Random Forest Feature Importance [0.         0.08451227 0.23533226 0.29779051 0.30836604 0.07399893]


In [None]:
from sklearn.ensemble import RandomForestRegressor


fg_forest = RandomForestRegressor(
    max_depth=field_goals_params["max_depth"],
    n_estimators=field_goals_params["n_estimators"],
    min_samples_split=field_goals_params["min_samples_split"],
    min_samples_leaf=field_goals_params["min_samples_leaf"],
    max_features=field_goals_params["max_features"],
    random_state=0,
)

fg_forest.fit(fg_train_data, fg_train_sln)
predictions = fg_forest.predict(prediction_set)

fourth_down["field_goal_xPa"] = predictions
fourth_down.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["field_goal_xPa"] = predictions


Unnamed: 0,play_type,down,distance,soccer_time,yard_line,score_differential,first_down_prob,xPa,field_goal_xPa
10,Punt,4,3,2,53,0,0.406198,0.0,0.34
33,Punt,4,6,9,28,7,0.342199,0.0,0.0
64,Field Goal Missed,4,4,10,68,7,0.353091,0.0,0.331058
105,Punt,4,7,3,22,7,0.314996,0.0,0.04
116,Punt,4,5,12,42,7,0.355285,0.0,0.16


In [None]:
punt_forest = RandomForestRegressor(
    max_depth=punts_params["max_depth"],
    n_estimators=punts_params["n_estimators"],
    min_samples_split=punts_params["min_samples_split"],
    min_samples_leaf=punts_params["min_samples_leaf"],
    max_features=punts_params["max_features"],
    random_state=0,
)

punt_forest.fit(punts_train_data, punts_train_sln)
predictions = punt_forest.predict(prediction_set)

fourth_down["punt_xPa"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["punt_xPa"] = predictions


In [None]:
gfi_forest = RandomForestRegressor(
    max_depth=go_for_it_params["max_depth"],
    n_estimators=go_for_it_params["n_estimators"],
    min_samples_split=go_for_it_params["min_samples_split"],
    min_samples_leaf=go_for_it_params["min_samples_leaf"],
    max_features=go_for_it_params["max_features"],
    random_state=0,
)

gfi_forest.fit(go_for_it_train_data, go_for_it_train_sln)
predictions = gfi_forest.predict(prediction_set)

fourth_down["go_for_it_xPa"] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_down["go_for_it_xPa"] = predictions


In [None]:
fourth_down.head()

Unnamed: 0,play_type,down,distance,soccer_time,yard_line,score_differential,first_down_prob,xPa,field_goal_xPa,punt_xPa,go_for_it_xPa
10,Punt,4,3,2,53,0,0.406198,0.0,0.34,0.0,0.0
33,Punt,4,6,9,28,7,0.342199,0.0,0.0,-0.03,-0.146667
64,Field Goal Missed,4,4,10,68,7,0.353091,0.0,0.331058,-0.304286,-0.231852
105,Punt,4,7,3,22,7,0.314996,0.0,0.04,-0.021429,0.136167
116,Punt,4,5,12,42,7,0.355285,0.0,0.16,-0.329549,-0.646851


In [None]:
conditions = [
    (fourth_down["field_goal_xPa"] > fourth_down["punt_xPa"])
    & (fourth_down["field_goal_xPa"] > fourth_down["go_for_it_xPa"]),
    (fourth_down["punt_xPa"] > fourth_down["field_goal_xPa"])
    & (fourth_down["punt_xPa"] > fourth_down["go_for_it_xPa"]),
    (fourth_down["go_for_it_xPa"] > fourth_down["field_goal_xPa"])
    & (fourth_down["go_for_it_xPa"] > fourth_down["punt_xPa"]),
]

values = ["Attempt FG", "Punt", "Go For It"]

fourth_down["suggested_decision"] = np.select(conditions, values)

fourth_down.to_csv("fourth_down_no_categories_results.csv", index=False)

: 

: 