In [1]:
import pandas as pd
import numpy as np

dataframe = pd.read_csv('big_ten_pbp.csv')

dataframe["soccer_time"] = dataframe['period'] * ( 16 - (dataframe['minutes'] + (dataframe['seconds']/60)).round(0)).astype("int32")

soccer_time = range(1,64, 3)
intervals = []
conditions = []

for i in soccer_time:
    if i > 60:
        intervals.append(f"{i}")
        conditions.append( (dataframe["soccer_time"] == i) )
    if i == 58:
        intervals.append(f"{i}-{i+2}")
        conditions.append( (dataframe["soccer_time"] >= i) & (dataframe["soccer_time"] <= (i+2)))
    else:
        intervals.append(f"{i}-{i+3}")
        conditions.append( (dataframe["soccer_time"] >= i) & (dataframe["soccer_time"] <= (i+3)))



dataframe["time_intervals"] = np.select(conditions, intervals)
dataframe["score_differential"] = dataframe["offense_score"] - dataframe["defense_score"]
big_ten = dataframe[dataframe["offense_conference"] == "Big Ten"]

In [2]:
big_ten['offense'].unique()

array(['Minnesota', 'Maryland', 'Purdue', 'Michigan State', 'Illinois',
       'Iowa', 'Michigan', 'Nebraska', 'Northwestern', 'Wisconsin',
       'Ohio State', 'Indiana', 'Rutgers', 'Penn State'], dtype=object)

In [3]:
field_pos_range = range(1,100,5)

intervals = []
conditions = []

for i in field_pos_range:
    intervals.append(f"{i}-{i+4}")
    conditions.append( (big_ten["yard_line"] >= i) & (big_ten["yard_line"] <= (i+4)))

big_ten["field_position_intervals"] = np.select(conditions, intervals)
big_ten["field_position_intervals"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_ten["field_position_intervals"] = np.select(conditions, intervals)


array(['31-35', '46-50', '51-55', '36-40', '21-25', '26-30', '41-45',
       '61-65', '66-70', '71-75', '81-85', '86-90', '91-95', '96-100',
       '16-20', '11-15', '0', '6-10', '56-60', '76-80', '1-5'],
      dtype=object)

In [4]:
conditions = [
    (big_ten["play_type"] == "Field Goal Missed"),
    (big_ten["play_type"] == "Field Goal Good"),
    (big_ten["play_type"] == "Passing Touchdown") | (big_ten["play_type"] == "Rusing Touchdown"),
    (big_ten["play_type"] == "Interception Return Touchdown") 
    | (big_ten["play_type"] == "Fumble Return Touchdown") 
    | (big_ten["play_type"] == "Punt Return Touchdown") 
    | (big_ten["play_type"] == "Blocked Punt Touchdown")
    | (big_ten["play_type"] == "Missed Field Goal Return Touchdown")
    | (big_ten["play_type"] == "Blocked Field Goal Touchdown"),
    (big_ten["play_type"] == "Safety"),
]

values = [0, 3, 7, -7, 2]

big_ten["points_scored"] = np.select(conditions, values)

big_ten = big_ten[(big_ten["down"] > 0) ]
big_ten = big_ten[(big_ten["distance"] > 0)]
big_ten = big_ten[(big_ten["time_intervals"] != '0')]

big_ten[['time_lower', 'time_upper']] = big_ten.time_intervals.str.split('-', expand=True).astype('int')
big_ten[['field_position_lower', 'field_position_upper']] = big_ten.time_intervals.str.split('-', expand=True).astype('int')

big_ten["points_scored"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_ten["points_scored"] = np.select(conditions, values)


array([ 0,  7,  3, -7,  2])

In [5]:
grouped = big_ten.groupby(["down", "distance", "soccer_time", "yard_line", "score_differential"])["points_scored"].agg(["sum","count","mean","median"]).reset_index()
#grouped[['time_lower', 'time_upper']] = grouped.time_intervals.str.split('-', expand=True).astype('int')
#grouped[['field_position_lower', 'field_position_upper']] = grouped.time_intervals.str.split('-', expand=True).astype('int')
grouped['mean'].unique()

array([ 0.        ,  7.        ,  3.        ,  3.5       ,  4.66666667,
        2.33333333, -1.4       ,  1.4       , -7.        ,  2.8       ,
        2.        ,  0.875     ,  1.75      , -3.5       ,  1.16666667,
        1.55555556, -1.27272727,  1.27272727,  0.75      ,  1.        ,
        1.5       ,  5.25      ])

In [6]:
# this cell is placing xP values in the right rows and then subtracting the points scored from the xp to get the xP added for the play.
# I wanted to see the average difference between my xPa model compared to the College football data's version of it
intervals = []
conditions = []

for i in range(len(grouped)):
    intervals.append(grouped["mean"][i])
    conditions.append( (big_ten["down"] == grouped["down"][i]) 
                        & (big_ten["distance"] == grouped['distance'][i]) 
                        & (big_ten['soccer_time'] == grouped['soccer_time'][i])
                        & (big_ten['yard_line'] == grouped['yard_line'][i])
                        #& (big_ten['time_upper'] <= grouped['time_upper'][i])
                        #& (big_ten['field_position_lower'] >= grouped['field_position_lower'][i])
                        #& (big_ten['field_position_upper'] <= grouped['field_position_upper'][i])
                        & (big_ten['score_differential'] == grouped['score_differential'][i]) )

big_ten["xP"] = np.select(conditions, intervals)
big_ten['xPa'] = big_ten["points_scored"] - big_ten['xP']

epa_diff = big_ten['xPa'] - big_ten['ppa']
epa_diff.mean()

In [None]:
# This cell is pulling and calculating the probability a first down is acheived by down and distance
# This will be used in the model making process to gauge how probable it is to make a first down

big_ten['got_first_down'] = big_ten['yards_gained'] >= big_ten["distance"]

first_down_prob = big_ten.groupby(['down', 'distance'])['got_first_down'].agg(['sum', 'count']).reset_index()
first_down_prob['first_down_prob'] = first_down_prob['sum']/first_down_prob['count']

intervals = []
conditions = []

for i in range(len(first_down_prob)):
    intervals.append(first_down_prob['first_down_prob'][i])
    conditions.append( (big_ten["down"] == first_down_prob["down"][i]) & (big_ten["distance"] == first_down_prob['distance'][i]))

big_ten["first_down_prob"] = np.select(conditions, intervals)


In [None]:
# Now let's create the models for the usual 4th down decision making. I will make 3 different models (Field Goals, Go For It, and Punting)

subset = big_ten[['play_type', 'down', 'distance', 'soccer_time', 'yard_line', 'first_down_prob', 'xPa']]
                    #'time_intervals', 'field_position_intervals', 'score_differential', 'first_down_prob', 'xPa']]
subset = subset[~subset['play_type'].isin(['Kickoff', 'Uncategorized'])]

fourth_down_dummies = subset[subset['down'] == 4]
#fourth_down_dummies = pd.get_dummies(fourth_down, columns=['time_intervals', "field_position_intervals"])
predictiors = fourth_down_dummies.columns.drop(['xPa', 'play_type'])
prediction_set = fourth_down_dummies[predictiors]

The models will be split up by the 3 decisions that could be made. One model for Field Goals, one for punting, and one for going for it.
I will then use each model to predict the xPa value for each 4th down situation for each deicison.

In [None]:
from sklearn.model_selection import train_test_split
from utils.utils import rf_regress_params_tuner


field_goals_dummies = fourth_down_dummies[fourth_down_dummies["play_type"].isin(['Field Goal Good', 'Missed Field Goal Return Touchdown', 'Missed Field Goal Return', 'Blocked Field Goal', 'Field Goal Missed', 'Blocked Field Goal Touchdown'])]

predictors = field_goals_dummies.columns.drop(['xPa', 'play_type']) #predictor variables used: all variables besides the target variable
target = field_goals_dummies['xPa'].values #target variable


#splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
fg_train_data, fg_test_data, fg_train_sln, fg_test_sln = train_test_split(field_goals_dummies[predictors], target, test_size = 0.2, random_state=0)

field_goals_params = rf_regress_params_tuner(fg_train_data, fg_test_data, fg_train_sln, fg_test_sln, field_goals_dummies)
print(field_goals_params)

ModuleNotFoundError: No module named 'utils'

In [None]:
punts_dummies = fourth_down_dummies[fourth_down_dummies["play_type"].isin(['Punt', 'Blocked Punt', 'Punt Return Touchdown', 'Blocked Punt Touchdown'])]


predictors = punts_dummies.columns.drop(['xPa', '_play_type']) #predictor variables used: all variables besides the target variable
target = punts_dummies['xPa'].values #target variable


#splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
punts_train_data, punts_test_data, punts_train_sln, punts_test_sln = train_test_split(punts_dummies[predictors], target, test_size = 0.2, random_state=0)

punts_params = rf_regress_params_tuner(punts_train_data, punts_test_data, punts_train_sln, punts_test_sln, punts_dummies)

: 

In [None]:
go_for_it_dummies = fourth_down_dummies[~fourth_down_dummies["play_type"].isin(['Field Goal Good', 'Missed Field Goal Return Touchdown', 'Missed Field Goal Return', 'Blocked Field Goal', 'Field Goal Missed', 'Blocked Field Goal Touchdown',
                                                            'Punt', 'Blocked Punt', 'Punt Return Touchdown', 'Blocked Punt Touchdown', 'Penalty', 'Timeout',  'Kickoff Return (Offense)', 'Kickoff Return Touchdown'])]


predictors = go_for_it_dummies.columns.drop(['xPa', 'play_type']) #predictor variables used: all variables besides the target variable
target = go_for_it_dummies['xPa'].values #target variable


#splits the subset into a training set to fit the models on and a testing set to test the models on for their accuracy
go_for_it_train_data, go_for_it_test_data, go_for_it_train_sln, go_for_it_test_sln = train_test_split(go_for_it_dummies[predictors], target, test_size = 0.2, random_state=0)

go_for_it_params = rf_regress_params_tuner(go_for_it_train_data, go_for_it_test_data, go_for_it_train_sln, go_for_it_test_sln, field_goals_dummies)

: 

In [None]:
from sklearn.ensemble import RandomForestRegressor


fg_forest = RandomForestRegressor(
        max_depth=field_goals_params['max_depth'],
        n_estimators=field_goals_params['n_estimators'],
        min_samples_split=field_goals_params['min_samples_split'],
        min_samples_leaf=field_goals_params['min_samples_leaf'],
        max_features=field_goals_params['max_features'],
        random_state=0,
    )

fg_forest.fit(fg_train_data, fg_train_sln)
predictions = fg_forest.predict(prediction_set)

fourth_down["field_goal_xPa"] = predictions
fourth_down.head()

: 

In [None]:
punt_forest = RandomForestRegressor(
        max_depth=punts_params['max_depth'],
        n_estimators=punts_params['n_estimators'],
        min_samples_split=punts_params['min_samples_split'],
        min_samples_leaf=punts_params['min_samples_leaf'],
        max_features=punts_params['max_features'],
        random_state=0,
    )

punt_forest.fit(punts_train_data, punts_train_sln)
predictions = punt_forest.predict(prediction_set)

fourth_down["punt_xPa"] = predictions

: 

In [None]:
gfi_forest = RandomForestRegressor(
        max_depth=go_for_it_params['max_depth'],
        n_estimators=go_for_it_params['n_estimators'],
        min_samples_split=go_for_it_params['min_samples_split'],
        min_samples_leaf=go_for_it_params['min_samples_leaf'],
        max_features=go_for_it_params['max_features'],
        random_state=0,
    )

gfi_forest.fit(go_for_it_train_data, go_for_it_train_sln)
predictions = gfi_forest.predict(prediction_set)

fourth_down["go_for_it_xPa"] = predictions

: 

In [None]:
fourth_down.head()

: 

In [None]:
conditions = [
    (fourth_down['field_goal_xPa'] > fourth_down['punt_xPa']) & (fourth_down['field_goal_xPa'] > fourth_down['go_for_it_xPa']),
    (fourth_down['punt_xPa'] > fourth_down['field_goal_xPa']) & (fourth_down['punt_xPa'] > fourth_down['go_for_it_xPa']),
    (fourth_down['go_for_it_xPa'] > fourth_down['field_goal_xPa']) & (fourth_down['go_for_it_xPa'] > fourth_down['punt_xPa'])
]

values = ['Attempt FG', 'Punt', 'Go For It']

fourth_down["suggested_decision"] = np.select(conditions, values)

fourth_down.to_excel('fourth_down_decision_making.xlsx', index=False)


: 

: 