# Empirical Expected Points Model

**Authors:**  
Iain Muir, iam9ez  
Hriday Singh,  
Connor Smith

## Table of Contents

* 0. Import Libraries
* 1. Load Data
* 2. Transform Data
    * 2.1 Remove Garbage Time
    * 2.2 Split Score to Home/Away
    * 2.3 Cast datetime Variables
    * 2.4 Group Point Periods
* 3. Expected Points Model

### 0. Import Libraries

In [2]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [19]:
from pandas.core.common import SettingWithCopyWarning
import warnings

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

### 1.0 Load Data

#### 1.1 Aggregate Data across Years

In [1]:
ROOT = '/Users/iainmuir/Desktop/4Y 1S/STAT 4800/Dataset/{} PFF All Plays.csv'

In [6]:
DATA = pd.DataFrame()

In [8]:
%%time

for year in range(2015, 2020):
    PATH = ROOT.format(year)
    df = pd.read_csv(PATH, low_memory=False)
    DATA = pd.concat([DATA, df])

CPU times: user 42.7 s, sys: 11.3 s, total: 53.9 s
Wall time: 59.9 s


In [9]:
DATA.shape

(758971, 177)

In [10]:
DATA.head()

Unnamed: 0,pff_PLAYID,pff_GAMEID,pff_GAMEDATE,pff_GAMESEASON,pff_WEEK,pff_GSISGAMEKEY,pff_GSISPLAYID,pff_QUARTER,pff_DOWN,pff_CLOCK,...,pff_STSAFETIES,pff_TACKLE,pff_TACKLEASSIST,pff_TEALIGNMENT,pff_TOUCHDOWN,pff_UNBLOCKEDPRESSURE,pff_VISE,pff_WRALIGNMENT,pff_PLAYCLOCK,pff_RUNPASSOPTION
0,1455513,5573,12/31/15,2015,PO,60761,35,1,0,15:00,...,,,,,,,,,,0.0
1,1455434,5573,12/31/15,2015,PO,60761,49,1,1,15:00,...,,,,,,,,LWR^; SRiWR; SRoWR^; RWR,,0.0
2,1455470,5573,12/31/15,2015,PO,60761,68,1,2,14:33,...,,SCCL D15,,,,,,LWR^; SRiWR; SRoWR^; RWR,,0.0
3,1455450,5573,12/31/15,2015,PO,60761,90,1,3,14:03,...,,SCCL D18,,,,,,LWR; SLWR^; SRWR^; RWR,,0.0
4,1455419,5573,12/31/15,2015,PO,60761,112,1,1,13:51,...,,SCCL D91,,,,,,LWR; SLWR^; SRWR^; RWR,,0.0


In [12]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 758971 entries, 0 to 167026
Columns: 177 entries, pff_PLAYID to pff_RUNPASSOPTION
dtypes: float64(30), int64(30), object(117)
memory usage: 1.0+ GB


In [13]:
d = DATA.copy()

### 2.0 Feature Selection and Engineering

#### 2.1 Remove Garbage Time

In [77]:
d = d.loc[d['pff_GARBAGETIME'] == 0]

In [78]:
d = d.reset_index(drop=True)

In [79]:
d.shape

(155390, 177)

#### 2.2 Initial Feature Selection

In [80]:
# Manual Selection based on Perceived Importance... simply the dataset
SUBSET_COLS = [
    'pff_DOWN', 'pff_CLOCK', 'pff_DRIVE', 'pff_DRIVEPLAY', 'pff_GAINLOSSNET',
    'pff_KICKYARDS', 'pff_OFFSCORE', 'pff_OPTION', 'pff_PENALTYYARDS',
    'pff_SCORE', 'pff_DEFTEAM', 'pff_DISTANCE', 'pff_DRIVEENDEVENT',
    'pff_DRIVEENDFIELDPOSITION', 'pff_DRIVEENDPLAYNUMBER',
    'pff_DRIVESTARTEVENT', 'pff_DRIVESTARTFIELDPOSITION', 'pff_FIELDPOSITION',
    'pff_GAINLOSS', 'pff_HASH', 'pff_KICKRESULT', 'pff_KICKTYPE',
    'pff_OFFTEAM', 'pff_PASSRESULT', 'pff_PASSRUSHRESULT', 'pff_PENALTY',
    'pff_PLAYENDFIELDPOSITION', 'pff_RUNPASS', 'pff_SCOREDIFFERENTIAL',
    'pff_SPECIALTEAMSTYPE', 'pff_TOUCHDOWN', 'pff_PLAYCLOCK'
]

In [81]:
d = d[SUBSET_COLS]

In [82]:
d.shape

(155390, 32)

#### 2.2 Split Score to Home/Away

In [83]:
def split_score(row):
    """
    
    """
    score = row['pff_SCORE']
    
    if '.' in score:
        h, a = score.split('.')
        if len(a) == 1:
            a = int(a)
            a *= 10
    else:
        h = score
        a = 0
    
    return int(h), int(a)

In [84]:
%%time

d['pff_SCORE'] = d['pff_SCORE'].astype(str)
s = pd.DataFrame(d['pff_SCORE'])
scores = s.apply(
    lambda r: split_score(r),
    axis=1,
    result_type='expand'
)

CPU times: user 40.2 s, sys: 633 ms, total: 40.8 s
Wall time: 43.5 s


In [85]:
scores.columns = ['homeScore', 'awayScore']

In [86]:
d['homeScore'], d['awayScore'] = scores['homeScore'], scores['awayScore']

#### 2.3 Create Point Periods

### 3.0 Field Goal Model

#### 3.1 Subset Field Goal Attempts

In [None]:
FGS = d.loc[
    d['pff_SPECIALTEAMSTYPE'].astype(str) == 'FIELD GOAL'
]

In [None]:
FGS[['kickResult', 'kickLocation']] = FGS['pff_KICKRESULT'].str.split(' - ', expand=True)
FGS['kickResult'] = np.where(FGS['kickResult'] == 'MISSED', 0, 1)

In [None]:
FGS =  FGS[
    ['pff_HASH', 'pff_KICKYARDS', 'kickResult']
]

In [None]:
FGS.shape

In [None]:
FGS.head()

In [None]:
fgs = FGS.copy()

In [None]:
#### 3.2

### 4.0 Transform All Data

### 5.0 Run Models

In [None]:
r1 = RUN.loc[RUN['pff_DOWN'] == 1]
r2 = RUN.loc[RUN['pff_DOWN'] == 2]
r3 = RUN.loc[RUN['pff_DOWN'] == 3]
r4 = RUN.loc[RUN['pff_DOWN'] == 4]

In [None]:
print("Running Downs:")
print("\t1st:", r1.shape)
print("\t2nd:", r2.shape)
print("\t3rd:", r3.shape)
print("\t4th:", r4.shape)

### 6.0 Pass Models

In [None]:
p1 = PASS.loc[PASS['pff_DOWN'] == 1]
p2 = PASS.loc[PASS['pff_DOWN'] == 2]
p3 = PASS.loc[PASS['pff_DOWN'] == 3]
p4 = PASS.loc[PASS['pff_DOWN'] == 4]

In [None]:
print("Passing Downs:")
print("\t1st:", p1.shape)
print("\t2nd:", p2.shape)
print("\t3rd:", p3.shape)
print("\t4th:", p4.shape)

### 3.0 Mixture Models

Ref: 
* [sklearn.mixture.GaussianMixture](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture)  
* [sklearn.linear_model.LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)
* [sklearn.compose.ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)
* [sklearn.pipeline.Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
* [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

#### 3.1 Feature Importance via Regression

In [90]:
INPUTS = [
    'pff_DOWN', 'pff_DISTANCE', 'pff_FIELDPOSITION', 'pff_RUNPASS',
    'homeScore','awayScore', 'pff_GAINLOSSNET'
]

In [104]:
model_d = d[INPUTS]
model_d = model_d.dropna()

In [105]:
model_d.shape

(127522, 7)

In [106]:
model_d.isnull().sum()

pff_DOWN             0
pff_DISTANCE         0
pff_FIELDPOSITION    0
pff_RUNPASS          0
homeScore            0
awayScore            0
pff_GAINLOSSNET      0
dtype: int64

In [115]:
model_d['pff_RUNPASS'].value_counts()

P    67015
R    57509
X     2998
Name: pff_RUNPASS, dtype: int64

In [107]:
TARGET = 'pff_GAINLOSSNET'
X = model_d.drop(TARGET, axis=1)
y = model_d[TARGET]

In [30]:
num_fields = X.select_dtypes(include=int).columns
cat_fields = X.columns.tolist().remove(num_fields)

In [110]:
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder())
])
pipeline = ColumnTransformer([
    ('cat', cat_pipeline, )
])

In [111]:
X_prep = pipeline.fit_transform(X)

In [112]:
X_prep.shape

(127522, 271)

In [94]:
print("X:", X.shape)
print("y:", y.shape)

X: (155390, 6)
y: (155390,)


In [29]:
regr = LinearRegression()
regr.fit(X, y)

#### 3.2 Split Data into Categories

In [39]:
PASS = d.loc[d['pff_RUNPASS'] == 'P']
RUN = d.loc[d['pff_RUNPASS'] == 'R']

In [40]:
print("Pass:", PASS.shape)
print("Run:", RUN.shape)

Pass: (67015, 39)
Run: (57509, 39)


### 7.0 Expected Points Added Model

In [233]:
points = list()
MAX = max(d['pff_DISTANCE'])
SEED = 42

YTG_DEV = {
    'long': 4,
    'medium': 3,
    'short': 2,
    'inches': 1
}
YTG_MIN = {
    'long': 10,
    'medium': 6,
    'short': 3,
    'inches': 1
}
POSITION_DEV = 5 # yards

In [238]:
def EPA(down, ytg, position, own):
    """
    
    """
    global points
    
    ytg_dev = 4 if ytg in range(10, MAX) else \
              3 if ytg in range(6, 10) else \
              2 if ytg in range(3, 6) else \
              1
                
    SIMULATIONS = 1000
    for i in range(SIMULATIONS):
        point = run_play(down, ytg, position, own)
        points.append(point)
    
    return np.array(points).mean()

In [239]:
def run_play(down, ytg, position, own):
    dist_ = 'long' if ytg in range(10, MAX) else \
            'medium' if ytg in range(6, 10) else \
            'short' if ytg in range(3, 6) else \
            'inches'
    ytg_dev = YTG_DEV[dist_]
    ytg_min = YTG_MIN[dist_]
    
    position *= -1 if own else 1
    fp_right = position - POSITION_DEV
    fp_left = position + POSITION_DEV
    
    # Off the field left
    if fp_left > 0 and fp_right < 0:
        fp_left = -1
    # Redzone -- shrink deviation to 3 yards
    elif 0 < position < 20:
        fp_right += 2
        fp_left -= 2
    
    samples = d.loc[
        (d['pff_DOWN'] == down) &
        (d['pff_DISTANCE'] >= max(ytg_min, ytg - ytg_dev)) &
        (d['pff_DISTANCE'] <= ytg + ytg_dev) &
        (d['pff_FIELDPOSITION'] >= fp_right) &
        (d['pff_FIELDPOSITION'] <= fp_left)
    ]
    if len(samples) == 0:
        return None
    else:
        sample = samples.sample(n=1, random_state=SEED)
        
        # Define conditions for a TD, FG
        if True:
            score = True
            scored = 6
            
            xps = d.loc[
                (d['pff_DOWN'] == 0) &
                (d['pff_DISTANCE'] == 0) &
                (d['pff_FIELDPOSITION'] == 0)
            ]
            xp = xps.sample(n=1, random_state=SEED)
            result = xp['pff_KICKRESULT']
            scored += 1 if 'MADE' in result else 0
        elif True:
            score = True
            scored = 3
        else:
            score = False
    
    if score:
        return scored
    else:
        return run_play(d, y, p)

In [None]:
down = int(input('Down: ').strip())
ytg = int(input('Yards to Go: ').strip())
position = int(input('Field Position: ').strip())
own = input('Possession (home/away): ').strip()
own = True if own == 'home' else False

In [None]:
STATE = {
    'down': down,
    'ytg': ytg,
    'position': position,
    'own': own
}

In [None]:
epa = EPA(**STATE)

In [None]:
print('''
    Expected Points Model:
    S (Down: {}, YTG: {}, Position: {}) == {} EP
'''.format(
    STATE['down'],
    STATE['ytg'],
    ('Own ' if STATE['own'] else '') + str(STATE['position']),
    epa
))