# Passer Ratings: Physical attributes 🤙 💪 🧍

Hypothesis: Hand size/height/arm length affects average career passer rating.

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import sys
import pickle
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-darkgrid')
import pymc3 as pm
sys.path.append('..')

In [3]:
# Load the passer-rating table; the first CSV column is used as the row index.
df = pd.read_csv('../data/passer_ratings.csv', index_col=0)
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,year,attempts,avg_rating,sack_rate,alt_rate,precip_rate,turf_rate,wind_rate,away_rate,temp,height,arm,hand,dpos,seasons
DB-3800-2011,2011,763.0,110.6,0.0365,0.0000,0.0000,0.7222,2.333333,0.5000,68.944444,72.0,31.250,10.00,32,11
PM-0200-2013,2013,787.0,111.7,0.0223,0.5263,0.0526,0.2632,7.526316,0.4211,57.578947,77.0,31.500,10.13,1,16
EM-0200-2011,2011,752.0,95.1,0.0493,0.0000,0.1000,0.7500,7.850000,0.5500,57.550000,77.0,30.750,9.75,1,8
TB-2300-2011,2011,722.0,104.8,0.0462,0.0526,0.0000,0.6316,7.842105,0.4211,55.000000,76.0,32.750,9.38,199,12
MR-2500-2016,2016,632.0,119.9,0.0661,0.0526,0.0526,0.7368,2.052632,0.4211,68.611111,77.0,32.375,9.50,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TD-1400-2007,2007,219.0,55.1,0.1093,0.0000,0.0000,0.1429,5.285714,0.4286,60.000000,76.0,32.250,9.63,6,14
MM-2800-2005,2005,207.0,55.2,0.0841,0.0000,0.1111,0.2222,4.888889,0.3333,55.555556,74.0,32.000,9.75,149,5
JS-3700-2012,2012,201.0,55.4,0.0694,0.0000,0.1429,0.5714,2.857143,0.5714,64.428571,77.0,32.000,9.75,285,3
SM-2400-2007,2007,205.0,73.9,0.0463,0.0000,0.1667,0.1667,5.833333,0.6667,66.500000,74.0,32.250,10.50,3,13


## Variable of Interest

First we dichotomise the variable of interest. For example, we may filter to include only big- or small-handed players. We say small-hand QBs are those in the bottom third, and large-hand QBs are those in the top third.

Height and arm length are positively correlated (0.45), so it's a little difficult to control for one when investigating the other. Height is of greater interest than arm length, and we suspect it has the bigger influence. However, not controlling for arm length is a drawback we should keep in mind.

In [None]:
# Working copy for the experiment. The loaded frame is `df` (no `df_all` is
# defined in this notebook, so the old name raised NameError on a fresh kernel).
# Copying keeps the raw frame untouched by the filtering/mutation done below.
df_exp = df.copy()

In [None]:
# Study configuration:
#   original - name of the continuous column that gets dichotomised
#   on       - name given to the resulting boolean (treatment) column
# NOTE(review): the notebook title is about physical attributes (hand/height/arm)
# but this is configured for temperature ('temp' -> 'warm') - confirm intended study.
original, on = 'temp', 'warm'

# Height study only: uncomment to remove the 'arm' covariate.
# df_exp.drop('arm', axis=1, inplace=True)

In [None]:
# Cut-offs for the two extreme groups: values at or above the 66th percentile
# are "big", values at or below the 33rd percentile are "small".
big, small = np.percentile(df_exp[original], q=[66, 33])

# Keep only the extreme rows, flag the upper group in the boolean column `on`,
# and drop the continuous source column. Building the result as one chain
# (instead of assigning a column on a .loc slice) avoids SettingWithCopyWarning.
is_extreme = (df_exp[original] >= big) | (df_exp[original] <= small)
df_exp = (
    df_exp.loc[is_extreme, :]
    .assign(**{on: lambda frame: frame[original] >= big})
    .drop(columns=original)
)

print(big, small)
# Group sizes: True = at/above `big`, False = at/below `small`.
df_exp[on].value_counts()

In [None]:
# Display the dichotomised frame (bare expression -> rendered as cell output).
df_exp

## Matching

## To be done in R

### Coarsened Exact Matching

In [None]:
# TODO: Fill in from field_goals.ipynb

### Mahalanobis Matching

We can calculate the Mahalanobis frontier to see how the aggregate Mahalanobis distance (AMD) changes with the radius.

In [None]:
from util.matching import mahalanobis_frontier, match_by_distance

# Force the treatment flag to a boolean dtype before handing it to the matcher.
df_exp[on] = df_exp[on].astype(bool)

# The outcome ('avg_rating') is excluded so only the covariates and the
# treatment flag are passed to the frontier computation.
covariates_and_flag = df_exp.drop(columns='avg_rating')
df_mf = mahalanobis_frontier(covariates_and_flag, on)
df_mf

In [None]:
# Plot the aggregate Mahalanobis distance (AMD) against each frontier axis.
# One loop replaces three copy-pasted plots, so every figure gets its title on
# its own axes and is shown explicitly (the last one used to lack plt.show()).
for x_col in ('pruned controls', 'pruned treatments', 'radius'):
    fig, ax = plt.subplots()
    sns.lineplot(x=x_col, y='AMD', data=df_mf, ax=ax)
    ax.set_title('Mahalanobis Matching Frontier')
    plt.show()

We choose a reasonable radius/caliper to minimise difference in marginal distributions but also keep as many controls as possible. This is a little difficult because we're not working with very large sample sizes as it is.

In [None]:
# Build the matched sample used by every later cell.
# `match_by_distance` is imported from util.matching in an earlier cell.
# NOTE(review): presumably 'avg_rating' is the outcome column and 'mahalanobis'
# the distance metric - confirm against util.matching.
# caliper=2 is the radius chosen from the frontier plots above.
df_matched = match_by_distance(df_exp, on, 'avg_rating', 'mahalanobis', caliper=2)

In [None]:
from util.matching import covariate_dists

# Compare covariate distributions between the two groups after matching;
# the outcome column is left out because it is not a covariate.
matched_covariates = df_matched.drop(columns='avg_rating')
covariate_dists(matched_covariates, on=on, kde=False, hist=True, n_bins=10)

We can perform a t-test for a difference in sample means. We're more interested in distribution similarity, but a large difference in means can indicate problems. The p-value is the probability of observing a difference between sample means at least this large, given that the population means are equal.

In [None]:
from scipy.stats import ttest_ind

# Welch's t-test (equal_var=False) per covariate: treated rows vs. control rows.
is_treated = df_matched[on]
covariate_cols = df_matched.columns.drop(['avg_rating', on])

for col in covariate_cols:
    treated_values = df_matched.loc[is_treated, col]
    control_values = df_matched.loc[~is_treated, col]
    # Index 1 of the result is the p-value; the test statistic is not needed.
    p = ttest_ind(treated_values, control_values, equal_var=False)[1]
    print(f'P(x|H0) for {col}: {round(p,2)}')

In [None]:
# Split the matched sample on the boolean treatment flag.
treated_mask = df_matched[on]
df_treatment = df_matched.loc[treated_mask, :]
df_control = df_matched.loc[~treated_mask, :]

# Report how many rows ended up in each group.
n_treatment, n_control = len(df_treatment), len(df_control)
print(n_treatment, f'{on} samples.', n_control, f'not {on} samples.')

## Model

We'll use the BEST (Bayesian Estimation Supersedes the t-Test) method for comparing the means of the two groups.

In [None]:
# Outcome series (average passer rating) for each group.
treatment, control = (
    group['avg_rating'] for group in (df_treatment, df_control)
)

In [None]:
# Side-by-side outcome distributions on shared axes so the groups are comparable.
f, axes = plt.subplots(1, 2, sharey=True, sharex=True)
panels = (
    (treatment, f'Passer rating - {on}'),
    (control, f'Passer rating - not {on}'),
)
for ax, (ratings, panel_title) in zip(axes, panels):
    sns.distplot(ratings, ax=ax)
    ax.set_title(panel_title)
plt.show()

Because of the relatively small sample sizes, we assume the ratings in each group follow a Student's t-distribution (Kruschke).
The Student's t-distribution has a mean, a variance, and a degrees-of-freedom parameter.
The degrees of freedom control the normality of the data (as the degrees of freedom grow, the distribution converges to the normal distribution).

Let's set up the model.

In [None]:
# Priors on the group means: both are Normal, centred on the pooled sample mean
# with the pooled sample standard deviation as their spread.
pooled_ratings = pd.concat([control, treatment])
m_mu, m_sd = pooled_ratings.mean(), pooled_ratings.std()

with pm.Model() as model:
    treatment_mean, control_mean = (
        pm.Normal(prior_name, mu=m_mu, sd=m_sd)
        for prior_name in ('treatment_mean', 'control_mean')
    )

In [None]:
# Priors on the group standard deviations: Uniform between sd_low and sd_high.
sd_low, sd_high = 1, 30

with model:
    treatment_std, control_std = (
        pm.Uniform(prior_name, lower=sd_low, upper=sd_high)
        for prior_name in ('treatment_std', 'control_std')
    )

In [None]:
# One degrees-of-freedom parameter shared by both groups.
# The Exponential prior is shifted by +1 so that v is always greater than 1.
with model:
    v_minus_one = pm.Exponential('v_minus_one', lam=1/29.)
    v = v_minus_one + 1

In [None]:
# PyMC3's StudentT is parameterised here by precision rather than standard
# deviation (lambda = 1 / sigma^2), so convert each std before use.
with model:
    treatment_lambda, control_lambda = (
        group_std**-2 for group_std in (treatment_std, control_std)  # deterministic
    )

    # Observed likelihoods: one Student's t per group, sharing the dof `v`.
    treatment_rating = pm.StudentT(
        f'{on}', nu=v, mu=treatment_mean, lam=treatment_lambda, observed=treatment
    )
    control_rating = pm.StudentT(
        f'not {on}', nu=v, mu=control_mean, lam=control_lambda, observed=control
    )

In [None]:
with model:
    # Derived quantities tracked alongside the parameters; they could equally
    # be computed from the traces after sampling.
    diff_of_means = pm.Deterministic('difference_of_means', treatment_mean - control_mean)
    diff_of_stds = pm.Deterministic('difference_of_stds', treatment_std - control_std)

    # Effect size: difference of means scaled by the pooled standard deviation
    # (square root of the average of the two variances). Harder to interpret
    # than the raw difference, but unit-free.
    pooled_std = np.sqrt((treatment_std**2 + control_std**2) / 2)
    effect_size = pm.Deterministic('effect_size', diff_of_means / pooled_std)

In [None]:
import os

with model:
    # NUTS sampling to fill the posterior. A fixed seed makes the trace (and
    # every plot/summary derived from it) reproducible across kernel restarts.
    trace = pm.sample(2000, random_seed=42)

    # Dump the trace, stamped with the treatment name and today's date (yymmdd).
    today = dt.now().strftime('%y%m%d')
    # Create the output directory first so open() cannot fail on a fresh checkout.
    os.makedirs('../results', exist_ok=True)
    with open(f'../results/trace_{on}_{today}.pckle', 'wb') as f:
        pickle.dump(trace, f)

In [None]:
# Posterior plots for every variable recorded in the trace.
pm.plot_posterior(trace)

In [None]:
# Posterior plots for the group-comparison quantities only, with a reference
# line at 0 (i.e. "no difference between groups").
comparison_vars = ['difference_of_means', 'difference_of_stds', 'effect_size']
pm.plot_posterior(trace, var_names=comparison_vars, ref_val=0, color='#87ceeb')

In [None]:
# Sampler diagnostics: per the original author's note, sampling went as
# expected with no issues (Rhat ~ 1).
from util.stats import summary
summary_ = summary(trace)
# NOTE(review): the original author marked the next line "error out" -
# presumably displaying the summary raised an error at some point; confirm.
summary_ # error out

## TODO: Relative difference in means/lift.