# Modeling Continued

## Analyzing Defender Fixed Effects

**Chip**

In [102]:
## Load necessary libraries
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import tree
from sklearn import ensemble
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from datetime import datetime, timedelta
from dateutil.parser import *
import StringIO
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [103]:
## Goals:
# Demonstrate the predictive ability of the past model
# Create a dummy variable for each defender (player) id
# Create a new model using defender fixed effects
# What about shooter (player_id) fixed effects as well?

In [104]:
## Load the modeling dataset (the `dataset_post_model` file)

# Parse the CSV, taking its first column as the row index
data = pd.read_csv('datasets/dataset_post_model', index_col=0)

# Report the frame's width, column names and length, then show the first rows
print('Number of variables: %s' % data.shape[1])
print('Variable names: %s' % (data.columns.values,))
print('Number of observations/shots: %s' % data.shape[0])
data.head()

Number of variables: 25
Variable names: ['game_id' 'matchup' 'location' 'w' 'final_margin' 'shot_number' 'period'
 'game_clock' 'shot_clock' 'dribbles' 'touch_time' 'shot_dist' 'pts_type'
 'shot_result' 'closest_defender' 'closest_defender_player_id'
 'close_def_dist' 'fgm' 'pts' 'player_name' 'player_id' 'fg_percent'
 'current_streak' 'previous_streak' 'W']
Number of observations/shots: 118033


Unnamed: 0,game_id,matchup,location,w,final_margin,shot_number,period,game_clock,shot_clock,dribbles,...,closest_defender_player_id,close_def_dist,fgm,pts,player_name,player_id,fg_percent,current_streak,previous_streak,W
28621,21400054,"NOV 04, 2014 - NOP vs. CHA",H,W,9,1,1,2:29,22.1,0,...,203148,4.4,1,3,ryan anderson,201583,0.407407,1,0,1.0
28622,21400054,"NOV 04, 2014 - NOP vs. CHA",H,W,9,2,2,11:19,5.8,2,...,203469,6.1,1,2,ryan anderson,201583,0.407407,2,1,1.0
28623,21400054,"NOV 04, 2014 - NOP vs. CHA",H,W,9,3,2,10:51,10.2,2,...,101131,5.7,0,0,ryan anderson,201583,0.407407,-1,2,1.0
28624,21400054,"NOV 04, 2014 - NOP vs. CHA",H,W,9,4,2,8:21,24.0,0,...,203469,2.6,1,2,ryan anderson,201583,0.407407,1,-1,1.0
28625,21400054,"NOV 04, 2014 - NOP vs. CHA",H,W,9,5,2,7:59,16.0,0,...,203469,6.5,0,0,ryan anderson,201583,0.407407,-1,1,1.0


In [105]:
## Past model

# Predictors carried over from the earlier model
possible_confounders = ['previous_streak', 'final_margin', 'dribbles', 'touch_time', 'shot_dist', 'close_def_dist', 
                        'fg_percent', 'shot_clock']

# Design matrix and response (`fgm`)
x = data[possible_confounders]
y = data['fgm'].values

# Fit a statsmodels logit to inspect the coefficients.
# NOTE(review): sm.Logit does not add an intercept on its own, while sklearn's
# LogisticRegression below fits one by default, so the two fits are not the
# same model -- confirm this is intended.
logit = sm.Logit(y, x)
result = logit.fit(disp=0)

# Only the last expression of a cell is displayed, so a bare
# `result.summary()` here would be evaluated and silently discarded.
# Print it so the coefficient table actually shows up.
print(result.summary())

## Score model

# Hold out 40% of the shots for scoring; fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.6, random_state=42)

# Very large C => effectively unregularized logistic regression
robust_logit = LogisticRegression(C=100000000.0)
robust_logit.fit(x_train, y_train)

# Accuracy on the held-out shots
score = robust_logit.score(x_test, y_test)
print('Classification rate: %s' % score)

Classification rate: 0.608061168298


In [106]:
# Count the distinct closest-defender ids present in the data
unique_defender_id = data.closest_defender_player_id.unique()
print(unique_defender_id.size)

474


In [107]:
## One indicator (dummy) column per closest-defender id, appended to the data
encoding = pd.get_dummies(data['closest_defender_player_id'])
data_fe = pd.concat([data, encoding], axis=1)

In [108]:
# Column count once the dummy columns have been appended
print('Number of variables: %s' % data_fe.shape[1])

Number of variables: 499


In [109]:
## Fit new model w/ defender fe's

# Baseline predictors
x_old = data[possible_confounders]

# Defender fixed effects: one indicator column per closest-defender id.
# Built straight from the id column instead of slicing the last 474 columns of
# `data_fe`: that hard-coded count was copied from an earlier cell's output,
# and `data_fe` is overwritten by later cells, so the positional slice would
# silently pick up the wrong columns when cells are re-run out of order.
defender_fe = pd.get_dummies(data['closest_defender_player_id'])
x_new = pd.concat((x_old, defender_fe), axis=1)

# Same 60/40 split and seed as the baseline model
x_train, x_test, y_train, y_test = train_test_split(x_new, y, train_size=0.6, random_state=42)

# Fit model
# NOTE(review): this fit uses sklearn's default C, whereas the baseline model
# was fit with C=1e8, so the two classification rates also differ in
# regularization strength -- confirm the comparison is intended.
robust_logit = LogisticRegression()
robust_logit.fit(x_train, y_train)

# Accuracy on the held-out shots
score = robust_logit.score(x_test, y_test)
print('Classification rate: %s' % score)


Classification rate: 0.606557377049


**lol**
No increase in classification rate whatsoever. Let's try some other fixed effects (FEs), though!

In [110]:
## Shooter (player_id) fixed effects

# Count the distinct shooters
unique_player_id = data['player_id'].unique()
num = len(unique_player_id)
print('Number of unique shooters: %s' % num)

# One indicator column per shooter id, appended to the right of the data
encoding = pd.get_dummies(data['player_id'])
data_fe = pd.concat([data, encoding], axis=1)

# Width of the frame after appending the dummies
print('Number of variables: %s' % data_fe.shape[1])

# The trailing `num` columns are the shooter dummies; join them to the
# baseline predictors
player_fe = data_fe.iloc[:, -num:]
x_new = pd.concat([x_old, player_fe], axis=1)

# 60/40 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, train_size=0.6, random_state=42)

# Logistic regression with sklearn's default settings
robust_logit = LogisticRegression().fit(x_train, y_train)

# Accuracy on the held-out shots
score = robust_logit.score(x_test, y_test)
print('Classification rate: %s' % score)

Number of unique shooters: 259
Number of variables: 284
Classification rate: 0.606472656415


In [111]:
## Game (game_id) fixed effects

# Count the distinct games
unique_game_id = data['game_id'].unique()
num = len(unique_game_id)
print('Number of unique games: %s' % num)

# One indicator column per game id, appended to the right of the data
encoding = pd.get_dummies(data['game_id'])
data_fe = pd.concat([data, encoding], axis=1)

# Width of the frame after appending the dummies
print('Number of variables: %s' % data_fe.shape[1])

# The trailing `num` columns are the game dummies; join them to the
# baseline predictors
game_fe = data_fe.iloc[:, -num:]
x_new = pd.concat([x_old, game_fe], axis=1)

# 60/40 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, train_size=0.6, random_state=42)

# Logistic regression with sklearn's default settings
robust_logit = LogisticRegression().fit(x_train, y_train)

# Accuracy on the held-out shots
score = robust_logit.score(x_test, y_test)
print('Classification rate: %s' % score)

Number of unique games: 904
Number of variables: 929
Classification rate: 0.600521031897


In [112]:
## Shooter, defender and game fixed effects together

# Baseline predictors followed by the three sets of dummy columns
x_new = pd.concat([x_old, player_fe, defender_fe, game_fe], axis=1)

# 60/40 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, train_size=0.6, random_state=42)

# Logistic regression with sklearn's default settings
robust_logit = LogisticRegression().fit(x_train, y_train)

# Accuracy on the held-out shots
score = robust_logit.score(x_test, y_test)
print('Classification rate: %s' % score)

Classification rate: 0.600266869996


In [152]:
## Encode game clock var

def _clock_to_seconds(clock):
    """Convert one game-clock value to seconds as a float.

    'MM:SS' strings become minutes*60 + seconds. Values that are not strings
    (i.e. already numeric) are passed through as floats, so re-running this
    cell after the column has been converted is a no-op instead of an
    AttributeError.
    """
    if not hasattr(clock, 'split'):
        return float(clock)
    minutes, seconds = clock.split(':')
    return float(minutes) * 60 + float(seconds)

# grab old clock data
old_clock = data['game_clock'].values

# Convert every entry. map() with a helper keeps the loop variable out of the
# notebook namespace; the previous `for x in old_clock` loop overwrote the
# design matrix `x` with a clock string.
new_time = list(map(_clock_to_seconds, old_clock))

# overwrite the predictor column with the numeric version
data['game_clock'] = new_time

In [153]:
## Logit refit with the game clock and period added

# Earlier predictors plus the two new timing variables
possible_confounders = [
    'previous_streak',
    'final_margin',
    'dribbles',
    'touch_time',
    'shot_dist',
    'close_def_dist',
    'fg_percent',
    'shot_clock',
    'game_clock',
    'period',
]

# Design matrix restricted to those columns
x = data[possible_confounders]

# Fit quietly (disp=0) and display the coefficient table as the cell output
logit = sm.Logit(y, x)
result = logit.fit(disp=0)
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,118033.0
Model:,Logit,Df Residuals:,118023.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 07 Dec 2016",Pseudo R-squ.:,0.04316
Time:,15:35:51,Log-Likelihood:,-77818.0
converged:,True,LL-Null:,-81328.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
previous_streak,-0.0087,0.003,-2.877,0.004,-0.015 -0.003
final_margin,0.0093,0.000,20.130,0.000,0.008 0.010
dribbles,0.0245,0.005,5.190,0.000,0.015 0.034
touch_time,-0.0560,0.005,-10.212,0.000,-0.067 -0.045
shot_dist,-0.0597,0.001,-70.949,0.000,-0.061 -0.058
close_def_dist,0.0999,0.003,35.001,0.000,0.094 0.105
fg_percent,0.4240,0.051,8.376,0.000,0.325 0.523
shot_clock,0.0150,0.001,14.994,0.000,0.013 0.017
game_clock,-6.013e-05,2.94e-05,-2.045,0.041,-0.000 -2.5e-06


In [154]:
## Held-out accuracy of the model with game clock

# 60/40 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.6, random_state=42)

# Very large C => effectively unregularized logistic regression
robust_logit = LogisticRegression(C=100000000.0).fit(x_train, y_train)

# Accuracy on the held-out shots
score = robust_logit.score(x_test, y_test)
print('Classification rate: %s' % score)

Classification rate: 0.60789172703
