In [87]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('../data/raw/data.csv')

In [3]:
data.columns

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat',
       'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag',
       'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range',
       'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id'],
      dtype='object')

In [24]:
#print(data['shot_zone_basic'].value_counts())
#print(data['shot_zone_range'].value_counts())
#print(data['shot_zone_area'].value_counts())
data[['minutes_remaining','seconds_remaining','period']]

Unnamed: 0,minutes_remaining,seconds_remaining,period
0,10,27,1
1,10,22,1
2,7,45,1
3,6,52,1
4,6,19,2
5,9,32,3
6,8,52,3
7,8,5,3
8,6,12,3
9,3,36,3


In [72]:
cols = [
    'combined_shot_type','game_event_id','game_id','period','playoffs',
    'season','minutes_remaining','seconds_remaining','shot_made_flag',
    #'shot_distance',shot_zone_area','shot_zone_basic',
    'lat','lon','loc_x','loc_y',
    'matchup','opponent','game_date'
       ]

process = data[cols].copy()

## Feature engineering
1. get time remaining 
2. get home/away
3. get year & month from game_date
4. get consecutive (?)
5. pd.get_dummy

In [73]:
# 1. get time remaining
process['time_remaining'] = process['minutes_remaining'] * 60 + process['seconds_remaining']

In [74]:
# 2. get home/away
process['home'] = process['matchup'].apply(lambda x: 1 if (x.find('@') < 0) else 0)

In [75]:
# 3. get year 
#print(process['game_date'][0])
#print(process['game_date'][0][5:7]) # month
process['game_year'] = process['game_date'].apply(lambda x: x[:4]).astype(np.int32)
process['game_month'] = process['game_date'].apply(lambda x: x[5:7]).astype(np.int32)

In [76]:
process.head(5)

Unnamed: 0,combined_shot_type,game_event_id,game_id,period,playoffs,season,minutes_remaining,seconds_remaining,shot_made_flag,lat,lon,loc_x,loc_y,matchup,opponent,game_date,time_remaining,home,game_year,game_month
0,Jump Shot,10,20000012,1,0,2000-01,10,27,,33.9723,-118.1028,167,72,LAL @ POR,POR,2000-10-31,627,0,2000,10
1,Jump Shot,12,20000012,1,0,2000-01,10,22,0.0,34.0443,-118.4268,-157,0,LAL @ POR,POR,2000-10-31,622,0,2000,10
2,Jump Shot,35,20000012,1,0,2000-01,7,45,1.0,33.9093,-118.3708,-101,135,LAL @ POR,POR,2000-10-31,465,0,2000,10
3,Jump Shot,43,20000012,1,0,2000-01,6,52,0.0,33.8693,-118.1318,138,175,LAL @ POR,POR,2000-10-31,412,0,2000,10
4,Dunk,155,20000012,2,0,2000-01,6,19,1.0,34.0443,-118.2698,0,0,LAL @ POR,POR,2000-10-31,379,0,2000,10


In [77]:
# 5. get dummy
# to be dummified:
# 1. combined_shot_type
# 2. season
# 3. opponent
dummy_cols = ['combined_shot_type','season','opponent']
dummy = pd.get_dummies(process[dummy_cols])

In [78]:
dummy.head()

Unnamed: 0,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,season_1996-97,season_1997-98,season_1998-99,season_1999-00,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [79]:
process.drop(['combined_shot_type','season','opponent','minutes_remaining','seconds_remaining','matchup','game_date'],inplace=True, axis=1)

In [80]:
process.head()

Unnamed: 0,game_event_id,game_id,period,playoffs,shot_made_flag,lat,lon,loc_x,loc_y,time_remaining,home,game_year,game_month
0,10,20000012,1,0,,33.9723,-118.1028,167,72,627,0,2000,10
1,12,20000012,1,0,0.0,34.0443,-118.4268,-157,0,622,0,2000,10
2,35,20000012,1,0,1.0,33.9093,-118.3708,-101,135,465,0,2000,10
3,43,20000012,1,0,0.0,33.8693,-118.1318,138,175,412,0,2000,10
4,155,20000012,2,0,1.0,34.0443,-118.2698,0,0,379,0,2000,10


In [81]:
process = pd.concat([process, dummy], axis=1)

In [82]:
process.head()

Unnamed: 0,game_event_id,game_id,period,playoffs,shot_made_flag,lat,lon,loc_x,loc_y,time_remaining,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,10,20000012,1,0,,33.9723,-118.1028,167,72,627,...,0,0,1,0,0,0,0,0,0,0
1,12,20000012,1,0,0.0,34.0443,-118.4268,-157,0,622,...,0,0,1,0,0,0,0,0,0,0
2,35,20000012,1,0,1.0,33.9093,-118.3708,-101,135,465,...,0,0,1,0,0,0,0,0,0,0
3,43,20000012,1,0,0.0,33.8693,-118.1318,138,175,412,...,0,0,1,0,0,0,0,0,0,0
4,155,20000012,2,0,1.0,34.0443,-118.2698,0,0,379,...,0,0,1,0,0,0,0,0,0,0


In [93]:
# divide train and test
test = process[process['shot_made_flag'].isnull()].copy()
train = process[~process['shot_made_flag'].isnull()].copy()

train_X = train.drop(['shot_made_flag'], axis=1)
train_y = train['shot_made_flag']

test_X = test.drop(['shot_made_flag'], axis=1)

In [94]:
rfc = RandomForestClassifier(n_estimators=30)
rfc.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [97]:
# test_y = rfc.predict_proba(test_X)
test_y = rfc.predict_proba(train_X)

In [98]:
test_y[:,1]

array([0.13333333, 0.76666667, 0.23333333, ..., 0.76666667, 0.13333333,
       0.16666667])