In [47]:
########################
# CLEANING UP THE DATA #
########################

import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import neighbors, metrics, grid_search, cross_validation
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier

pd.set_option('display.max_rows', 162)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 162)

%matplotlib inline
plt.style.use('ggplot')

In [48]:
df = pd.read_csv('/Users/joshuagrossman/Desktop/DS-SF-23 Final Project/Kobe/kobe_data_1.csv') 

In [49]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [50]:
df.set_index('shot_id', inplace=True)
df["action_type"] = df["action_type"].astype('object')
df["combined_shot_type"] = df["combined_shot_type"].astype('category')
df["game_event_id"] = df["game_event_id"].astype('category')
df["game_id"] = df["game_id"].astype('category')
df["period"] = df["period"].astype('object')
df["playoffs"] = df["playoffs"].astype('category')
df["season"] = df["season"].astype('category')
df["shot_made_flag"] = df["shot_made_flag"].astype('category')
df["shot_type"] = df["shot_type"].astype('category')
df["team_id"] = df["team_id"].astype('category')

In [51]:
df.shape

(30697, 24)

In [52]:
df.dtypes

action_type             object
combined_shot_type    category
game_event_id         category
game_id               category
lat                    float64
loc_x                    int64
loc_y                    int64
lon                    float64
minutes_remaining        int64
period                  object
playoffs              category
season                category
seconds_remaining        int64
shot_distance            int64
shot_made_flag        category
shot_type             category
shot_zone_area          object
shot_zone_basic         object
shot_zone_range         object
team_id               category
team_name               object
game_date               object
matchup                 object
opponent                object
dtype: object

In [53]:
#Create a clean data set
data_cl = df.copy() # create a copy of data frame

target = data_cl['shot_made_flag'].copy()

In [54]:
data_cl.shape

(30697, 24)

In [55]:
unknown_mask = df['shot_made_flag'].isnull()

In [56]:
# Remove some columns
data_cl.drop('team_id', axis=1, inplace=True) # Always one number
data_cl.drop('lat', axis=1, inplace=True) # Correlated with loc_x
data_cl.drop('lon', axis=1, inplace=True) # Correlated with loc_y
data_cl.drop('game_id', axis=1, inplace=True) # Independent
data_cl.drop('game_event_id', axis=1, inplace=True) # Independent
data_cl.drop('team_name', axis=1, inplace=True) # Always LA Lakers
#data_cl.drop('shot_made_flag', axis=1, inplace=True) #need to copy before dropping

In [57]:
data_cl.dtypes

action_type             object
combined_shot_type    category
loc_x                    int64
loc_y                    int64
minutes_remaining        int64
period                  object
playoffs              category
season                category
seconds_remaining        int64
shot_distance            int64
shot_made_flag        category
shot_type             category
shot_zone_area          object
shot_zone_basic         object
shot_zone_range         object
game_date               object
matchup                 object
opponent                object
dtype: object

In [58]:
#def detect_outliers(series, whis=1.5):
#   q75, q25 = np.percentile(series, [75 ,25])
#  iqr = q75 - q25
#    return ~((series - series.median()).abs() <= (whis * iqr))

## For now - do not remove anything

In [59]:
#New Features:

# Remaining time
data_cl['seconds_from_period_end'] = 60 * data_cl['minutes_remaining'] + data_cl['seconds_remaining']
#data_cl['last_5_sec_in_period'] = data_cl['seconds_from_period_end'] < 5

data_cl.drop('minutes_remaining', axis=1, inplace=True)
data_cl.drop('seconds_remaining', axis=1, inplace=True)
#data_cl.drop('seconds_from_period_end', axis=1, inplace=True)

#Add a home vs away column based on matchup
data_cl['home_away'] = np.where(df['matchup'].str.contains("vs."), 'home', 'away')
#Create binary variable for Home/Away games
data_cl.home_away = data_cl.home_away.apply(lambda value: 0 if value == 'away' else 1)
data_cl.drop('matchup', axis=1, inplace=True)

# Game date
data_cl['game_date'] = pd.to_datetime(data_cl['game_date'])
data_cl['game_year'] = data_cl['game_date'].dt.year
data_cl['game_month'] = data_cl['game_date'].dt.month
data_cl.drop('game_date', axis=1, inplace=True)

# Replace 20 least common action types with value 'Other'
rare_action_types = data_cl['action_type'].value_counts().sort_values().index.values[:20]
data_cl.loc[data_cl['action_type'].isin(rare_action_types), 'action_type'] = 'Other'
    

In [60]:
#create dummy variables for categorical features
categorial_cols = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_year',
    'game_month', 'opponent']

for cc in categorial_cols:
    dummies = pd.get_dummies(data_cl[cc])
    dummies = dummies.add_prefix("{}#".format(cc))
    data_cl.drop(cc, axis=1, inplace=True)
    data_cl = data_cl.join(dummies)

In [61]:
data_cl.shape

(30697, 161)

In [62]:
data_cl.columns

Index([u'loc_x', u'loc_y', u'playoffs', u'shot_distance', u'shot_made_flag',
       u'seconds_from_period_end', u'home_away',
       u'action_type#Alley Oop Dunk Shot', u'action_type#Alley Oop Layup shot',
       u'action_type#Driving Dunk Shot',
       ...
       u'opponent#PHI', u'opponent#PHX', u'opponent#POR', u'opponent#SAC',
       u'opponent#SAS', u'opponent#SEA', u'opponent#TOR', u'opponent#UTA',
       u'opponent#VAN', u'opponent#WAS'],
      dtype='object', length=161)

In [63]:
data_cl.dtypes

loc_x                                            int64
loc_y                                            int64
playoffs                                      category
shot_distance                                    int64
shot_made_flag                                category
seconds_from_period_end                          int64
home_away                                        int64
action_type#Alley Oop Dunk Shot                float64
action_type#Alley Oop Layup shot               float64
action_type#Driving Dunk Shot                  float64
action_type#Driving Finger Roll Layup Shot     float64
action_type#Driving Finger Roll Shot           float64
action_type#Driving Jump shot                  float64
action_type#Driving Layup Shot                 float64
action_type#Driving Reverse Layup Shot         float64
action_type#Driving Slam Dunk Shot             float64
action_type#Dunk Shot                          float64
action_type#Fadeaway Bank shot                 float64
action_typ

In [64]:
#normalize for any numerical variables needed 
def normalize(X):
    min = X.min()
    max = X.max()
    return (X - min) / (max - min)

data_cl['seconds_from_period_end'] = normalize(data_cl.seconds_from_period_end)
data_cl['shot_distance'] = normalize(data_cl.shot_distance)
data_cl['loc_x'] = normalize(data_cl.loc_x)
data_cl['loc_y'] = normalize(data_cl.loc_y)

In [65]:
data_cl.shape

(30697, 161)

In [66]:
unknown_mask = df['shot_made_flag'].isnull()

In [67]:
# Separate dataset for validation
data_submit = data_cl[unknown_mask]

# Separate dataset for training
X = data_cl[~unknown_mask]
Y = target[~unknown_mask]

In [68]:
#Variance Threshold
#Find all features with more than 90% variance in values.

threshold = 0.90
vt = VarianceThreshold().fit(X)

# Find feature names
feat_var_threshold = data_cl.columns[vt.variances_ > threshold * (1-threshold)]
feat_var_threshold

Index([u'playoffs', u'shot_made_flag', u'home_away', u'action_type#Jump Shot',
       u'combined_shot_type#Jump Shot', u'combined_shot_type#Layup',
       u'period#1', u'period#2', u'period#3', u'period#4',
       u'shot_type#2PT Field Goal', u'shot_type#3PT Field Goal',
       u'shot_zone_area#Center(C)', u'shot_zone_area#Left Side Center(LC)',
       u'shot_zone_area#Left Side(L)', u'shot_zone_area#Right Side Center(RC)',
       u'shot_zone_area#Right Side(R)', u'shot_zone_basic#Above the Break 3',
       u'shot_zone_basic#In The Paint (Non-RA)', u'shot_zone_basic#Mid-Range',
       u'shot_zone_basic#Restricted Area', u'shot_zone_range#16-24 ft.',
       u'shot_zone_range#24+ ft.', u'shot_zone_range#8-16 ft.',
       u'shot_zone_range#Less Than 8 ft.', u'game_month#1', u'game_month#2',
       u'game_month#3', u'game_month#4', u'game_month#11', u'game_month#12'],
      dtype='object')

In [69]:
#Top 20 most important features
#According to RandomForestClassifier

model = RandomForestClassifier()
model.fit(X, Y)

feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])
feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(20).index
feat_imp_20

Index([u'shot_made_flag', u'action_type#Jump Shot', u'seconds_from_period_end',
       u'loc_y', u'loc_x', u'combined_shot_type#Dunk', u'shot_distance',
       u'shot_zone_basic#Restricted Area', u'combined_shot_type#Layup',
       u'action_type#Layup Shot', u'home_away',
       u'combined_shot_type#Jump Shot', u'action_type#Driving Layup Shot',
       u'action_type#Slam Dunk Shot', u'shot_zone_area#Center(C)', u'period#1',
       u'period#3', u'period#2', u'action_type#Running Jump Shot',
       u'shot_type#3PT Field Goal'],
      dtype='object')

In [70]:
feature_imp.sort_values("importance", ascending=False)

Unnamed: 0,importance
shot_made_flag,0.78188
action_type#Jump Shot,0.02525
seconds_from_period_end,0.018068
loc_y,0.012974
loc_x,0.011364
combined_shot_type#Dunk,0.008252
shot_distance,0.008249
shot_zone_basic#Restricted Area,0.006335
combined_shot_type#Layup,0.005401
action_type#Layup Shot,0.004937


In [71]:
#export dataframe for Variance Threshhold features
data_cl_vt = data_cl[ ['shot_made_flag','playoffs', 'home_away', 'action_type#Jump Shot',
       'combined_shot_type#Jump Shot', 'combined_shot_type#Layup',
       'period#1', 'period#2', 'period#3', 'period#4',
       'shot_type#2PT Field Goal', 'shot_type#3PT Field Goal',
       'shot_zone_area#Center(C)', 'shot_zone_area#Left Side Center(LC)',
       'shot_zone_area#Left Side(L)', 'shot_zone_area#Right Side Center(RC)',
       'shot_zone_area#Right Side(R)', 'shot_zone_basic#Above the Break 3',
       'shot_zone_basic#In The Paint (Non-RA)', 'shot_zone_basic#Mid-Range',
       'shot_zone_basic#Restricted Area', 'shot_zone_range#16-24 ft.',
       'shot_zone_range#24+ ft.', 'shot_zone_range#8-16 ft.',
       'shot_zone_range#Less Than 8 ft.', 'game_month#1', 'game_month#2',
       'game_month#3', 'game_month#4', 'game_month#11', 'game_month#12' ] ]

In [72]:
#export new clean dataframe
data_cl_vt.to_csv('kobe_clean_var_thresh.csv')

In [73]:
#export feature variana dataframe for RandomForestClassifier
data_cl_rf = data_cl[ ['shot_made_flag','seconds_from_period_end', 'loc_x', 'loc_y', 'shot_distance',
       'action_type#Jump Shot', 'home_away', 'action_type#Layup Shot',
       'period#3', 'period#2', 'period#1', 'period#4',
       'shot_zone_range#Less Than 8 ft.', 'game_month#1', 'game_month#3',
       'game_month#2', 'game_month#12', 'game_month#4', 'game_month#11',
       'playoffs', 'action_type#Running Jump Shot' ] ]

In [74]:
#export feature variana dataframe for RandomForestClassifier
data_cl_rf.to_csv('kobe_clean_forest.csv')

In [75]:
#Now we can use tese files to import for other models, without having to run through the munging every time.