# KOBE BRYANT SHOT SELECTION

## Summary of major findings

* xxx

## Things to do

* Can we improve the missing data strategy?
* Are all the limitations to algorithm application verified?
* prevent leakage: train only on shots prior to the ones we're predicting (year)
* select features and then explore number of neighbours
* target: score=0.75
* http://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection
* Define function to optimize the number of neighbors

# 1. Set up

## 1.1 Environment

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

import scipy as sp

from matplotlib.pylab import rcParams
from sklearn import neighbors
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold

%matplotlib inline

## 1.2 Useful functions

In [4]:
#convert shot flags to colors
def flag_colors(flag):
    if flag == 1: return 'green'
    if flag == 0: return 'red'
    return 'black'

In [5]:
# convert matchup string to Home=1, away=0
# example: LAL @ POR: home=0
# example: LAL vs. POR: home=1
def get_home_away(string):
    if '@' in string: return 0 #away
    elif 'vs' in string : return 1 #home
    else: return "ACORDA CRL"

In [6]:
# convert season in format '1999-00' to 
# example: '1999-00': 3
def get_season_num(year):
    # year is in format '2009-10'
    year0 = 1996
    y = year[:4]
    y_int = int(y)
    
    return y_int - year0

#quick asserts
assert get_season_num('2010-11') == 14
assert get_home_away('LAL @ POR') == 0
assert get_home_away('LAL vs UTA') == 1    

In [7]:
def write_kaggle_submission(df, probs, output_file='kobe_submission.csv'):
    """
    write output to kaggle format (for kobe competition)
    
    Input:
        df (pandas dataframe): dataframe used for predicting probabilities
        probs (numpy array): array of probability values
        
    Output:
        None
    
    Side effects:
        creates csv file "output_file"
        
    Comments:
        number of rows in df should match size of probs array
    """

    # create new Series with df indexes as shot_id and probs values as shot_made_flag
    
    data_values = probs
    index_values = df.index.values +1
    data_dict = {'shot_id': index_values, 'shot_made_flag': data_values}
    
    df_towrite = pd.DataFrame(data = data_dict)
    
    df_towrite.to_csv(output_file, header=True, index=False)
    
    return

In [8]:
def logloss(act, pred):
    """
    logloss function
    imported from kaggle evalutation
    """
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

## 1.3 Read data

In [9]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [10]:
list(data.columns.values)

['action_type',
 'combined_shot_type',
 'game_event_id',
 'game_id',
 'lat',
 'loc_x',
 'loc_y',
 'lon',
 'minutes_remaining',
 'period',
 'playoffs',
 'season',
 'seconds_remaining',
 'shot_distance',
 'shot_made_flag',
 'shot_type',
 'shot_zone_area',
 'shot_zone_basic',
 'shot_zone_range',
 'team_id',
 'team_name',
 'game_date',
 'matchup',
 'opponent',
 'shot_id']

## 1.4 Preprocessing

In [11]:
# TRAINING AND TEST DATA
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 0)

In [12]:
# MISSING DATA
# don't consider rows with shot_made_flag = NaN
train_data = train_data[np.isfinite(train_data['shot_made_flag'])]
test_data = test_data[np.isfinite(test_data['shot_made_flag'])]

In [13]:
# extract validation set
validation_data = data[data['shot_made_flag'].isnull()]

# 2. Exploring features

## 2.1. Correlation matrix

In [None]:
cm = data.corr()
cm.loc['shot_made_flag']

#### Notes:

* Needs normalization

## 2.2. Opponent

In [None]:
total_shots = pd.pivot_table(data, index="opponent", values="shot_id", aggfunc='count')
shots_scored = pd.pivot_table(data, index="opponent", values="shot_made_flag", aggfunc='sum')

accuracy = shots_scored/total_shots

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by opponent')

#### Notes:

* xxx

## 2.3. Shots by distance

In [None]:
total_distance = pd.pivot_table(data, index="shot_distance", values="shot_id", aggfunc='count')
shots_scored = pd.pivot_table(data, index="shot_distance", values="shot_made_flag", aggfunc='sum')

accuracy = shots_scored/total_distance

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by distance')
plt.axis([0, 30, 0, .6])

#### Notes:

* Strange values when shot_distance = 0

## 2.4. Minutes remaining

In [None]:
minutes = pd.pivot_table(data, index="minutes_remaining", values="shot_id", aggfunc='count')
shots_scored = pd.pivot_table(data, index="minutes_remaining", values="shot_made_flag", aggfunc='sum')

accuracy = shots_scored/minutes

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by minutes remaining')
plt.axis([0, 11, 0, .5])

#### Notes:

* Accuracy decreases in minutes_remaining = 0

## 2.5. Seconds remaining + Period

In [None]:
seconds = pd.pivot_table(data, index=[ "seconds_remaining", 'period'], values="shot_id", aggfunc='count')
shots_scored = pd.pivot_table(data, index=[ "seconds_remaining", 'period'], values="shot_made_flag", aggfunc='sum')

accuracy = shots_scored/seconds

In [None]:
accuracy

#### Notes:

* xxx

## 2.6. Shots by zone range

In [None]:
shot_zones = pd.pivot_table(data, index="shot_zone_range", values="shot_id", aggfunc='count')
shot_zones_pct = shot_zones/shot_zones.sum()

# change order
shot_zones_pct = shot_zones_pct.reindex(index=['Less Than 8 ft.',  u'8-16 ft.', u'16-24 ft.', u'24+ ft.', u'Back Court Shot'])
shot_zones_pct.plot.bar(title='Kobe Bryant\n% by zone range')

In [None]:
# let's try plotting all of his shots and show if miss or hit

shots_pos = data[['loc_x', 'loc_y']]
shot_colors = map(flag_colors, data.shot_made_flag)
shots_pos.plot.scatter(x='loc_x', y='loc_y', s=5, c=shot_colors, ylim=(0,400), figsize=(10,5))

#### Notes:

* Heat map to understand accuracy

## 2.7. Shots by action type

In [36]:
shots_actionType = pd.pivot_table(data, index="action_type", values="shot_id", aggfunc='count')
shots_actionType.plot.bar(title='Kobe Bryant\naction type')

#### Notes:

* xxx

## 2.8. Combined shot type

In [None]:
shots_combined = pd.pivot_table(data, index="combined_shot_type", values="shot_id", aggfunc='count')
combined_scored = pd.pivot_table(data, index="combined_shot_type", values="shot_made_flag", aggfunc='sum')

accuracy = combined_scored / shots_combined

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% combined shot type')

#### Notes:

* xxx

## 2.9. Shots by period

In [None]:
shots_period = pd.pivot_table(data, index="period", values="shot_id", aggfunc='count')
period_scored = pd.pivot_table(data, index="period", values="shot_made_flag", aggfunc='sum')

accuracy = period_scored/shots_period

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by period')

#### Notes:

* xxx

## 2.10. Playoffs

In [None]:
playoff_shots = pd.pivot_table(data, index="playoffs", values="shot_id", aggfunc='count')
playoff_scored = pd.pivot_table(data, index="playoffs", values="shot_made_flag", aggfunc='sum')

accuracy = playoff_scored/playoff_shots

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by playoff')

#### Notes:

* Playoffs vs regular season makes no difference

## 2.11. Shot type

In [None]:
shot_type = pd.pivot_table(data, index="shot_type", values="shot_id", aggfunc='count')
shotType_scored = pd.pivot_table(data, index="shot_type", values="shot_made_flag", aggfunc='sum')

accuracy = shotType_scored/shot_type

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by shot type')

#### Notes:

* xxx

## 2.12. Season

In [None]:
season = pd.pivot_table(data, index="season", values="shot_id", aggfunc='count')
season_scored = pd.pivot_table(data, index="season", values="shot_made_flag", aggfunc='sum')

season_accuracy = season_scored/season

In [None]:
season_accuracy.plot.bar(title='Kobe Bryant\n% by shot type')

#### Notes: 

* xxx

# 3. kNN

** References: **
* http://scikit-learn.org/stable/modules/neighbors.html
* http://www.amazon.com/Building-Machine-Learning-Systems-Python/dp/1784392774/ref=sr_1_1?ie=UTF8&qid=1461733493&sr=8-1&keywords=luis+pedro+coelho+python
* https://www.dataquest.io/blog/k-nearest-neighbors/
* http://stats.stackexchange.com/questions/136272/how-to-handle-data-normalization-in-knn-when-new-test-data-is-received

## 3.1. New feature matrix

In [14]:
# create design matrix
dm = data[['loc_x', 'loc_y', 'shot_distance', 'period', 'season', 'minutes_remaining', 'seconds_remaining', 'game_date', 
           'matchup', 'shot_made_flag']]

# create validation matrix

V = validation_data[['loc_x', 'loc_y', 'shot_distance', 'period', 'season', 'minutes_remaining', 'seconds_remaining', 'game_date', 
           'matchup']]

V_shotID = validation_data['shot_id']

In [15]:
# modify design matrix and Validation matrix

# convert shot_distance from feet to meters
dm.loc[:,'shot_distance'] = dm['shot_distance'].apply(lambda x: x*0.3048)
V.loc[:,'shot_distance'] = V['shot_distance'].apply(lambda x: x*0.3048)


# add angle feature and clean NaN by assuming angle=0 when distance=0
dm.loc[:,'angle'] = pd.Series(np.degrees(np.arctan(dm['loc_x']/dm['loc_y'])))
dm['angle'].fillna(0, inplace=True)
V.loc[:,'angle'] = pd.Series(np.degrees(np.arctan(V['loc_x']/V['loc_y'])))
V['angle'].fillna(0, inplace=True)

# convert matchup to Home/Away
# Home=0, Away=1
dm.loc[:, 'Home'] = dm['matchup'].apply(get_home_away)
V.loc[:, 'Home'] = V['matchup'].apply(get_home_away)


# convert seasons to first, second etc
# needs: convert to date 
dm.loc[:,'season'] = dm['season'].apply(get_season_num)
V.loc[:,'season'] = V['season'].apply(get_season_num)

# convert minutes + seconds remaining to time remaining in quarter (in seconds)
dm.loc[:, 'time_remaining'] = dm['minutes_remaining']*60 + dm['seconds_remaining']
V.loc[:, 'time_remaining'] = V['minutes_remaining']*60 + V['seconds_remaining']

# clean dataframe
cols_to_delete = ['loc_x', 'loc_y', 'minutes_remaining', 'seconds_remaining', 'matchup']
dm.drop(cols_to_delete, axis=1, inplace=True)
V.drop(cols_to_delete, axis=1, inplace=True)

# temporary: also drop game_date
dm.drop('game_date', axis=1, inplace=True)
V.drop('game_date', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a 

In [16]:
# clean NaN in shot_made_flag column
dm.dropna(axis=0, how='any', inplace=True)

# make sure no NaNs in dm
assert dm.isnull().any().any()==False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


## 3.2. Model

* Model definition includes **normalization** and **cross-validation**.
* **Normalization**. We had been summing up lengths, areas, and dimensionless quantities, mixing up our units. Sometimes each feature has its own scale and can influence the estimation in different ways. We need to normalize all of the features to a common scale.
* http://stats.stackexchange.com/questions/136272/how-to-handle-data-normalization-in-knn-when-new-test-data-is-received
* http://scikit-learn.org/stable/modules/preprocessing.html
* http://stackoverflow.com/questions/16137816/scikit-learn-preprocessing-svm-with-multiple-classes-in-a-pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

kf = KFold(len(dm), n_folds=5, shuffle=True)

X = dm.drop('shot_made_flag', axis=1).values
Y = dm.shot_made_flag.values

# list of average logloss for every K, to plot
k_losses = []

for i in range(1,30):
        
    classifier = neighbors.KNeighborsClassifier(n_neighbors=i)

    # `means` will be a list of mean accuracies (one entry per fold)
    means = []
    
    # losses will be a list of mean loglosses (one entry per fold)
    losses = []

    for training,testing in kf:
        # normalizing data (source: Coelho and Richert 2015, pp. 46)
        classifier = Pipeline([('norm', StandardScaler()),('knn', classifier)])
        # We fit a model for this fold, then apply it to the testing data with `predict`:
        classifier.fit(X[training], Y[training])
        prediction = classifier.predict(X[testing])

        # np.mean on an array of booleans returns fraction of correct decisions for this fold:
        curmean = np.mean(prediction == Y[testing])
        means.append(curmean)
        
        # compute probabilities and logloss
        probabilities = classifier.predict_proba(X[testing])
        probs_make = probabilities[:,1]
        
        loss = logloss(Y[testing], probs_make)
        losses.append(loss)
        
#    print("n_neighbors: {}".format(i))
#    print("Mean accuracy: {:.1%}".format(np.mean(means)))
#    print("Mean LogLoss: {:.3}".format(np.mean(losses)))
    
    k_losses.append(np.mean(losses))

In [None]:
# plot logloss
plt.plot(k_losses, color='r', linewidth=2)
plt.xlabel("Neighbours")
plt.title("Log loss")

## 3.3. Set up submission

In [None]:
# use KNN model to predict labels for V
# use 4 neighbours (suggested by crossvalidation results above)

neighbs = 21

classifier = neighbors.KNeighborsClassifier(n_neighbors=neighbs)
classifier.fit(X, Y)

probabilities = classifier.predict_proba(V)
p_makes = probabilities[:,1]

In [None]:
# write submission file
write_kaggle_submission(V, p_makes)

# 4. SVC

**References: **
* http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
* http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html

## 4.1. New feature matrix

In [None]:
# create design matrix
dm = data[['loc_x', 'loc_y', 'shot_distance', 'period', 'season', 'minutes_remaining', 'seconds_remaining', 'game_date', 
           'matchup', 'shot_made_flag']]

# create validation matrix

V = validation_data[['loc_x', 'loc_y', 'shot_distance', 'period', 'season', 'minutes_remaining', 'seconds_remaining', 'game_date', 
           'matchup']]

V_shotID = validation_data['shot_id']

In [None]:
# modify design matrix and Validation matrix

# convert shot_distance from feet to meters
dm.loc[:,'shot_distance'] = dm['shot_distance'].apply(lambda x: x*0.3048)
V.loc[:,'shot_distance'] = V['shot_distance'].apply(lambda x: x*0.3048)


# add angle feature and clean NaN by assuming angle=0 when distance=0
dm.loc[:,'angle'] = pd.Series(np.degrees(np.arctan(dm['loc_x']/dm['loc_y'])))
dm['angle'].fillna(0, inplace=True)
V.loc[:,'angle'] = pd.Series(np.degrees(np.arctan(V['loc_x']/V['loc_y'])))
V['angle'].fillna(0, inplace=True)

# convert matchup to Home/Away
# Home=0, Away=1
dm.loc[:, 'Home'] = dm['matchup'].apply(get_home_away)
V.loc[:, 'Home'] = V['matchup'].apply(get_home_away)


# convert seasons to first, second etc
# needs: convert to date 
dm.loc[:,'season'] = dm['season'].apply(get_season_num)
V.loc[:,'season'] = V['season'].apply(get_season_num)

# convert minutes + seconds remaining to time remaining in quarter (in seconds)
dm.loc[:, 'time_remaining'] = dm['minutes_remaining']*60 + dm['seconds_remaining']
V.loc[:, 'time_remaining'] = V['minutes_remaining']*60 + V['seconds_remaining']

# clean dataframe
cols_to_delete = ['loc_x', 'loc_y', 'minutes_remaining', 'seconds_remaining', 'matchup']
dm.drop(cols_to_delete, axis=1, inplace=True)
V.drop(cols_to_delete, axis=1, inplace=True)

# temporary: also drop game_date
dm.drop('game_date', axis=1, inplace=True)
V.drop('game_date', axis=1, inplace=True)

In [None]:
# clean NaN in shot_made_flag column
dm.dropna(axis=0, how='any', inplace=True)

# make sure no NaNs in dm
assert dm.isnull().any().any()==False

## 4.2. Model

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

kf = KFold(len(dm), n_folds=5, shuffle=True)

# prepare data to predict shot_made_flag
X = dm.drop('shot_made_flag', axis=1).values
Y = dm.shot_made_flag.values

classifier = SVC(probability=True)

# list of average logloss for every kf, to plot
k_losses = []

# `means` will be a list of mean accuracies (one entry per fold)
means = []
    
# losses will be a list of mean loglosses (one entry per fold)
losses = []

for training,testing in kf:
    # normalizing data (source: Coelho and Richert 2015, pp. 46)
    classifier = Pipeline([('norm', StandardScaler()),('svc', classifier)])
    # We fit a model for this fold, then apply it to the testing data with `predict`:
    classifier.fit(X[training], Y[training])
    prediction = classifier.predict(X[testing])
    
    # we are not outputing this right now
    # np.mean on an array of booleans returns fraction of correct decisions for this fold:
    curmean = np.mean(prediction == Y[testing])
    means.append(curmean)
        
    # compute probabilities and logloss
    probabilities = classifier.predict_proba(X[testing])
    probs_make = probabilities[:,1]
        
    loss = logloss(Y[testing], probs_make)
    losses.append(loss)
        
k_losses.append(np.mean(losses))

In [32]:
k_losses

[0.66779793238240204]

## 4.3. Optimized model

* Parameters optimization
* Takes too much time thinking...

**References:**
* http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
* http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn import svm, grid_search

kf = KFold(len(dm), n_folds=5, shuffle=True)

# prepare data to predict shot_made_flag
X = dm.drop('shot_made_flag', axis=1).values
Y = dm.shot_made_flag.values

svr = SVC(probability=True)

# list of average logloss for every kf, to plot
k_losses = []

# `means` will be a list of mean accuracies (one entry per fold)
means = []
    
# losses will be a list of mean loglosses (one entry per fold)
losses = []

# prepare data to optimize parameters
parameters = {'C':[1, 10], 'gamma':[1, 10]}

clf = grid_search.GridSearchCV(svr, parameters)

for training,testing in kf:
    # normalizing data (source: Coelho and Richert 2015, pp. 46)
    classifier = Pipeline([('norm', StandardScaler()),('svc', clf)])
    # We fit a model for this fold, then apply it to the testing data with `predict`:
    classifier.fit(X[training], Y[training])
    prediction = classifier.predict(X[testing])
    
    # we are not outputing this right now
    # np.mean on an array of booleans returns fraction of correct decisions for this fold:
    curmean = np.mean(prediction == Y[testing])
    means.append(curmean)
        
    # compute probabilities and logloss
    probabilities = classifier.predict_proba(X[testing])
    probs_make = probabilities[:,1]
        
    loss = logloss(Y[testing], probs_make)
    losses.append(loss)
        
k_losses.append(np.mean(losses))

## 4.4. Set up submission

In [None]:
# write submission file
write_kaggle_submission(V, probs_makes)