In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn import neighbors
from sklearn.cross_validation import train_test_split

%matplotlib inline

### Useful functions ###

In [None]:
#convert shot flags to colors
def flag_colors(flag):
    """Map a shot outcome flag to a matplotlib color name.

    Returns 'green' when flag equals 1, 'red' when it equals 0 and
    'black' for anything else (for example NaN).
    """
    for outcome, color in ((1, 'green'), (0, 'red')):
        if flag == outcome:
            return color
    return 'black'

In [None]:
# convert matchup string to Home=1, away=0
# example: LAL @ POR: home=0
# example: LAL vs. POR: home=1

def get_home_away(string):
    """Encode a matchup string as a home/away flag.

    Parameters
    ----------
    string : str
        Matchup text such as 'LAL @ POR' (away) or 'LAL vs. POR' (home).

    Returns
    -------
    int
        0 when the text contains '@' (away), 1 when it contains 'vs' (home).

    Raises
    ------
    ValueError
        If the text contains neither marker. Earlier versions returned a
        placeholder string here, which put a non-numeric value into the
        feature column without any error.
    """
    if '@' in string:
        return 0  # away
    if 'vs' in string:
        return 1  # home
    raise ValueError("unrecognised matchup format: %r" % (string,))


In [None]:
# convert a season label in format '1999-00' to the number of seasons
# elapsed since the first season
# example: '1999-00': 3
def get_season_num(year, first_season=1996):
    """Return how many seasons after `first_season` the given season started.

    Parameters
    ----------
    year : str
        Season label such as '2009-10'; only the first four characters
        (the starting year) are used.
    first_season : int, optional
        Starting year that maps to 0. Defaults to 1996, the value that was
        previously hard-coded.

    Returns
    -------
    int
        Starting year of the season minus `first_season`.

    Raises
    ------
    ValueError
        If the first four characters are not an integer.
    """
    return int(year[:4]) - first_season
    
    

In [None]:
# quick sanity checks of the helper functions
assert 14 == get_season_num('2010-11')
for matchup, expected_flag in (('LAL @ POR', 0), ('LAL vs UTA', 1)):
    assert get_home_away(matchup) == expected_flag

# Import data

In [None]:
data = pd.read_csv('data.csv')

In [None]:
data.head()

In [None]:
list(data.columns.values)

# Notes

* Can we improve the missing data strategy?
* Are all the limitations to algorithm application verified?

* prevent leakage: train only on shots prior to the ones we're predicting (year)
* select features and then explore number of neighbours

* target: score=0.75

# Preprocessing

In [None]:
# TRAINING AND TEST DATA
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 0)

In [None]:
# MISSING DATA
# don't consider rows with shot_made_flag = NaN
train_data = train_data[np.isfinite(train_data['shot_made_flag'])]
test_data = test_data[np.isfinite(test_data['shot_made_flag'])]

# Exploring features

### Opponent

In [None]:
total_shots = pd.pivot_table(data, index="opponent", values="shot_id", aggfunc='count')

In [None]:
shots_scored = pd.pivot_table(data, index="opponent", values="shot_made_flag", aggfunc='sum')

In [None]:
accuracy = shots_scored/total_shots

In [None]:
accuracy.mean()

In [None]:
total_shots.plot.bar(title='Kobe Bryant\ntotal shots by opponent')

In [None]:
shots_scored.plot.bar(title='Kobe Bryant\nshots scored by opponent')

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by opponent')

#### Comments:

* The opponent doesn't seem to influence the accuracy

* However, there are spikes against teams that were weak for long stretches of his career: NYK, CLE, GSW(!), PHX, SAC

### Shots by distance

In [None]:
total_distance = pd.pivot_table(data, index="shot_distance", values="shot_id", aggfunc='count')

In [None]:
shots_scored = pd.pivot_table(data, index="shot_distance", values="shot_made_flag", aggfunc='sum')

In [None]:
accuracy = shots_scored/total_distance

In [None]:
total_distance.plot.bar(title='Kobe Bryant\ntotal shots by distance')
plt.axis([0, 30, 0, 6000])

In [None]:
shots_scored.plot.bar(title='Kobe Bryant\nshots scored by distance')
plt.axis([0, 30, 0, 3500])

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by distance')
plt.axis([0, 30, 0, 1])

#### Comments:

* ~~Only 30 appears to have a very different percentage~~ now looking sweeter, but still not good - certainly didn't miss half his dunks for his career!
* PM: totally agree!

### Minutes remaining

In [None]:
minutes = pd.pivot_table(data, index="minutes_remaining", values="shot_id", aggfunc='count')

In [None]:
shots_scored = pd.pivot_table(data, index="minutes_remaining", values="shot_made_flag", aggfunc='sum')

In [None]:
accuracy = shots_scored/minutes

In [None]:
minutes.plot.bar(title='Kobe Bryant\nAttempts byminutes remaining')

In [None]:
shots_scored.plot.bar(title='Kobe Bryant\nshots scored by minutes remaining')

In [None]:
accuracy.plot.bar(title='Kobe Bryant\n% by minutes remaining')
plt.axis([0, 11, 0, .5])

### Seconds remaining

In [None]:
seconds = pd.pivot_table(data, index=[ "seconds_remaining", 'period'], values="shot_id", aggfunc='count')

In [None]:
shots_scored = pd.pivot_table(data, index=[ "seconds_remaining", 'period'], values="shot_made_flag", aggfunc='sum')

In [None]:
accuracy = shots_scored/seconds
accuracy

### Shots by zone range

In [None]:
# number of attempts in each shot_zone_range category
shot_zones = pd.pivot_table(data, index="shot_zone_range", values="shot_id", aggfunc='count')

# share of all attempts per category, with the rows put in an explicit order
zone_order = ['Less Than 8 ft.',  u'8-16 ft.', u'16-24 ft.', u'24+ ft.', u'Back Court Shot']
shot_zones_pct = shot_zones/shot_zones.sum()
shot_zones_pct = shot_zones_pct.reindex(index=zone_order)
shot_zones_pct.plot.bar(title='Kobe Bryant\n% by zone range')

In [None]:
# plot every shot with a known outcome and colour it by result
# (flag_colors: 1 -> green, 0 -> red). The earlier version computed
# shot_colors but never used it, and drew only the made shots.

shots_labelled = data[data.shot_made_flag.notnull()]
shots_pos = shots_labelled[['loc_x', 'loc_y']]
# one colour per plotted row, in the same order as shots_pos
shot_colors = list(map(flag_colors, shots_labelled.shot_made_flag))
shots_pos.plot.scatter(x='loc_x', y='loc_y', s=10, c=shot_colors, ylim=(0,400), figsize=(20,10))


#### Comments:

* Front, 45 degrees or 90 degrees
* Heat map to understand accuracy

### Shots by action type

In [None]:
# number of attempts for each action_type value, shown as a bar chart
shots_actionType = data.pivot_table(index="action_type", values="shot_id", aggfunc='count')
shots_actionType.plot.bar(title='Kobe Bryant\naction type')

### Combined shot type

In [None]:
shots_combined = pd.pivot_table(data, index="combined_shot_type", values="shot_id", aggfunc='count')
shots_combined.plot.bar(title='Kobe Bryant\ncombined shot type')

In [None]:
combined_scored = pd.pivot_table(data, index="combined_shot_type", values="shot_made_flag", aggfunc='sum')
combined_scored.plot.bar(title='Kobe Bryant\nscored combined shot type')

In [None]:
accuracy = combined_scored / shots_combined
accuracy.plot.bar(title='Kobe Bryant\n% combined shot type')

### Shots by period

In [None]:
shots_period = pd.pivot_table(data, index="period", values="shot_id", aggfunc='count')
shots_period.plot.bar(title='Kobe Bryant\nshots by period')

In [None]:
period_scored = pd.pivot_table(data, index="period", values="shot_made_flag", aggfunc='sum')
period_scored.plot.bar(title='Kobe Bryant\nscored by period')

In [None]:
accuracy = period_scored/shots_period
accuracy.plot.bar(title='Kobe Bryant\n% by period')

### Playoffs

In [None]:
# shots_playoffs / playoffs_scored were used here without ever being defined,
# so the cell failed on a fresh kernel; build them like the other sections.
# NOTE(review): assumes the dataset has a 'playoffs' column - confirm against data.columns
shots_playoffs = pd.pivot_table(data, index="playoffs", values="shot_id", aggfunc='count')
playoffs_scored = pd.pivot_table(data, index="playoffs", values="shot_made_flag", aggfunc='sum')

# accuracy over shots with a known outcome only (the mean of the 0/1 flag skips NaN)
accuracy = pd.pivot_table(data, index="playoffs", values="shot_made_flag", aggfunc='mean')
accuracy.plot.bar(title='Kobe Bryant\n% by playoff')
accuracy

### Shot type

In [None]:
shot_type = pd.pivot_table(data, index="shot_type", values="shot_id", aggfunc='count')
shot_type.plot.bar(title='Kobe Bryant\nshot type')

In [None]:
shotType_scored = pd.pivot_table(data, index="shot_type", values="shot_made_flag", aggfunc='sum')
shotType_scored.plot.bar(title='Kobe Bryant\nscored by shot type')

In [None]:
accuracy = shotType_scored/shot_type
accuracy.plot.bar(title='Kobe Bryant\n% by shot type')

In [None]:
# attempts and made shots per season. These two tables used to be commented
# out while season_accuracy still referenced them, so the cell raised NameError.
season = pd.pivot_table(data, index="season", values="shot_id", aggfunc='count')
#ax1 = season.plot.bar(title='Kobe Bryant\nSeason')

season_scored = pd.pivot_table(data, index="season", values="shot_made_flag", aggfunc='sum')
#ax2 = season_scored.plot.bar(title='Kobe Bryant\nscored by season')

# accuracy over shots with a known outcome only (the mean of the 0/1 flag skips
# NaN); the y-range is widened because the corrected values are higher than
# made / all attempts
season_accuracy = pd.pivot_table(data, index="season", values="shot_made_flag", aggfunc='mean')
ax3 = season_accuracy.plot.bar(title='Kobe Bryant\n% by season', ylim=(.30,.50))

# TODO: add subplots

## On temporal series (in construction)

In [None]:
data['game_date'][0]

In [None]:
time.strptime("00-12-30", "%y-%m-%d")

## Dunks issue

* Kobe scored 780 dunks from 00/01 to 15/16
* http://www.basketball-reference.com/players/b/bryanko01.html
* Kobe's career started in 96/97

In [None]:
shots_combined = pd.pivot_table(data, index="combined_shot_type", values="shot_id", aggfunc='count')

In [None]:
shots_combined

In [None]:
combined_scored = pd.pivot_table(data, index="combined_shot_type", values="shot_made_flag", aggfunc='sum')

In [None]:
combined_scored

In [None]:
accuracy = combined_scored/shots_combined

In [None]:
accuracy

In [None]:
accuracy.plot.bar()

#### Conclusion:
* We have more successful dunks than basketball reference, which is possible because we have data from 96/97 to 00/01
* In 10 dunks, Kobe misses 2...

### Missing Analysis

* Home/away
* Time of the year
* Location

# 1. kNN

**References:**
* http://scikit-learn.org/stable/modules/neighbors.html
* http://www.amazon.com/Building-Machine-Learning-Systems-Python/dp/1784392774/ref=sr_1_1?ie=UTF8&qid=1461733493&sr=8-1&keywords=luis+pedro+coelho+python

In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=2)

In [None]:
x1 = train_data['loc_x'].reshape(len(train_data['loc_x']),1)
x2 = train_data['loc_y'].reshape(len(train_data['loc_y']),1)
x3 = train_data['shot_distance'].reshape(len(train_data['shot_distance']),1)

In [None]:
x = np.concatenate((x1,x2,x3), axis=1)
y = train_data['shot_made_flag'].reshape(len(train_data['shot_made_flag']),1)

print x.shape
print y.shape

In [None]:
knn.fit(x,y.ravel())

In [None]:
#knn.predict([0])

In [None]:
#knn.predict_proba([0])

## 1.1 kNN evaluation

In [None]:
x1_test = test_data['loc_x'].reshape(len(test_data['loc_x']),1)
x2_test = test_data['loc_y'].reshape(len(test_data['loc_y']),1)
x3_test = test_data['shot_distance'].reshape(len(test_data['shot_distance']),1)

In [None]:
x_test = np.concatenate((x1_test,x2_test,x3_test), axis=1)
y_test = test_data['shot_made_flag'].reshape(len(test_data['shot_made_flag']),1)

In [None]:
knn.score(x_test,y_test)

In [None]:
# try different numbers of neighbours
# NOTE(review): k is compared on the test split here, so the best score is
# an optimistic estimate - a separate validation split would avoid that
k_values = list(range(1, 100))  # range instead of the Python 2-only xrange
l = []
for i in k_values:
    knn = neighbors.KNeighborsClassifier(n_neighbors=i)
    knn.fit(x,y.ravel())
    
    sc = knn.score(x_test,y_test)
    l.append(sc)
    
# plot against k itself; plt.plot(l) alone would label k=1 as x=0
plt.plot(k_values, l)

## Knn for distance, shot_type

In [None]:
train_data.replace(to_replace='2PT Field Goal', value=2, inplace=True)
train_data.replace(to_replace='3PT Field Goal', value=3, inplace=True)

In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
x1 = train_data['shot_distance'].reshape(len(train_data['shot_distance']),1)
x2 = train_data['shot_type'].reshape(len(train_data['shot_type']),1)

x = np.concatenate((x1,x2), axis=1)
y = train_data['shot_made_flag'].reshape(len(train_data['shot_made_flag']),1)

knn.fit(x,y.ravel())

In [None]:
x1_test = train_data['shot_distance'].reshape(len(train_data['shot_distance']),1)
x2_test = train_data['shot_type'].reshape(len(train_data['shot_type']),1)

x_test = np.concatenate((x1_test,x2_test), axis=1)
y_test = train_data['shot_made_flag'].reshape(len(train_data['shot_made_flag']),1)

In [None]:
knn.score(x_test, y_test)

In [None]:
knn.fit(x1, y.ravel())
knn.score(x1_test, y_test)

In [None]:
# try different numbers of neighbours
l = []
for i in xrange(1,100):
    knn = neighbors.KNeighborsClassifier(n_neighbors=i)
    knn.fit(x,y.ravel())
    
    sc = knn.score(x_test,y_test)
    l.append(sc)
    
plt.plot(l)

## Playing with correlation

In [None]:
data[['loc_x', 'loc_y']];

In [None]:
data['angle'] = np.degrees(np.arctan(data['loc_x']/data['loc_y']))
RO = data.corr()

In [None]:
RO.loc['shot_made_flag']

In [None]:
data['shot_id' ]

## New feature matrix

In [None]:
# create design matrix
# .copy() makes dm an independent frame, so later column assignments on dm
# neither raise chained-assignment warnings nor touch `data`
dm = data[['loc_x', 'loc_y', 'shot_distance', 'period', 'season', 'minutes_remaining', 'seconds_remaining', 'game_date', 'matchup', 'shot_made_flag']].copy()

In [None]:
# modify design matrix
# assign() returns a new frame, so nothing is written through a slice of
# `data` and no chained in-place call (fillna/drop with inplace=True) is needed

dm = dm.assign(
    # convert shot_distance from feet to meters
    shot_distance=dm['shot_distance'] * 0.3048,

    # angle feature: arctan2 keeps the correct quadrant for negative loc_y and
    # gives 0 when loc_x = loc_y = 0; fillna(0) covers missing coordinates
    angle=np.degrees(np.arctan2(dm['loc_x'], dm['loc_y'])).fillna(0),

    # convert matchup to Home/Away
    # Home=1, Away=0 (see get_home_away)
    Home=dm['matchup'].apply(get_home_away),

    # convert seasons to first, second etc
    # needs: convert to date
    season=dm['season'].apply(get_season_num),

    # convert minutes + seconds remaining to time remaining in quarter (in seconds)
    time_remaining=dm['minutes_remaining']*60 + dm['seconds_remaining']
)

# clean dataframe
cols_to_delete = ['loc_x', 'loc_y', 'minutes_remaining', 'seconds_remaining', 'matchup']
dm = dm.drop(cols_to_delete, axis=1)

# temporary: also drop game_date
dm = dm.drop('game_date', axis=1)

In [None]:
# clean NaN in shot_made_flag column
# (how='any' drops a row when ANY column is NaN, not only shot_made_flag)
dm = dm.dropna(axis=0, how='any')

# make sure no NaNs in dm
assert not dm.isnull().any().any()

# KNN with complete design matrix

In [None]:
# TRAINING AND TEST DATA
# 80/20 split of the cleaned design matrix with a fixed seed
train_data, test_data = train_test_split(dm, test_size = 0.2, random_state = 0)

# features: every column except the target
xtrain = train_data.drop('shot_made_flag', axis=1)
xtest = test_data.drop('shot_made_flag', axis=1)

# targets as (n_samples, 1) arrays; .values is needed because Series.reshape
# is no longer available in pandas
ytrain = train_data['shot_made_flag'].values.reshape(-1, 1)
ytest = test_data['shot_made_flag'].values.reshape(-1, 1)

In [None]:
# score for multiple neighbours
# NOTE(review): the features are not scaled, so columns with large ranges
# dominate the kNN distance - consider standardising before fitting
k_values = list(range(1,60))
l=[]
for i in k_values:
    knn = neighbors.KNeighborsClassifier(n_neighbors=i)
    knn.fit(xtrain, ytrain.ravel())

    sc = knn.score(xtest, ytest.ravel())
    l.append(sc)
    
# plot against k itself; plt.plot(l) alone would label k=1 as x=0
plt.plot(k_values, l)