In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from datetime import timedelta
from sklearn import model_selection, tree

In [2]:
filename = 'shot_logs.csv'
data = pd.read_csv(filename)
data = data.dropna()
print(data.shape)
data.head()

(122502, 21)


Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,...,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,...,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148
5,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,6,2,8:15,9.1,2,...,18.4,2,missed,"Williams, Deron",101114,2.6,0,0,brian roberts,203148


In [3]:

#from https://stackoverflow.com/questions/50308629/python-pandas-column-convert-minutes-to-second
def timeToSeconds(x):
    mins, secs = map(float, x.split(':'))
    td = timedelta(minutes=mins, seconds=secs)
    return td.total_seconds()

In [51]:
# begin to separate the data into features and labels
labels = data['SHOT_RESULT'].map(dict(made=1, missed=0))
feature_data = data.drop(['SHOT_RESULT', 'MATCHUP', 'GAME_ID', 'CLOSEST_DEFENDER','player_name', 'LOCATION','W', 'PTS', 'FGM'], axis=1)

# go back to adapt Location and W to be usable with the DT
feature_data['GAME_CLOCK'] = feature_data['GAME_CLOCK'].apply(timeToSeconds)
#feature_data['SHOT_CLOCK'].apply(timeToSeconds)

feature_data = feature_data.drop(['CLOSEST_DEFENDER_PLAYER_ID', 'player_id', 'FINAL_MARGIN', 'SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK'], axis=1)

feature_data.head()

Unnamed: 0,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
0,10.8,2,1.9,7.7,2,1.3
1,3.4,0,0.8,28.2,3,6.1
3,10.3,2,1.9,17.2,2,3.4
4,10.9,2,2.7,3.7,2,1.1
5,9.1,2,4.4,18.4,2,2.6


In [52]:
x_train, x_test, y_train, y_test = sk.model_selection.train_test_split(feature_data, labels, test_size=.8, train_size=.2)
dTree = sk.tree.DecisionTreeClassifier(criterion='entropy')
dTree.fit(x_train, y_train)
print(sk.metrics.accuracy_score(y_test, dTree.predict(x_test)))

0.5408767168016979


In [44]:
# use gini instead
classifier = sk.tree.DecisionTreeClassifier(criterion='gini')
classifier = classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
print('Accuracy on test data using Gini as split criterion is:', (sk.metrics.accuracy_score(y_test, predictions)))
print()

# max depth
depth=4
classifier = sk.tree.DecisionTreeClassifier(criterion='entropy', max_depth=depth)
classifier = classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
print('Accuracy on test data with max tree depth', depth, 'is:', (sk.metrics.accuracy_score(y_test, predictions)))
print()

# minimum samples in a split set to 5
classifier = sk.tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=5)
classifier = classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
print('Accuracy on test data with max tree minimum sample split 5 is:', (sk.metrics.accuracy_score(y_test, predictions)))

Accuracy on test data using Gini as split criterion is: 0.5409889594089916

Accuracy on test data with max tree depth 4 is: 0.6106303952980552

Accuracy on test data with max tree minimum sample split 5 is: 0.5412134446235791


In [58]:
from sklearn.model_selection import cross_val_score

classifier = sk.tree.DecisionTreeClassifier(criterion='entropy', max_depth=10)

scores = cross_val_score(classifier, feature_data, labels, cv=5) 
                                           
print("Average Accuracy:", scores.mean()*100)

Average Accuracy: 61.15654867897936


In [57]:
from sklearn.model_selection import GridSearchCV

classifier = sk.tree.DecisionTreeClassifier()

params = {"max_depth": [5, 10, 15, 20],
          "min_samples_leaf": [5, 10, 15, 20],
          "max_features": [3, 4, 5, 6]}

grid_search = GridSearchCV(classifier, params, cv=5, scoring='accuracy')

grid_search.fit(feature_data, labels)

print(grid_search.best_params_)

print("Average Accuracy:", grid_search.best_score_*100)

{'max_depth': 5, 'max_features': 6, 'min_samples_leaf': 10}
Average Accuracy: 61.434910228570416


In [60]:
scores = cross_val_score(grid_search, feature_data, labels, cv=10) 
                                           
print("Average Accuracy:", scores.mean()*100)

Average Accuracy: 61.44959901649012
