# Import libraries + dataset

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from datetime import timedelta
from sklearn import model_selection, tree

In [2]:
filename = 'shot_logs.csv'
data = pd.read_csv(filename)
data = data.dropna() # SHOT_CLOCK originally has NaNs. I wonder why?
print(data.shape)
data.head()

(122502, 21)


Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,...,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,...,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148
5,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,6,2,8:15,9.1,2,...,18.4,2,missed,"Williams, Deron",101114,2.6,0,0,brian roberts,203148


In [18]:
# The attributes:

# GAME_ID
# MATCHUP  =  Date - Teams
# LOCATION  =  Home or Away
# W  =  Win/Loss
# FINAL_MARGIN
# SHOT_NUMBER  =  This shot's number (per player, per game)
# PERIOD
# GAME_CLOCK
# SHOT_CLOCK
# DRIBBLES  =  Dribbles before shot
# TOUCH_TIME
# SHOT_DIST
# PTS_TYPE  =  Attempted type of shot
# SHOT_RESULT
# CLOSEST_DEFENDER
# CLOSEST_DEFENDER_PLAYER_ID
# CLOSE_DEF_DIST  =  Distance to the closest defender
# FGM  =  Field Goals Made?
# PTS  =  Points Scored
# player_name
# player_id
        
# Some code I was using to visualize the different values of each column
# for col in data:
#     uniques = data[col].unique()
#     print('{} | # uniques: {}'.format(col, len(uniques)))
#     print('{}\n'.format(uniques[:3]))

# Perform Feature Engineering

In [4]:
# fr https://stackoverflow.com/questions/50308629/python-pandas-column-convert-minutes-to-second
def timeToSeconds(x):
    mins, secs = map(float, x.split(':'))
    td = timedelta(minutes=mins, seconds=secs)
    return td.total_seconds()

In [5]:
# begin to separate the data into features and labels
labels = data['SHOT_RESULT'].map(dict(made=1, missed=0))

### Try out keeping different features by running different cells
The model code should be based solely on feature_data to easily test different feature extractions

In [6]:
# drop_features = ['SHOT_RESULT', 'MATCHUP', 'GAME_ID', 'CLOSEST_DEFENDER','player_name', 'LOCATION', 'W', 'PTS', 'FGM']


In [7]:
# There are a lot more features we want to get rid of than use -- let's specify which ones we want to keep
# desired_features = ['SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST', 'PTS_TYPE', 'CLOSE_DEF_DIST', 'LOCATION', 'W']

In [8]:
desired_features = ['SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST', 'PTS_TYPE', 'CLOSE_DEF_DIST']

In [9]:
# desired_features = [?]

In [10]:
# feature_data = data.drop(drop_features, axis=1)
feature_data = data.loc[:, desired_features]

# Reads game clock into seconds
# feature_data['GAME_CLOCK'] = data['GAME_CLOCK'].apply(timeToSeconds)

# One hot encodes any remaining categorical variables (like Location or W)
feature_data = pd.get_dummies(feature_data)

feature_data.head()

Unnamed: 0,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
0,10.8,2,1.9,7.7,2,1.3
1,3.4,0,0.8,28.2,3,6.1
3,10.3,2,1.9,17.2,2,3.4
4,10.9,2,2.7,3.7,2,1.1
5,9.1,2,4.4,18.4,2,2.6


# === Models ===

## Decision Tree

In [11]:
import sklearn
from sklearn.tree import DecisionTreeClassifier

# Essentially a playground for trying different parameters of decision trees
# These should probably be run with the cross validation of the cell below tbh, but these are just examples
train_features, test_features, train_labels, test_labels = sklearn.model_selection.train_test_split(feature_data, labels, test_size=0.20)

# use gini
classifier = sk.tree.DecisionTreeClassifier(criterion='gini')
classifier = classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)
print('Accuracy on test data using Gini as split criterion is:', (sk.metrics.accuracy_score(test_labels, predictions)))
print()

# max depth
depth=4
classifier = sk.tree.DecisionTreeClassifier(criterion='entropy', max_depth=depth)
classifier = classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)
print('Accuracy on test data with max tree depth', depth, 'is:', (sk.metrics.accuracy_score(test_labels, predictions)))
print()

# minimum samples in a split set to 5
classifier = sk.tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=5)
classifier = classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)
print('Accuracy on test data with max tree minimum sample split 5 is:', (sk.metrics.accuracy_score(test_labels, predictions)))

Accuracy on test data using Gini as split criterion is: 0.5451614219827762

Accuracy on test data with max tree depth 4 is: 0.6136076078527407

Accuracy on test data with max tree minimum sample split 5 is: 0.5410799559201666


In [12]:
# Run cross validation with chosen hyperparameters

from sklearn.model_selection import cross_val_score

classifier = sk.tree.DecisionTreeClassifier(criterion='entropy', max_depth=10)
scores = cross_val_score(classifier, feature_data, labels, cv=5) 
                                           
print("Average Accuracy:", scores.mean()*100)

Average Accuracy: 61.14103854153392


In [13]:
# Inner loop of nested cross validation - determine what are likely the best hyperparams

from sklearn.model_selection import GridSearchCV

classifier = sk.tree.DecisionTreeClassifier()
params = {"max_depth": [5, 10, 15, 20],
          "min_samples_leaf": [5, 10, 15, 20],
          "max_features": [3, 4, 5, 6]}

grid_search = GridSearchCV(classifier, params, cv=5, scoring='accuracy')
grid_search.fit(feature_data, labels)

print(grid_search.best_params_)
print("Average Accuracy:", grid_search.best_score_*100)

{'max_depth': 5, 'max_features': 5, 'min_samples_leaf': 15}
Average Accuracy: 61.48551894175082


In [14]:
# The Real Deal: Nested Cross Validation

scores = cross_val_score(grid_search, feature_data, labels, cv=10) 

print(grid_search.best_params_)
print("Average Accuracy:", scores.mean()*100)

{'max_depth': 5, 'max_features': 5, 'min_samples_leaf': 15}
Average Accuracy: 61.40796836243272


## K Nearest Neighbors

## Naive Bayes

In [20]:
# From the HW
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict

classifier = GaussianNB()

# Cross validation accuracy
scores = cross_val_score(classifier, feature_data, labels, cv=10) 
print("Average Accuracy:", scores.mean()*100)

# Cross validation accuracy + other metrics
predictions = cross_val_predict(classifier, feature_data, labels, cv=10)
matrix = sklearn.metrics.confusion_matrix(labels, predictions)
report = sklearn.metrics.classification_report(labels, predictions)
print(report)

Average Accuracy: 57.91334005220732
              precision    recall  f1-score   support

           0       0.62      0.59      0.60     66622
           1       0.54      0.57      0.55     55880

    accuracy                           0.58    122502
   macro avg       0.58      0.58      0.58    122502
weighted avg       0.58      0.58      0.58    122502



## SVM

## Neural Network

## Random Forest

Scratch Notes I had on Classification Models in case someone finds them useful:

#### Decision Trees
- Choose Tree Depth
- Different ways to split categorical (half and half, one and the rest, ...)
- Random Forests

#### K Nearest Neighbors
- Choose K
- Data Normalization
- Curse of Dimensionality
- Class imbalance worries
- Feature selection very important

#### Naive Bayes
-    

#### SVM
- Choose cost for soft-margin
- Choose kernel functions
- Data Normalization
- Onehot encoding for categorical

#### Ensemble Techniques
- Bagging (give each model a sample of data)
- Boosting (train models iteratively on the subset of data the last one didn't do well on)
- Stacking (use one model's predictions as part of input to next model)

#### Neural Network
- Data Normalization
- Onehot encoding for categorical
- Takes a lot of data to train