# Import libraries + dataset

In [8]:
import pandas as pd
import numpy as np
# import sklearn as sk
from datetime import timedelta
# from sklearn import model_selection, tree

In [9]:
filename = 'shot_logs.csv'
data = pd.read_csv(filename)
data = data.dropna() # SHOT_CLOCK originally has NaNs. I wonder why?
print(data.shape)
data.head()

(122502, 21)


Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,...,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,...,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148
5,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,6,2,8:15,9.1,2,...,18.4,2,missed,"Williams, Deron",101114,2.6,0,0,brian roberts,203148


In [3]:
# The attributes:

# GAME_ID
# MATCHUP  =  Date - Teams
# LOCATION  =  Home or Away
# W  =  Win/Loss
# FINAL_MARGIN
# SHOT_NUMBER  =  This shot's number (per player, per game)
# PERIOD
# GAME_CLOCK
# SHOT_CLOCK
# DRIBBLES  =  Dribbles before shot
# TOUCH_TIME
# SHOT_DIST
# PTS_TYPE  =  Attempted type of shot
# SHOT_RESULT
# CLOSEST_DEFENDER
# CLOSEST_DEFENDER_PLAYER_ID
# CLOSE_DEF_DIST  =  Distance to the closest defender
# FGM  =  Field Goals Made?
# PTS  =  Points Scored
# player_name
# player_id
        
# Some code I was using to visualize the different values of each column
# for col in data:
#     uniques = data[col].unique()
#     print('{} | # uniques: {}'.format(col, len(uniques)))
#     print('{}\n'.format(uniques[:3]))

# Perform Feature Engineering

In [10]:
# fr https://stackoverflow.com/questions/50308629/python-pandas-column-convert-minutes-to-second
def timeToSeconds(x):
    mins, secs = map(float, x.split(':'))
    td = timedelta(minutes=mins, seconds=secs)
    return td.total_seconds()

In [11]:
# begin to separate the data into features and labels
labels = data['SHOT_RESULT'].map(dict(made=1, missed=0))

### Try out keeping different features by running different cells
The model code should be based solely on feature_data to easily test different feature extractions

In [6]:
# drop_features = ['SHOT_RESULT', 'MATCHUP', 'GAME_ID', 'CLOSEST_DEFENDER','player_name', 'LOCATION', 'W', 'PTS', 'FGM']


In [7]:
# There are a lot more features we want to get rid of than use -- let's specify which ones we want to keep
# desired_features = ['SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST', 'PTS_TYPE', 'CLOSE_DEF_DIST', 'LOCATION', 'W']

In [5]:
desired_features = ['SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST', 'PTS_TYPE', 'CLOSE_DEF_DIST']

In [9]:
# desired_features = [?]

In [12]:
# feature_data = data.drop(drop_features, axis=1)
feature_data = data.loc[:, desired_features]

# Reads game clock into seconds
# feature_data['GAME_CLOCK'] = data['GAME_CLOCK'].apply(timeToSeconds)

# One hot encodes any remaining categorical variables (like Location or W)
feature_data = pd.get_dummies(feature_data)

feature_data.head()

Unnamed: 0,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
0,10.8,2,1.9,7.7,2,1.3
1,3.4,0,0.8,28.2,3,6.1
3,10.3,2,1.9,17.2,2,3.4
4,10.9,2,2.7,3.7,2,1.1
5,9.1,2,4.4,18.4,2,2.6


# === Models ===

In [None]:
import sklearn as sk

# Tools
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Parallelism
from sklearn.utils import parallel_backend


## Decision Tree

In [30]:
# Essentially a playground for trying different parameters of decision trees
# These should probably be run with the cross validation of the cell below tbh, but these are just examples
train_features, test_features, train_labels, test_labels = sk.model_selection.train_test_split(feature_data, labels, test_size=0.20)

# use gini
classifier = DecisionTreeClassifier(criterion='gini')
classifier = classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)
print('Accuracy on test data using Gini as split criterion is:', (sk.metrics.accuracy_score(test_labels, predictions)))
print()

# max depth
depth=4
classifier = DecisionTreeClassifier(criterion='entropy', max_depth=depth)
classifier = classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)
print('Accuracy on test data with max tree depth', depth, 'is:', (sk.metrics.accuracy_score(test_labels, predictions)))
print()

# minimum samples in a split set to 5
classifier = DecisionTreeClassifier(criterion='entropy', min_samples_split=5)
classifier = classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)
print('Accuracy on test data with max tree minimum sample split 5 is:', (sk.metrics.accuracy_score(test_labels, predictions)))

Accuracy on test data using Gini as split criterion is: 0.5394881841557487

Accuracy on test data with max tree depth 4 is: 0.6142198277621321

Accuracy on test data with max tree minimum sample split 5 is: 0.5432431329333497


In [40]:
# Run cross validation with chosen hyperparameters

classifier = DecisionTreeClassifier(criterion='entropy', max_depth=10)
with parallel_backend('loky'):
    scores = cross_val_score(classifier, feature_data, labels, cv=5)
                                           
print("Average Accuracy:", scores.mean()*100)

Average Accuracy: 61.137773135457195


In [14]:
# Inner loop of nested cross validation - determine what are likely the best hyperparams

classifier = DecisionTreeClassifier()
params = {"max_depth": [5, 10, 15, 20],
          "min_samples_leaf": [5, 10, 15, 20],
          "max_features": [3, 4, 5, 6]}

grid_search = GridSearchCV(classifier, params, cv=5, scoring='accuracy')
with parallel_backend('loky'):
    grid_search.fit(feature_data, labels)

print(grid_search.best_params_)
print("Average Accuracy:", grid_search.best_score_*100)

{'max_depth': 5, 'max_features': 6, 'min_samples_leaf': 10}
Average Accuracy: 61.434910228570416


In [15]:
# The Real Deal: Nested Cross Validation

with parallel_backend('loky'):
    scores = cross_val_score(grid_search, feature_data, labels, cv=10) 

print(grid_search.best_params_)
print("Average Accuracy:", scores.mean()*100)

{'max_depth': 5, 'max_features': 6, 'min_samples_leaf': 10}
Average Accuracy: 61.446333110666515


## K Nearest Neighbors

In [17]:
scaler = sk.preprocessing.StandardScaler()
PCA = sk.decomposition.PCA()
knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)
pipe = Pipeline([('scaler' , scaler ), ('pca', PCA), ('knn', knn) ] )

with parallel_backend('loky'):
    accuracies = cross_val_score( pipe, feature_data, labels, cv=5)
    
print("Accuracies: ", accuracies.mean()*100)

Accuracies:  57.211308026577846


In [21]:
param_grid = {
    #'pca__n_components': list(range(5, 19)),
    'knn__n_neighbors': list(range(1, 7))
}
gs = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

with parallel_backend('loky'):
    gs.fit(feature_data, labels)

print(gs.best_params_)
print(gs.best_score_)

duration = 7.195174217224121
{'knn__n_neighbors': 6}
0.5802353943071046


In [25]:
with parallel_backend('loky'):
    accuracies = cross_val_score( gs, feature_data, labels, cv=5)
print("Accuracies: ", accuracies.mean()*100)

Accuracies:  58.028437356576035


## Naive Bayes

In [26]:
# From the HW
classifier = GaussianNB()

# Cross validation accuracy
scores = cross_val_score(classifier, feature_data, labels, cv=10) 
print("Average Accuracy:", scores.mean()*100)

duration = 0.3098483085632324
Average Accuracy: 57.91334005220732


In [28]:
# Cross validation accuracy + other metrics
predictions = cross_val_predict(classifier, feature_data, labels, cv=10)

matrix = sk.metrics.confusion_matrix(labels, predictions)
report = sk.metrics.classification_report(labels, predictions)
print(report)

duration = 0.3035438060760498
              precision    recall  f1-score   support

           0       0.62      0.59      0.60     66622
           1       0.54      0.57      0.55     55880

    accuracy                           0.58    122502
   macro avg       0.58      0.58      0.58    122502
weighted avg       0.58      0.58      0.58    122502



## SVM

In [None]:
scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
svc = SVC()
pipeline = Pipeline(steps=[('scaler', scaler), ('PCA', pca), ('SVC', svc)])

param_grid = {
    'PCA__n_components': list(range(2, 6)),
    'SVC__kernel': ['linear', 'rbf', 'poly']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)

with parallel_backend('loky'):
    predictions = cross_val_predict(grid_search, feature_data, labels, cv=10)

avg_accuracy = sk.metrics.accuracy_score(labels, predictions)
print("Average Accuracy:", avg_accuracy*100)
report = sk.metrics.classification_report(labels, predictions)
print(report)

## Neural Network

In [None]:
scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
nn = MLPClassifier()
pipeline = Pipeline(steps=[('scaler', scaler), ('PCA', pca), ('NN', nn)])

param_grid = {
    'PCA__n_components': list(range(2, 6)),
    'NN__hidden_layer_sizes': [(8, ), (16, ), (32, ), (64, )],
    'NN__activation': ['logistic', 'tanh', 'relu']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
with parallel_backend('loky'):
    grid_search.fit(feature_data, labels)

print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
with parallel_backend('loky'):
    predictions = cross_val_predict(grid_search, feature_data, labels, cv=5)

avg_accuracy = sk.metrics.accuracy_score(labels, predictions)
print("Average Accuracy:", avg_accuracy*100)
report = sk.metrics.classification_report(labels, predictions)
print(report)

In [17]:
scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
classifier = MLPClassifier()
pipeline = Pipeline(steps=[('scaler', scaler), ('PCA', pca), ('mlp', classifier)])

# Find the best hyperparamters for neural net
param_grid = {
    'PCA__n_components': list(range(2, 6)),
    'mlp__activation' : ['identity', 'logistic', 'tanh', 'relu'],
}

# Train and test data
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

with parallel_backend('loky'):
    grid_search.fit(feature_data, labels)
    
# print best parameters
print(grid_search.best_params_)


{'PCA__n_components': 4, 'mlp__activation': 'relu'}


## Random Forest

In [None]:
classifier = RandomForestClassifier()

# Find the best hyperparamters for random forest
param_grid = {
    'criterion' : ['gini', 'entropy'],
    'n_estimators' : [50, 100, 150, 200, 250, 300],
    'min_samples_split' : [2, 3, 4, 5, 6],
    'bootstrap' : [True, False]
}

# Train and test data
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

with parallel_backend('loky'):
    grid_search.fit(feature_data, labels)
    
# print best parameters
print(grid_search.best_params_)

with parallel_backend('loky'):
    predictions = cross_val_predict(grid_search, feature_data, labels, cv=5)

# print results
avg_accuracy = sk.metrics.accuracy_score(labels, predictions)
print("Average Accuracy:", avg_accuracy*100)
report = sk.metrics.classification_report(labels, predictions)
print(report)

Scratch Notes I had on Classification Models in case someone finds them useful:

#### Decision Trees
- Choose Tree Depth
- Different ways to split categorical (half and half, one and the rest, ...)
- Random Forests

#### K Nearest Neighbors
- Choose K
- Data Normalization
- Curse of Dimensionality
- Class imbalance worries
- Feature selection very important

#### Naive Bayes
-    

#### SVM
- Choose cost for soft-margin
- Choose kernel functions
- Data Normalization
- Onehot encoding for categorical

#### Ensemble Techniques
- Bagging (give each model a sample of data)
- Boosting (train models iteratively on the subset of data the last one didn't do well on)
- Stacking (use one model's predictions as part of input to next model)

#### Neural Network
- Data Normalization
- Onehot encoding for categorical
- Takes a lot of data to train