In [None]:
import os
import pandas as pd

hudl_logs_dir = "/playpen-storage/levlevi/contextualized-shot-quality-analysis/data/nba/result-hidden-split/hudl-game-logs"
hudl_logs_fps = [os.path.join(hudl_logs_dir, f) for f in os.listdir(hudl_logs_dir)]
ex = hudl_logs_fps[0]

df = pd.read_csv(ex, delimiter=';')
shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
shot_attempts_df = df.loc[shots_mask].copy()
shot_attempts_df.iloc[0]

In [None]:
from tqdm import tqdm

def generate_features(game_logs_dir: str):
    
    features = pd.DataFrame(columns=[
        "attempt_type", 
        "player_id", 
        "team_id", 
        "period",
        "pos_x",
        "pos_y",
        "shot_made", 
        ])
    
    hudl_logs_fps = [os.path.join(game_logs_dir, f) for f in os.listdir(game_logs_dir)]
    for fp in tqdm(hudl_logs_fps):

        df = pd.read_csv(fp, delimiter=';')
        shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
        
        shot_attempts_df = df.loc[shots_mask, [
            'action_name', 
            'player_id', 
            'team_id', 
            'half',
            "pos_x",
            "pos_y",
            ]].copy()
        
        shot_attempts_df['shot_made'] = df.loc[shots_mask, "action_name"].str.contains("\+", regex=True)
        shots_attempts_features_df = shot_attempts_df.rename(
            columns={"half": "period", "action_name": "attempt_type"}
        )

        # MARK: what are some features everyone has access to?
        # attempt type: 2/3
        # period (half)
        # player_id
        # team_id
        # opponent_id

        attempts_results_stripped = []
        for at in shots_attempts_features_df['attempt_type']:
            attempts_results_stripped.append(at[0])

        shots_attempts_features_df['attempt_type'] = attempts_results_stripped

        # remove free throws
        free_throws_mask = shots_attempts_features_df['attempt_type'] == '1'
        shots_attempts_features_df = shots_attempts_features_df[~free_throws_mask]
        
        features = pd.concat([features, shots_attempts_features_df])

    return features

In [None]:
df = generate_features(hudl_logs_dir)

In [14]:
df_rand = df.dropna(axis=0).iloc[0:10000]
df_sorted = df.dropna(axis=0).sort_values("shot_made")
1- sum(df_sorted["shot_made"]) / len(df["shot_made"])

df_sorted

Unnamed: 0,attempt_type,player_id,team_id,period,pos_x,pos_y,shot_made
89,3,1111.0,80.0,1,26.36,0.66,False
318,2,9364.0,3300.0,2,24.29,9.90,False
308,2,1850.0,2935.0,2,27.08,7.20,False
302,3,8421.0,3300.0,2,21.11,3.09,False
300,2,1850.0,2935.0,2,25.34,7.65,False
...,...,...,...,...,...,...,...
655,2,12.0,2.0,4,27.02,7.62,True
657,3,6383.0,3225.0,4,20.51,4.26,True
143,2,6029.0,3173.0,2,24.44,11.04,True
563,2,16.0,2.0,3,25.49,4.65,True


In [21]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

_df_balanced =  pd.concat([df_sorted[0:5000], df_sorted[-5000: ]])
_df_imbalanced = pd.concat([df_sorted[5000: int((10000 * .5535)) + 5000], df_sorted[-int((10000 * .4465)) - 5000: -5000]])

In [22]:
import random

_df_balanced['shot_made'] = _df_balanced['shot_made'].astype(int)
_df_imbalanced['shot_made'] = _df_imbalanced['shot_made'].astype(int)

_df_balanced = _df_balanced.sample(frac=1).reset_index(drop=True)
_df_imbalanced = _df_balanced.sample(frac=1).reset_index(drop=True)

In [29]:
scaler = StandardScaler()

_df_balanced[['pos_x', 'pos_y']] = scaler.fit_transform(_df_balanced[['pos_x', 'pos_y']])
_df_imbalanced[['pos_x', 'pos_y']] = scaler.fit_transform(_df_imbalanced[['pos_x', 'pos_y']])


X = _df_imbalanced.drop('shot_made', axis=1)[[
    "attempt_type",
    # "player_id",
    "team_id",
    "period",
    "pos_x",
    # "pos_y"
]]

y = _df_imbalanced['shot_made']
X_train, y_train = X[0:9000], y[0:9000]

X_test = _df_imbalanced.drop('shot_made', axis=1)[[
    "attempt_type",
    # "player_id",
    "team_id",
    "period",
    "pos_x",
    # "pos_y"
]][9000: ]

y_test = _df_imbalanced['shot_made'][9000: ]

models = {
    'Logistic Regression': LogisticRegression(max_iter=100000000),
    'Ridge Classifier': RidgeClassifier(),
    'SGD Classifier': SGDClassifier(max_iter=1000, tol=1e-3),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'SVC': SVC(),
    'Linear SVC': LinearSVC(),
    'NuSVC': NuSVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'MLP Classifier': MLPClassifier(max_iter=1000),
    'Gaussian Naive Bayes': GaussianNB(),
    'Bernoulli Naive Bayes': BernoulliNB(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}, | accuracy: {accuracy}" )


Logistic Regression, | accuracy: 0.593
Ridge Classifier, | accuracy: 0.592
SGD Classifier, | accuracy: 0.514
Decision Tree, | accuracy: 0.567
Random Forest, | accuracy: 0.57
Gradient Boosting, | accuracy: 0.616




AdaBoost, | accuracy: 0.596
Extra Trees, | accuracy: 0.58
Bagging Classifier, | accuracy: 0.59
SVC, | accuracy: 0.518




Linear SVC, | accuracy: 0.524
NuSVC, | accuracy: 0.485
K-Nearest Neighbors, | accuracy: 0.575
MLP Classifier, | accuracy: 0.542
Gaussian Naive Bayes, | accuracy: 0.571
Bernoulli Naive Bayes, | accuracy: 0.584
Linear Discriminant Analysis, | accuracy: 0.592
