In [35]:
import os
import pandas as pd

hudl_logs_dir = "/playpen-storage/levlevi/contextualized-shot-quality-analysis/data/nba/result-hidden-split/hudl-game-logs"
hudl_logs_fps = [os.path.join(hudl_logs_dir, f) for f in os.listdir(hudl_logs_dir)]
ex = hudl_logs_fps[0]

df = pd.read_csv(ex, delimiter=';')
shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
shot_attempts_df = df.loc[shots_mask].copy()
shot_attempts_df.iloc[0]

id                                          76320111
action_id                                     800302
action_name                                       3-
player_id                                     1111.0
player_name                             Trevor Ariza
team_id                                         80.0
team_name                            Houston Rockets
opponent_id                                      NaN
opponent_name                                    NaN
opponent_team_id                                 NaN
opponent_team_name                               NaN
teammate_id                                      NaN
teammate_name                                    NaN
half                                               1
second                                        39.438
pos_x                                          26.36
pos_y                                           0.66
possession_id                                    NaN
possession_name                               

In [36]:
from tqdm import tqdm

def generate_features(game_logs_dir: str):
    
    features = pd.DataFrame(columns=[
        "attempt_type", 
        "player_id", 
        "team_id", 
        "period",
        "pos_x",
        "pos_y",
        "shot_made", 
        ])
    
    hudl_logs_fps = [os.path.join(game_logs_dir, f) for f in os.listdir(game_logs_dir)]
    for fp in tqdm(hudl_logs_fps):

        df = pd.read_csv(fp, delimiter=';')
        shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
        
        shot_attempts_df = df.loc[shots_mask, [
            'action_name', 
            'player_id', 
            'team_id', 
            'half',
            "pos_x",
            "pos_y",
            ]].copy()
        
        shot_attempts_df['shot_made'] = df.loc[shots_mask, "action_name"].str.contains("\+", regex=True)
        shots_attempts_features_df = shot_attempts_df.rename(
            columns={"half": "period", "action_name": "attempt_type"}
        )

        # MARK: what are some features everyone has access to?
        # attempt type: 2/3
        # period (half)
        # player_id
        # team_id
        # opponent_id

        attempts_results_stripped = []
        for at in shots_attempts_features_df['attempt_type']:
            attempts_results_stripped.append(at[0])

        shots_attempts_features_df['attempt_type'] = attempts_results_stripped

        # remove free throws
        free_throws_mask = shots_attempts_features_df['attempt_type'] == '1'
        shots_attempts_features_df = shots_attempts_features_df[~free_throws_mask]
        
        features = pd.concat([features, shots_attempts_features_df])

    return features

In [37]:
df = generate_features(hudl_logs_dir)

  features = pd.concat([features, shots_attempts_features_df])
100%|██████████| 634/634 [00:03<00:00, 167.65it/s]


In [38]:
df_rand = df.dropna(axis=0).iloc[0:10000]
df_sorted = df.dropna(axis=0).sort_values("shot_made")
1- sum(df_sorted["shot_made"]) / len(df["shot_made"])

0.5534787319233124

In [39]:
import numpy as np
mask = df_sorted["shot_made"]
shots_made = df_sorted[mask]
shots_missed = df_sorted[np.logical_not(mask)]
shots_made

Unnamed: 0,attempt_type,player_id,team_id,period,pos_x,pos_y,shot_made
593,2,1139.0,79.0,4,26.96,8.13,True
571,2,2480.0,2977.0,2,23.03,5.28,True
92,2,9020.0,3280.0,1,24.65,13.53,True
664,2,8395.0,3279.0,3,27.02,7.62,True
73,2,1880.0,2932.0,1,25.91,7.59,True
...,...,...,...,...,...,...,...
655,2,12.0,2.0,4,27.02,7.62,True
657,3,6383.0,3225.0,4,20.51,4.26,True
143,2,6029.0,3173.0,2,24.44,11.04,True
563,2,16.0,2.0,3,25.49,4.65,True


In [40]:
shots_missed

Unnamed: 0,attempt_type,player_id,team_id,period,pos_x,pos_y,shot_made
89,3,1111.0,80.0,1,26.36,0.66,False
318,2,9364.0,3300.0,2,24.29,9.90,False
308,2,1850.0,2935.0,2,27.08,7.20,False
302,3,8421.0,3300.0,2,21.11,3.09,False
300,2,1850.0,2935.0,2,25.34,7.65,False
...,...,...,...,...,...,...,...
324,2,2409.0,2976.0,4,25.31,7.17,False
659,2,1157.0,21.0,4,23.45,2.97,False
640,3,14021.0,3300.0,4,22.43,1.41,False
657,3,7490.0,3300.0,4,22.43,1.23,False


In [41]:
58053 / (58053 + 47396)

0.5505315365721818

In [42]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

total = 101000
shot_made_ratio = .4465
shot_missed_ratio = 1- shot_made_ratio
_df_imbalanced = pd.concat([shots_made[0: int(total * shot_made_ratio)], shots_missed[0: int(total * shot_missed_ratio)]])
_df_imbalanced = _df_imbalanced.sample(frac=1).reset_index(drop=True)
_df_imbalanced

Unnamed: 0,attempt_type,player_id,team_id,period,pos_x,pos_y,shot_made
0,3,1875.0,2932.0,2,25.22,0.66,True
1,3,5329.0,3171.0,3,17.87,3.72,False
2,3,8333.0,3279.0,1,26.96,14.61,False
3,2,1875.0,2932.0,2,23.60,2.94,True
4,2,102.0,9.0,3,26.42,7.44,False
...,...,...,...,...,...,...,...
100994,2,34324.0,2941.0,1,27.38,3.57,True
100995,3,8331.0,2935.0,1,27.05,14.46,False
100996,2,7535.0,78.0,1,23.90,11.76,True
100997,2,1.0,1.0,3,24.29,4.02,True


In [56]:
NUM_SHOTS_TRAIN = 10000
NUM_SHOTS_TEST = 1000

scaler = StandardScaler()

# old imbalanced: _df_imbalanced = pd.concat([df_sorted[5000: int((10000 * .5535)) + 5000], df_sorted[-int((10000 * .4465)) - 5000: -5000]])
# old imbalanced test: [9000: 10000]

_df_imbalanced[['pos_x', 'pos_y']] = scaler.fit_transform(_df_imbalanced[['pos_x', 'pos_y']])

X = _df_imbalanced.drop('shot_made', axis=1)[[
    "attempt_type",
    # "player_id",
    "team_id",
    "period",
    "pos_x",
    # "pos_y"
]]

y = _df_imbalanced["shot_made"].astype(int)
X_train, y_train = X[0: NUM_SHOTS_TRAIN], y[0: NUM_SHOTS_TRAIN]

X_test = _df_imbalanced.drop("shot_made", axis=1)[
    [
        "attempt_type",
        # "player_id",
        "team_id",
        "period",
        "pos_x",
        # "pos_y"
    ]
][-NUM_SHOTS_TEST: ]

y_test = _df_imbalanced["shot_made"][-NUM_SHOTS_TEST:].astype(int)

models = {
    # 'Logistic Regression': LogisticRegression(max_iter=100000000),
    # 'Ridge Classifier': RidgeClassifier(),
    # 'SGD Classifier': SGDClassifier(max_iter=1000, tol=1e-3),
    # 'Decision Tree': DecisionTreeClassifier(),
    # 'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    # 'AdaBoost': AdaBoostClassifier(),
    # 'Extra Trees': ExtraTreesClassifier(),
    # 'Bagging Classifier': BaggingClassifier(),
    # 'SVC': SVC(),
    # 'Linear SVC': LinearSVC(),
    # 'NuSVC': NuSVC(),
    # 'K-Nearest Neighbors': KNeighborsClassifier(),
    # 'MLP Classifier': MLPClassifier(max_iter=1000),
    # 'Gaussian Naive Bayes': GaussianNB(),
    # 'Bernoulli Naive Bayes': BernoulliNB(),
    # 'Linear Discriminant Analysis': LinearDiscriminantAnalysis()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}, | accuracy: {accuracy}")

Gradient Boosting, | accuracy: 0.621


In [61]:
61.8 - 55.35

6.449999999999996