In [39]:
import os
import pandas as pd

hudl_logs_dir = "/playpen-storage/levlevi/contextualized-shot-quality-analysis/data/nba/result-hidden-split/hudl-game-logs"
hudl_logs_fps = [os.path.join(hudl_logs_dir, f) for f in os.listdir(hudl_logs_dir)]
ex = hudl_logs_fps[0]

df = pd.read_csv(ex, delimiter=';')
shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
shot_attempts_df = df.loc[shots_mask].copy()
shot_attempts_df.iloc[0]

id                                          76320111
action_id                                     800302
action_name                                       3-
player_id                                     1111.0
player_name                             Trevor Ariza
team_id                                         80.0
team_name                            Houston Rockets
opponent_id                                      NaN
opponent_name                                    NaN
opponent_team_id                                 NaN
opponent_team_name                               NaN
teammate_id                                      NaN
teammate_name                                    NaN
half                                               1
second                                        39.438
pos_x                                          26.36
pos_y                                           0.66
possession_id                                    NaN
possession_name                               

In [66]:
from tqdm import tqdm

def generate_features(game_logs_dir: str):
    
    features = pd.DataFrame(columns=[
        "attempt_type", 
        "player_id", 
        "team_id", 
        "period",
        "pos_x",
        "pos_y",
        "shot_made", 
        ])
    
    hudl_logs_fps = [os.path.join(game_logs_dir, f) for f in os.listdir(game_logs_dir)]
    for fp in tqdm(hudl_logs_fps):

        df = pd.read_csv(fp, delimiter=';')
        shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
        
        shot_attempts_df = df.loc[shots_mask, [
            'action_name', 
            'player_id', 
            'team_id', 
            'half',
            "pos_x",
            "pos_y",
            ]].copy()
        
        shot_attempts_df['shot_made'] = df.loc[shots_mask, "action_name"].str.contains("\+", regex=True)
        shots_attempts_features_df = shot_attempts_df.rename(
            columns={"half": "period", "action_name": "attempt_type"}
        )

        # MARK: what are some features everyone has access to?
        # attempt type: 2/3
        # period (half)
        # player_id
        # team_id
        # opponent_id

        attempts_results_stripped = []
        for at in shots_attempts_features_df['attempt_type']:
            attempts_results_stripped.append(at[0])

        shots_attempts_features_df['attempt_type'] = attempts_results_stripped

        # remove free throws
        free_throws_mask = shots_attempts_features_df['attempt_type'] == '1'
        shots_attempts_features_df = shots_attempts_features_df[~free_throws_mask]
        
        features = pd.concat([features, shots_attempts_features_df])

    return features

In [67]:
df = generate_features(hudl_logs_dir)

  features = pd.concat([features, shots_attempts_features_df])
100%|██████████| 634/634 [00:10<00:00, 60.84it/s]


In [117]:
df_rand = df.dropna(axis=0).iloc[0:10000]
df_sorted = df.dropna(axis=0).sort_values("shot_made")
1- sum(df_sorted["shot_made"]) / len(df["shot_made"])

0.5534787319233124

In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

_df_balanced =  pd.concat([df_sorted[0:5000], df_sorted[-5000: ]])
_df_imbalanced = pd.concat([df_sorted[5000: int((10000 * .5535)) + 5000], df_sorted[-int((10000 * .4465)) - 5000: -5000]])

In [134]:
_df_balanced['shot_made'] = _df_balanced['shot_made'].astype(int)
_df_imbalanced['shot_made'] = _df_imbalanced['shot_made'].astype(int)

scaler = StandardScaler()

_df_balanced[['pos_x', 'pos_y']] = scaler.fit_transform(_df_balanced[['pos_x', 'pos_y']])
_df_imbalanced[['pos_x', 'pos_y']] = scaler.fit_transform(_df_imbalanced[['pos_x', 'pos_y']])

X = _df_imbalanced.drop('shot_made', axis=1)[[
    "attempt_type",
    # "player_id",
    "team_id",
    "period",
    "pos_x",
    # "pos_y"
]]
y = _df_balanced['shot_made']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

model = LogisticRegression(max_iter=100000000)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Model accuracy: {accuracy}")


Model accuracy: 0.579
