In [2]:
import os
import pandas as pd

hudl_logs_dir = "/playpen-storage/levlevi/contextualized-shot-quality-analysis/data/nba/result-hidden-split/hudl-game-logs"

In [3]:
from tqdm import tqdm

def generate_features(game_logs_dir: str):
    
    features = pd.DataFrame(columns=["attempt_type", "player_id", "team_id	period", "shot_made"])
    hudl_logs_fps = [os.path.join(game_logs_dir, f) for f in os.listdir(game_logs_dir)]
    
    for fp in tqdm(hudl_logs_fps):

        df = pd.read_csv(fp, delimiter=';')
        shots_mask = df["action_name"].str.contains('\+|\-', regex=True)
        
        shot_attempts_df = df.loc[shots_mask, ['action_name', 'player_id', 'team_id', 'half']].copy()
        shot_attempts_df['shot_made'] = df.loc[shots_mask, "action_name"].str.contains("\+", regex=True)
        shots_attempts_features_df = shot_attempts_df.rename(
            columns={"half": "period", "action_name": "attempt_type"}
        )

        # MARK: what are some features everyone has access to?
        # attempt type: 2/3
        # period (half)
        # player_id
        # team_id
        # opponent_id

        attempts_results_stripped = []
        for at in shots_attempts_features_df['attempt_type']:
            attempts_results_stripped.append(at[0])

        shots_attempts_features_df['attempt_type'] = attempts_results_stripped

        # remove free throws
        free_throws_mask = shots_attempts_features_df['attempt_type'] == '1'
        shots_attempts_features_df = shots_attempts_features_df[~free_throws_mask]
        
        features = pd.concat([features, shots_attempts_features_df])

    return features

In [4]:
df = generate_features(hudl_logs_dir)

  features = pd.concat([features, shots_attempts_features_df])
100%|██████████| 634/634 [00:29<00:00, 21.48it/s]


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

df['shot_made'] = df['shot_made'].astype(int)

# one hot encodings of any continious vars
df_encoded = pd.get_dummies(df, columns=['player_id', 'team_id', 'period'], drop_first=True)

X = df_encoded.drop('shot_made', axis=1)
X = X.dropna(axis=1, how='any')
y = df_encoded['shot_made']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5632595383890721

In [9]:
diff = accuracy - (1 - (sum(y) / len(y)))
diff

0.012578865724320987