# Train Models for Different Players
We will use Logistic Regression to train the shot data of different NBA players, and record the coefficients into a file.

### 1. Read the data

In [4]:
import os
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

Specify data path

In [5]:
data_path = './shot_data'

Get the file name of all players

In [6]:

fname_list = os.listdir(data_path)
fpath_list = []
id_list = []

for fname in fname_list:
    fpath = os.path.join(data_path, fname)
    if os.path.isfile(fpath):
        fpath_list.append(fpath)
        player_id = fname.split('_')[-1].split('.')[0]  # Splitting again to remove file extension
        try:
            id_list.append(int(player_id))
        except ValueError:
            print(f"Filename {fname} does not end with a player ID")


In [7]:
print(id_list)

[101108, 1626145, 1626156, 1626157, 1626162, 1626166, 1626167, 1626168, 1626171, 1626172, 1626179, 1626181, 1626192, 1626196, 1626204, 1626220, 1626224, 1627734, 1627739, 1627741, 1627742, 1627747, 1627749, 1627750, 1627751, 1627752, 1627777, 1627783, 1627826, 1627827, 1627832, 1627884, 1627936, 1628365, 1628368, 1628369, 1628370, 1628372, 1628374, 1628378, 1628379, 1628380, 1628381, 1628392, 1628398, 1628401, 1628404, 1628420, 1628470, 1628975, 1628976, 1628978, 1628983, 1628984, 1628988, 1628989, 1628991, 1628995, 1628997, 1629001, 1629003, 1629006, 1629008, 1629012, 1629013, 1629014, 1629018, 1629020, 1629021, 1629022, 1629023, 1629026, 1629027, 1629029, 1629060, 1629111, 1629130, 1629216, 1629234, 1629312, 1629611, 1629614, 1629622, 1629629, 1629630, 1629631, 1629632, 1629636, 1629639, 1629640, 1629651, 1629652, 1629655, 1629656, 1629659, 1629661, 1629667, 1629670, 1629673, 1629675, 1629680, 1629684, 1629723, 1630162, 1630164, 1630165, 1630167, 1630168, 1630169, 1630170, 1630171, 1

### 2. The functions to process the data

In [8]:
import csv
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

The features we used in training models

In [9]:
combinedshottype_list = ['COMBINED_SHOT_TYPE_Jump Shot', 
               'COMBINED_SHOT_TYPE_Layup', 'COMBINED_SHOT_TYPE_Dunk', 'COMBINED_SHOT_TYPE_Tip Shot',
               'COMBINED_SHOT_TYPE_Hook Shot', 'COMBINED_SHOT_TYPE_Bank Shot']
feature_list = ['PERIOD', 'REMAIN_TIME', 'DIST', 'ANGLE'] + combinedshottype_list

Data Cleaning

In [32]:
# def clean_data(df_old, feature_list):
#     df = df_old.copy()
    
#     # Change the action type into combined shot type
#     action={'Jump Shot': 'Jump Shot', 'Driving Dunk Shot': 'Dunk', 'Layup Shot':'Layup',
#            'Running Jump Shot':'Jump Shot', 'Reverse Dunk Shot':'Dunk', 'Slam Dunk Shot':'Dunk',
#            'Driving Layup Shot':'Layup', 'Turnaround Jump Shot':'Jump Shot', 'Reverse Layup Shot':'Layup',
#            'Tip Shot':'Tip Shot', 'Running Hook Shot':'Hook Shot', 'Alley Oop Dunk Shot':'Dunk',
#            'Dunk Shot':'Dunk', 'Alley Oop Layup shot':'Layup', 'Running Dunk Shot':'Dunk',
#            'Driving Finger Roll Shot':'Layup', 'Running Layup Shot':'Layup',
#            'Finger Roll Shot':'Layup', 'Fadeaway Jump Shot':'Jump Shot', 'Follow Up Dunk Shot':'Dunk',
#            'Hook Shot':'Hook Shot', 'Turnaround Hook Shot':'Hook Shot', 'Jump Hook Shot':'Jump Shot',
#            'Running Finger Roll Shot':'Layup', 'Jump Bank Shot':'Jump Shot',
#            'Turnaround Finger Roll Shot':'Layup', 'Hook Bank Shot':'Bank Shot',
#            'Driving Hook Shot':'Hook Shot', 'Running Tip Shot':'Tip Shot',
#            'Running Reverse Layup Shot':'Layup', 'Driving Finger Roll Layup Shot':'Layup',
#            'Fadeaway Bank shot':'Bank Shot', 'Pullup Jump shot':'Jump Shot', 'Finger Roll Layup Shot':'Layup',
#            'Turnaround Fadeaway shot':'Jump Shot', 'Driving Reverse Layup Shot':'Layup',
#            'Driving Slam Dunk Shot':'Dunk', 'Step Back Jump shot':'Jump Shot',
#            'Turnaround Bank shot':'Bank Shot', 'Reverse Slam Dunk Shot':'Dunk',
#            'Floating Jump shot':'Jump Shot', 'Putback Slam Dunk Shot':'Dunk',
#            'Running Bank shot':'Bank Shot', 'Driving Bank shot':'Bank Shot', 'Driving Jump shot':'Jump Shot',
#            'Putback Layup Shot':'Layup', 'Putback Dunk Shot':'Dunk',
#            'Running Finger Roll Layup Shot':'Layup', 'Pullup Bank shot':'Bank Shot',
#            'Running Slam Dunk Shot':'Dunk', 'Cutting Layup Shot':'Layup',
#            'Driving Floating Jump Shot':'Jump Shot', 'Running Pull-Up Jump Shot':'Jump Shot',
#            'Tip Layup Shot':'Layup', 'Driving Floating Bank Jump Shot':'Jump Shot',
#            'Cutting Finger Roll Layup Shot':'Layup', 'Turnaround Fadeaway Bank Jump Shot':'Jump Shot',
#            'Step Back Bank Jump Shot':'Jump Shot', 'Cutting Dunk Shot':'Dunk', 'Running Bank Hook Shot': 'Hook Shot', 
#             'Turnaround Bank Hook Shot': 'Hook Shot', 'Tip Dunk Shot': 'Dunk', 'Driving Reverse Dunk Shot': 'Dunk', 
#            'Jump Bank Hook Shot': 'Jump Shot', 'Driving Bank Hook Shot': 'Bank Shot', 'No Shot': 'Jump Shot', 
#            'Running Alley Oop Layup Shot': 'Layup', 'Running Alley Oop Dunk Shot': 'Dunk', 
#            'Running Reverse Dunk Shot':'Dunk', 'Putback Reverse Dunk Shot': 'Dunk'}
    
#     df['COMBINED_SHOT_TYPE']=df.ACTION_TYPE.apply(lambda x:action[x])
#     df=df.drop('ACTION_TYPE', axis=1)
    
#     # Combine the minutes_remaining and seconds_remaining into remain_time
#     df['REMAIN_TIME'] = df['MINUTES_REMAINING'] * 60 + df['SECONDS_REMAINING']
#     df = df.drop(['MINUTES_REMAINING', 'SECONDS_REMAINING'], axis = 1)
    
#     # Compute the shot distance and shot angle from loc_x and loc_y
#     df['DIST']=(df.LOC_X**2+df.LOC_Y**2)**0.5
#     df['ANGLE']=np.arctan2(df.LOC_X,df.LOC_Y)/np.pi*180.0
    
#     # Use the one-hot coding for combined_shot_type
#     features_onehot = ['COMBINED_SHOT_TYPE']
#     for i in features_onehot:
#         df = pd.concat([df, pd.get_dummies(df[i], prefix=i)], axis=1)
#         df = df.drop(i, axis=1)

    
#     # If the column of required feature is empty, fill the column with 0
#     for i in feature_list:
#         if i not in df.columns.values:
#             df[i] = 0
    
#     # Exclude the shot from back court
#     df = df[df.DIST < 320.0]
    
#     # Extract required features from data frame
#     df = df[feature_list+['SHOT_MADE_FLAG']]
    
#     return df

import pandas as pd
import numpy as np

def clean_data(df_old, feature_list):
    df = df_old.copy()
    
    # Simplifying shot types into broader categories
    #     # Change the action type into combined shot type
    action_map ={'Jump Shot': 'Jump Shot', 'Driving Dunk Shot': 'Dunk', 'Layup Shot':'Layup',
           'Running Jump Shot':'Jump Shot', 'Reverse Dunk Shot':'Dunk', 'Slam Dunk Shot':'Dunk',
           'Driving Layup Shot':'Layup', 'Turnaround Jump Shot':'Jump Shot', 'Reverse Layup Shot':'Layup',
           'Tip Shot':'Tip Shot', 'Running Hook Shot':'Hook Shot', 'Alley Oop Dunk Shot':'Dunk',
           'Dunk Shot':'Dunk', 'Alley Oop Layup shot':'Layup', 'Running Dunk Shot':'Dunk',
           'Driving Finger Roll Shot':'Layup', 'Running Layup Shot':'Layup',
           'Finger Roll Shot':'Layup', 'Fadeaway Jump Shot':'Jump Shot', 'Follow Up Dunk Shot':'Dunk',
           'Hook Shot':'Hook Shot', 'Turnaround Hook Shot':'Hook Shot', 'Jump Hook Shot':'Jump Shot',
           'Running Finger Roll Shot':'Layup', 'Jump Bank Shot':'Jump Shot',
           'Turnaround Finger Roll Shot':'Layup', 'Hook Bank Shot':'Bank Shot',
           'Driving Hook Shot':'Hook Shot', 'Running Tip Shot':'Tip Shot',
           'Running Reverse Layup Shot':'Layup', 'Driving Finger Roll Layup Shot':'Layup',
           'Fadeaway Bank shot':'Bank Shot', 'Pullup Jump shot':'Jump Shot', 'Finger Roll Layup Shot':'Layup',
           'Turnaround Fadeaway shot':'Jump Shot', 'Driving Reverse Layup Shot':'Layup',
           'Driving Slam Dunk Shot':'Dunk', 'Step Back Jump shot':'Jump Shot',
           'Turnaround Bank shot':'Bank Shot', 'Reverse Slam Dunk Shot':'Dunk',
           'Floating Jump shot':'Jump Shot', 'Putback Slam Dunk Shot':'Dunk',
           'Running Bank shot':'Bank Shot', 'Driving Bank shot':'Bank Shot', 'Driving Jump shot':'Jump Shot',
           'Putback Layup Shot':'Layup', 'Putback Dunk Shot':'Dunk',
           'Running Finger Roll Layup Shot':'Layup', 'Pullup Bank shot':'Bank Shot',
           'Running Slam Dunk Shot':'Dunk', 'Cutting Layup Shot':'Layup',
           'Driving Floating Jump Shot':'Jump Shot', 'Running Pull-Up Jump Shot':'Jump Shot',
           'Tip Layup Shot':'Layup', 'Driving Floating Bank Jump Shot':'Jump Shot',
           'Cutting Finger Roll Layup Shot':'Layup', 'Turnaround Fadeaway Bank Jump Shot':'Jump Shot',
           'Step Back Bank Jump Shot':'Jump Shot', 'Cutting Dunk Shot':'Dunk', 'Running Bank Hook Shot': 'Hook Shot', 
            'Turnaround Bank Hook Shot': 'Hook Shot', 'Tip Dunk Shot': 'Dunk', 'Driving Reverse Dunk Shot': 'Dunk', 
           'Jump Bank Hook Shot': 'Jump Shot', 'Driving Bank Hook Shot': 'Bank Shot', 'No Shot': 'Jump Shot', 
           'Running Alley Oop Layup Shot': 'Layup', 'Running Alley Oop Dunk Shot': 'Dunk', 
           'Running Reverse Dunk Shot':'Dunk', 'Putback Reverse Dunk Shot': 'Dunk'}
    
    df['COMBINED_SHOT_TYPE'] = df['ACTION_TYPE'].map(action_map)

    # Calculate remaining time in seconds as a single feature
    df['REMAINING_TIME'] = df['MINUTES_REMAINING'] * 60 + df['SECONDS_REMAINING']

    # Calculate distance from the hoop and angle
    df['DISTANCE'] = np.sqrt(df['LOC_X']**2 + df['LOC_Y']**2)
    df['ANGLE'] = np.degrees(np.arctan2(df['LOC_Y'], df['LOC_X']))

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=['COMBINED_SHOT_TYPE', 'PERIOD'])

    # Fill any NaNs which might disrupt the model
    df.fillna(0, inplace=True)

    # Ensure only required features are included
    df = df[feature_list + ['SHOT_MADE_FLAG']]
    return df


In [25]:
from sklearn.metrics import make_scorer, roc_auc_score

def evaluate_model(X, y, cv_strategy, pipeline, param_grid):
    # Run grid search with the specified pipeline and parameters
    grid_search = GridSearchCV(estimator=pipeline, 
                               param_grid=param_grid, 
                               cv=StratifiedKFold(5), 
                               scoring=make_scorer(roc_auc_score), verbose=1)
    grid_search.fit(X, y)
    
    # Extract the best parameters and the best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    # Best model evaluation
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation ROC AUC score: {:.2f}".format(grid_search.best_score_))

    # Evaluate on the test set
    roc_auc = roc_auc_score(y, grid_search.predict_proba(X)[:, 1])
    print("Test ROC AUC: {:.2f}".format(roc_auc))
    return best_params, best_score


In [26]:
with open('Players_Model.csv', 'w', newline='') as f_rslt:
    writer = csv.writer(f_rslt)
    writer.writerow(['PLAYER_ID', 'PLAYER_NAME', 'BEST_ROC_AUC'])

    scaler = StandardScaler()
    cv_strategy = StratifiedKFold(n_splits=5)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression(solver='lbfgs', random_state=42))
    ])

    param_grid = {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2']  # Only 'l2' is compatible with 'lbfgs'
    }


    for filepath in fpath_list:
        df = pd.read_csv(filepath)
        if df.empty:
            continue
        feature_list = ['DISTANCE', 'ANGLE', 'REMAINING_TIME']
        df_cleaned = clean_data(df, feature_list)
        X = df_cleaned.drop('SHOT_MADE_FLAG', axis=1)
        y = df_cleaned['SHOT_MADE_FLAG']
        X_scaled = scaler.fit_transform(X)

        best_params, best_score = evaluate_model(X_scaled, y, cv_strategy, pipeline, param_grid)
        
        print(f"Player ID: {df.PLAYER_ID.iloc[0]}")
        print(f"Player Name: {df.PLAYER_NAME.iloc[0]}")
        print(f"Best Parameters: {best_params}")
        print(f"Best ROC AUC: {best_score}\n")

        writer.writerow([df.PLAYER_ID.iloc[0], df.PLAYER_NAME.iloc[0], best_score])

print("Model training and evaluation complete.")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Best cross-validation ROC AUC score: 0.51
Test ROC AUC: 0.53
Player ID: 101108
Player Name: Chris Paul
Best Parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Best ROC AUC: 0.5140481962481962

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Best cross-validation ROC AUC score: 0.52
Test ROC AUC: 0.52
Player ID: 1626145
Player Name: Tyus Jones
Best Parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Best ROC AUC: 0.5239808913696358

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Best cross-validation ROC AUC score: 0.51
Test ROC AUC: 0.52
Player ID: 1626156
Player Name: D'Angelo Russell
Best Parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2'}
Best ROC AUC: 0

Function to train the model

In [29]:
def TrainModel(df, scaler=None):
    if df.empty:
        return [float('nan')] * len(df.columns[:-1]), float('nan')  # handle empty DataFrame case
    X = df.drop('SHOT_MADE_FLAG', axis=1)
    y = df['SHOT_MADE_FLAG']
    if scaler is not None:
        X = scaler.transform(X)
    clf = LogisticRegression(C=1.0, max_iter=300, solver='lbfgs')
    clf.fit(X, y)
    coef = list(map(float, clf.coef_[0]))
    intercept = float(clf.intercept_[0])
    return coef, intercept

In [31]:
df

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM
0,Shot Chart Detail,22200013,17,101108,Chris Paul,1610612756,Phoenix Suns,1,10,51,...,Center(C),24+ ft.,26,61,253,1,0,20221019,PHX,DAL
1,Shot Chart Detail,22200013,212,101108,Chris Paul,1610612756,Phoenix Suns,2,8,16,...,Center(C),8-16 ft.,12,5,127,1,0,20221019,PHX,DAL
2,Shot Chart Detail,22200013,230,101108,Chris Paul,1610612756,Phoenix Suns,2,7,22,...,Right Side(R),8-16 ft.,13,134,25,1,1,20221019,PHX,DAL
3,Shot Chart Detail,22200013,248,101108,Chris Paul,1610612756,Phoenix Suns,2,6,0,...,Center(C),8-16 ft.,13,66,117,1,0,20221019,PHX,DAL
4,Shot Chart Detail,22200013,467,101108,Chris Paul,1610612756,Phoenix Suns,3,4,29,...,Left Side(L),16-24 ft.,16,-156,67,1,0,20221019,PHX,DAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,Shot Chart Detail,22301198,409,101108,Chris Paul,1610612744,Golden State Warriors,3,6,24,...,Right Side Center(RC),16-24 ft.,19,97,168,1,0,20240414,GSW,UTA
1131,Shot Chart Detail,22301214,200,101108,Chris Paul,1610612744,Golden State Warriors,2,11,4,...,Left Side Center(LC),24+ ft.,26,-176,194,1,0,20231206,GSW,POR
1132,Shot Chart Detail,22301214,343,101108,Chris Paul,1610612744,Golden State Warriors,2,1,57,...,Center(C),16-24 ft.,19,58,183,1,1,20231206,GSW,POR
1133,Shot Chart Detail,22301214,528,101108,Chris Paul,1610612744,Golden State Warriors,4,11,50,...,Left Side(L),8-16 ft.,13,-117,67,1,0,20231206,GSW,POR


Begin training for all players  
Save the coefficients into a csv file

In [30]:
with open('Players_Model.csv', 'w', newline='') as f_rslt:
    writer = csv.writer(f_rslt)
    writer.writerow(['PLAYER_ID', 'PLAYER_NAME'] + feature_list + ['INTERCEPT'])
    scaler = StandardScaler()  # Optionally consider global fitting if data available
    for i, filepath in enumerate(fpath_list):
        df = pd.read_csv(filepath)
        if df.empty:
            continue  # Skip processing for empty data files
        player_id = int(df.PLAYER_ID.iloc[0])
        player_name = str(df.PLAYER_NAME.iloc[0])
        df = clean_data(df, feature_list)
        if df.empty:
            writer.writerow([player_id, player_name] + [float('nan')] * len(feature_list) + [float('nan')])
            continue
        if i == 0:
            scaler.fit(df.drop('SHOT_MADE_FLAG', axis=1))  # Fit scaler only once if global data not pooled
        coef, intercept = TrainModel(df, scaler)


        writer.writerow([player_id, player_name] + coef + [intercept])

KeyError: "['SEASON'] not in index"