###########################################

Data exploration

###########################################

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the shots CSV into a DataFrame; the trailing expression displays its
# (rows, columns) shape as the cell output.
df = pd.read_csv("data_prep/pacers_data/final_shots.csv")
df.shape

In [None]:
# Display the column labels available in the loaded DataFrame.
df.columns

In [None]:
# Preview the first 10 distinct GAME_ID values.
df["GAME_ID"].unique()[:10]

Plot shot locations from a single game.

In [None]:
import matplotlib.pyplot as plt

# Restrict to the shots of one game (hard-coded GAME_ID).
view_df = df[df["GAME_ID"] == 22401172]

# Create the scatter plot.
# One scatter series is drawn per distinct SHOT_VALUE so that every value gets
# its own colour AND its own correctly labelled legend entry. A single
# scatter(..., c=...) call yields only one legend handle, so passing two
# hard-coded labels to plt.legend() would show just the first label and would
# not correspond to the colours on the plot.
plt.figure(figsize=(7, 7))  # Square figure
for shot_value, shots in view_df.groupby("SHOT_VALUE"):
    plt.scatter(
        shots["LOC_X"],
        shots["LOC_Y"],
        label=f"{int(shot_value)}-Point Shot",  # assumes SHOT_VALUE is numeric (e.g. 2 / 3)
        alpha=0.6,
        edgecolors="black",
    )

# Labels and title
plt.xlabel("X Coordinate (Court)")
plt.ylabel("Y Coordinate (Court)")
plt.title("Single Game Shot Locations")
# Legend entries come from the per-series labels set above.
plt.legend(title="Shot Value", loc="upper right")

# Display the plot
plt.show()

Value counts of the target variable (`SHOT_MADE_FLAG`).

In [None]:
# Count how many rows carry each SHOT_MADE_FLAG value.
df["SHOT_MADE_FLAG"].value_counts()

In [None]:
# Distinct values present in the EVENT_TYPE column.
df["EVENT_TYPE"].unique()

In [None]:
# Number of missing (NaN) entries in ACTION_TYPE.
df["ACTION_TYPE"].isna().sum()

In [None]:
# Distinct values present in the ACTION_TYPE column.
df["ACTION_TYPE"].unique()

###########################################

Preprocessing and Modeling

###########################################

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
# Load the modelling dataset and print a quick overview of it
filename = 'final_data_spurs.csv'
df = pd.read_csv(filename)

n_rows, n_cols = df.shape

print("=== DATA OVERVIEW ===")
print(f"Rows: {n_rows}, Columns: {n_cols}")

print("\nColumn names:")
print(list(df.columns))

print("\nFirst 5 rows:")
print(df.head())

In [16]:
# Model inputs (numeric + categorical predictors) and the binary target
num_features = [
    'SHOT_VALUE',
    'SCORE_DIFF',
    'CLUTCH_FLAG',
    'SHOT_DISTANCE',
    'LOC_X',
    'LOC_Y',
    'ZONE_FG_PCT',
    'FG_PCT',
    'EFG_PCT',
    'PLAYER_SHOT_PCT',
]
cat_features = ['ACTION_TYPE']

# Numeric columns first, then categorical — the preprocessing step relies on
# the same ordering.
X = df[[*num_features, *cat_features]]
y = df['SHOT_MADE_FLAG']

In [17]:
# Train/test split
# 80/20 split, stratified on the target so both splits keep the same class
# ratio; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing: standardise the numeric columns and integer-encode the
# categorical column(s). Output column order is num_features then cat_features.
#
# handle_unknown/unknown_value: a random split can leave a rare ACTION_TYPE
# only in the test set. With the encoder's default (handle_unknown='error'),
# ct.transform(X_test) would then raise ValueError; here such unseen
# categories are mapped to -1 instead.
ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            cat_features,
        ),
    ]
)

# Fit the transformers on the training split only, then reuse the fitted
# statistics/categories for the test split (avoids leaking test information).
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

In [None]:
# Models and hyperparameter tuning
# Search spaces: multi-valued entries are tuned, single-element lists simply
# pin that setting for every candidate.
lr_params = dict(
    C=[0.01, 0.1, 1, 10],
    max_iter=[1000],
    random_state=[42],
)
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    random_state=[42],
)

lr, rf = LogisticRegression(), RandomForestClassifier()

# Both searches use 5-fold CV, accuracy scoring and all available cores.
lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, scoring='accuracy', cv=5, n_jobs=-1)
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=5, n_jobs=-1)

lr_grid.fit(X_train_scaled, y_train)
rf_grid.fit(X_train_scaled, y_train)

###########################################

Evaluation

###########################################

In [None]:
# Accuracy of the best estimator from each grid search on the test split
lr_best, rf_best = lr_grid.best_estimator_, rf_grid.best_estimator_

lr_pred = lr_best.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)

rf_pred = rf_best.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("Logistic Regression Test Accuracy:", lr_accuracy)
print("Random Forest Test Accuracy:", rf_accuracy)

# Random Forest wins only on a strictly higher score; ties go to Logistic Regression.
if rf_accuracy > lr_accuracy:
    best_model_name = "Random Forest"
else:
    best_model_name = "Logistic Regression"
print("\nBest Model:", best_model_name)

In [None]:
# ROC Curve
# Continuous scores for each model: the decision function for logistic
# regression, the positive-class probability (column 1) for the random forest.
lr_scores = lr_best.decision_function(X_test_scaled)
rf_scores = rf_best.predict_proba(X_test_scaled)[:, 1]

lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_scores)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_scores)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(lr_fpr, lr_tpr, label='Logistic Regression', color='blue')
ax.plot(rf_fpr, rf_tpr, label='Random Forest', color='orange')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Feature Importance
# Random-forest importances, drawn as bars sorted from most to least important.
importances = rf_best.feature_importances_
feature_names = num_features + cat_features
indices = np.argsort(importances)[::-1]  # descending order of importance

n_features = X_train_scaled.shape[1]
positions = range(n_features)

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance")
ax.bar(positions, importances[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(np.array(feature_names)[indices], rotation=90)
ax.set_xlim([-1, n_features])
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()