In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
import os
import sys
sys.path.insert(0, str(Path.cwd().resolve().parent))  # add repo root to sys.path

from project_paths import (
    RAW_DIR, EDITED_DIR, FINAL_DIR, ANALYSIS_DIR, TEMP_DIR, DATA_DIR, ROOKIES_PATH
)

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

# Alias of DATA_DIR as imported from project_paths.
BASE_DIR = DATA_DIR

# Read the RANK SCORE full-grid workbook from the analysis folder; the frames
# derived from it below are what the train/test split is built on.
rs_grid_path = ANALYSIS_DIR / "RANK_SCORE_full_grid.xlsx"
df = pd.read_excel(rs_grid_path)

In [None]:
# Columns removed before modelling:
#   * every Rank Score column - names ending in "_RS" or equal to "RANK_SCORE"
#     (case-insensitive)
#   * a fixed list of extra columns, each dropped only if it is present
pattern = re.compile(r'(?:_RS$)|(?:^RANK_SCORE$)', flags=re.IGNORECASE)
rankscore_cols = list(filter(pattern.search, df.columns))

extra_drop = ['Age', 'OLD', 'NEW', 'Pos1', 'Pos2', 'Player_Role']
present_extra = [name for name in extra_drop if name in df.columns]
to_drop = [*rankscore_cols, *present_extra]

# drop() returns a new frame, so `df` keeps the untouched workbook contents
df_ml = df.drop(columns=to_drop, errors='ignore')

print("Dropped columns:", to_drop)
print("Remaining columns (first 20):", df_ml.columns[:20].tolist())

In [None]:
# Season labels are compared as text later on: cast to str and trim whitespace
season_text = df_ml['Season'].astype(str)
df_ml['Season'] = season_text.str.strip()

# Helper used to order seasons chronologically by their first year
def season_start_year(season: str) -> int:
    """Return the first year of a 'YYYY-YYYY' season label.

    Leading/trailing whitespace is tolerated. Any label that does not match
    the pattern yields -10**9, which places it last when sorting by start
    year in descending order.
    """
    parsed = re.match(r'^\s*(\d{4})-(\d{4})\s*$', season)
    if parsed is None:
        return -10**9
    return int(parsed.group(1))

# Order rows by Player (A-Z) and, within a player, newest season first.
# The temporary '_SeasonStart' key exists only for the duration of the sort.
df_ml = (
    df_ml
    .assign(_SeasonStart=df_ml['Season'].map(season_start_year))
    .sort_values(by=['Player', '_SeasonStart'], ascending=[True, False])
    .drop(columns=['_SeasonStart'])
)

df_ml.tail(10)

In [None]:
# True Label Classification
# Coerce the columns the rules depend on; unparseable entries become NaN
for stat_col in ('G', 'GS', 'MP'):
    df_ml[stat_col] = pd.to_numeric(df_ml[stat_col], errors='coerce')

season = df_ml['Season'].astype(str).str.strip()
G  = df_ml['G'].fillna(0)
GS = df_ml['GS'].fillna(0)
MP = df_ml['MP']  # NaN kept on purpose: NaN >= x is False, so missing minutes never qualify

# DNP: no games played (a missing G counts as zero)
is_dnp = G == 0

# STARTER via absolute games started: 52 for the 2019-2020 (bubble) season, 65 otherwise
gs_cutoff = np.where(season == '2019-2020', 52, 65)
abs_starter = GS >= gs_cutoff

# STARTER via share of games started (>= 65/82, with a tiny float tolerance)
# or by having started every game played
eps = 1e-9
ratio = GS / G.where(G != 0)  # zero games -> NaN instead of a division by zero
started_share_ok = ratio >= (65/82) - eps
started_every_game = GS == G
ratio_starter = ~abs_starter & ~is_dnp & (started_share_ok | started_every_game)

# STARTER overall; MP >= 24 is to include Sixth Men (or first starter off the bench)
starter = abs_starter | ratio_starter | (MP >= 24)

# ROLEPLAYER: Minutes Played >= 14.4 (30% of a 48-minute regulation game)
roleplayer = ~starter & ~is_dnp & (MP >= 14.4)

# np.select takes the first matching condition, so DNP outranks STARTER, which
# outranks ROLEPLAYER; everything left over is BENCHWARMER
df_ml['CLASSIFICATION'] = np.select(
    condlist=[is_dnp, starter, roleplayer],
    choicelist=['DNP', 'STARTER', 'ROLEPLAYER'],
    default='BENCHWARMER',
)

df_ml.tail(10)

In [None]:
# Ground truth = rows of the 2024-2025 season whose label is anything but "DNP"
season_str = '2024-2025'
in_target_season = df_ml['Season'].astype(str).str.strip() == season_str
labelled_dnp = df_ml['CLASSIFICATION'].astype(str).str.upper() == 'DNP'
mask = in_target_season & ~labelled_dnp

# copy() detaches the result from df_ml, which is modified further below
true_labels = df_ml[mask].copy()

In [None]:
# Write the ground-truth rows to the ANALYSIS folder (created if missing)
os.makedirs(ANALYSIS_DIR, exist_ok=True)
out_path = ANALYSIS_DIR / "GROUND_TRUTH_2024_2025.xlsx"
true_labels.to_excel(out_path, index=False)

print(f"Saved ground-truth labels: {out_path}")
print(f"Rows saved: {len(true_labels)}")

In [None]:
# ---- Creating the train-test dataset ----
# Compute the next season string "YYYY-YYYY" -> "YYYY+1-YYYY+1"
def next_season_str(s: str) -> str | None:
    m = re.match(r'^\s*(\d{4})-(\d{4})\s*$', str(s))
    if not m:
        return None
    y1, y2 = int(m.group(1)), int(m.group(2))
    return f"{y1+1}-{y2+1}"

# Re-label every row with the SAME player's classification in the FOLLOWING
# season, so that a season's stats are paired with next season's role.
# NOTE: this cell overwrites CLASSIFICATION using the current CLASSIFICATION
# values, so running it a second time shifts the labels by one more season.

# Join key on the left side: the season that follows each row's season
df_ml['__NextSeason'] = df_ml['Season'].map(next_season_str)

# Right side: one classification per (Player, Season), with Season renamed to
# the join key so it lines up with '__NextSeason'
lookup = df_ml[['Player', 'Season', 'CLASSIFICATION']].drop_duplicates(subset=['Player', 'Season'])
lookup = lookup.rename(columns={'Season': '__NextSeason', 'CLASSIFICATION': '__NEXT_CLASS'})

# Left join: rows without a following season get NaN in '__NEXT_CLASS'
df_ml = pd.merge(df_ml, lookup, how='left', on=['Player', '__NextSeason'])

# New label: 'DNP' when next season is missing or itself 'DNP',
# otherwise the next-season classification
next_class = df_ml['__NEXT_CLASS']
next_is_dnp_or_missing = next_class.isna() | (next_class.str.upper() == 'DNP')
df_ml['CLASSIFICATION'] = np.where(next_is_dnp_or_missing, 'DNP', next_class)

# Remove the helper columns again
df_ml = df_ml.drop(columns=['__NextSeason', '__NEXT_CLASS'])

df_ml.tail(10)

In [None]:
# The filter below needs FG%, FT% and the label column - fail early if any is absent
required_cols = ['FG%', 'FT%', 'CLASSIFICATION']
missing = [name for name in required_cols if name not in df_ml.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# Percentages must be numeric; anything unparseable becomes NaN
for pct_col in ('FG%', 'FT%'):
    df_ml[pct_col] = pd.to_numeric(df_ml[pct_col], errors='coerce')

before = len(df_ml)

# Keep a row only if its label is not DNP and both percentages are present
has_label = df_ml['CLASSIFICATION'].astype(str).str.upper() != 'DNP'
has_percentages = df_ml[['FG%', 'FT%']].notna().all(axis=1)
mask = has_label & has_percentages

df_ml = df_ml[mask].copy()
after = len(df_ml)

print(f"Rows before: {before}  |  after filtering: {after}  |  dropped: {before - after}")

df_ml.tail(10)

In [None]:
# Persist the filtered frame as the ML train-test workbook (folder created if missing)
os.makedirs(ANALYSIS_DIR, exist_ok=True)
ml_out_path = ANALYSIS_DIR / "ML_train_test_dataset.xlsx"
df_ml.to_excel(ml_out_path, index=False)

n_rows, n_cols = df_ml.shape
print(f"Saved ML train-test dataset -> {ml_out_path}")
print(f"Shape: {n_rows} rows × {n_cols} columns")

In [None]:
# ---- ML Classification (KNN / Decision Tree / Naive Bayes) ----
# Read back the workbook written by the previous step
ml_in_path = ANALYSIS_DIR / "ML_train_test_dataset.xlsx"
stats = pd.read_excel(ml_in_path)

# Validate the schema before any modelling: features, target, and the
# Season/Player columns used for splitting and reporting
feature_cols = ['FG%', '3P', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']
missing_feats = [name for name in feature_cols if name not in stats.columns]
if missing_feats:
    raise KeyError(f"Missing required feature columns: {missing_feats}")
if 'CLASSIFICATION' not in stats.columns:
    raise KeyError("Missing target column 'CLASSIFICATION' in ml_train_test_dataset.xlsx")
if not {'Season', 'Player'}.issubset(stats.columns):
    raise KeyError("Input must contain 'Season' and 'Player' columns.")

# Integer-encode the class labels; `le` is kept to decode predictions later
le = LabelEncoder()
stats['ROLE_encoded'] = le.fit_transform(stats['CLASSIFICATION'])

# Show which integer stands for which label,
# e.g. {'BENCHWARMER': 0, 'ROLEPLAYER': 1, 'STARTER': 2}
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

In [None]:
# Train-Test Split by Season
# Use the most recent season (by start year) as TEST; prior four seasons as TRAIN
def _season_start(s):
    m = re.match(r'^\s*(\d{4})-(\d{4})\s*$', str(s))
    return int(m.group(1)) if m else -10**9

# Seasons present in the data, newest first
unique_seasons = sorted(stats['Season'].unique(), key=_season_start, reverse=True)
if len(unique_seasons) < 2:
    raise ValueError("Need at least two seasons for a meaningful train/test split.")

# Newest season is held out; the (up to) four seasons right before it train the models
test_season, *older_seasons = unique_seasons
train_seasons = older_seasons[:4]
print(f"Train seasons: {train_seasons}")
print(f"Test season:   {test_season}")

train_df = stats.loc[stats['Season'].isin(train_seasons)].copy()
test_df  = stats.loc[stats['Season'] == test_season].copy()

X_train, y_train = train_df[feature_cols], train_df['ROLE_encoded']
X_test,  y_test  = test_df[feature_cols],  test_df['ROLE_encoded']

# Identification columns re-indexed from 0 so they line up with the prediction
# arrays, plus the test labels decoded back to their text form
players_test  = test_df['Player'].reset_index(drop=True)
seasons_test  = test_df['Season'].reset_index(drop=True)
true_labels   = le.inverse_transform(y_test)

In [None]:
# For plotting Confusion Matrices
def plot_conf_mat(model, title="Confusion Matrix"):
    """Plot the test-set confusion matrix of `model` and print its classification report.

    Reads the notebook-level X_test, y_test and le (the fitted LabelEncoder).

    Parameters
    ----------
    model : fitted classifier exposing predict()
    title : str
        Title placed above the confusion-matrix plot.
    """
    y_pred = model.predict(X_test)

    # Pin the label set to every class the encoder knows. Without it the matrix
    # and the report only cover classes that occur in y_test / y_pred, so a
    # class missing from the test season would no longer match the number of
    # display_labels / target_names.
    class_ids = np.arange(len(le.classes_))

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
    fig, ax = plt.subplots()
    disp.plot(ax=ax, cmap='YlGnBu')
    ax.set_title(title)
    plt.show()
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))

In [None]:
# k-Nearest Neighbors: search best k
# k is chosen by 10-fold cross-validation on the TRAINING seasons only (the same
# procedure the decision-tree depth search in this notebook uses). Picking the k
# that maximises test accuracy would tune the model on the held-out season and
# make the reported accuracy optimistic.
# NOTE(review): the features are used unscaled; KNN is distance-based, so
# standardising them may be worth evaluating.
k_range = range(1, min(101, len(X_train)) + 1)  # 1..101, never more neighbours than training rows

knn_search = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': list(k_range)}, cv=10)
knn_search.fit(X_train, y_train)
best_k = knn_search.best_params_['n_neighbors']

# Test accuracy for every k is still recorded, but only as a diagnostic curve
# for the plot below - it plays no part in selecting k.
k_acc = []
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    k_acc.append(clf.score(X_test, y_test))

# k_range starts at 1, so the accuracy for k sits at position k - 1
best_acc = k_acc[best_k - 1]

print(f"Accuracy for KNN (k={best_k}): {best_acc:.4f}")

In [None]:
# Final KNN model with the selected k (fit() returns the estimator itself),
# followed by its confusion matrix on the test season
knn_clf = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
plot_conf_mat(knn_clf, f"KNN (k={best_k}) Confusion Matrix")

In [None]:
# k curve: test accuracy for every k, with an inset zoomed in around the selected k
fig = plt.figure()
ax1 = fig.add_axes([0, 0, 1, 1])
ax2 = fig.add_axes([0.5, 0.3, 0.4, 0.2])

ax1.plot(k_range, k_acc, 'b')
ax1.set_xlabel('k-Value')
ax1.set_ylabel('Test Accuracy')
ax1.set_title('KNN test accuracy by k')

# The inset window follows best_k (two neighbours either side, clipped to the
# searched range) and the accuracies inside that window, so it stays meaningful
# when the data or the selected k change.
zoom_lo = max(k_range[0], best_k - 2)
zoom_hi = min(k_range[-1], best_k + 2)
zoom_acc = [acc for k, acc in zip(k_range, k_acc) if zoom_lo <= k <= zoom_hi]
y_pad = 0.01

ax2.plot(k_range, k_acc, 'b', marker='o', markersize=6, markerfacecolor="purple")
ax2.set_xlabel('k-Value')
ax2.set_xlim([zoom_lo - 0.5, zoom_hi + 0.5])  # half-step margin keeps edge markers visible
ax2.set_ylabel('Test Accuracy')
ax2.set_ylim([min(zoom_acc) - y_pad, max(zoom_acc) + y_pad])

plt.show()

In [None]:
# Decision Tree with a fixed depth of 3 (fixed seed so the fit is repeatable)
dt_clf = tree.DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
dt_acc = dt_clf.score(X_test, y_test)
print(f"Accuracy for Decision Tree (max_depth=3): {dt_acc:.4f}")

# Text rendering of the learned split rules
print(tree.export_text(dt_clf, feature_names=feature_cols))

# (Optional) graphical rendering of the same tree
fig, ax = plt.subplots(figsize=(18, 12))
tree.plot_tree(
    dt_clf,
    feature_names=feature_cols,
    class_names=list(le.classes_),
    filled=True,
    rounded=True,
    precision=2,
    ax=ax,
)
ax.set_title("Decision Tree (max_depth=3)")
plt.show()

In [None]:
# Confusion matrix and classification report for the depth-3 tree
plot_conf_mat(dt_clf, "Decision Tree Confusion Matrix")

In [None]:
# Choose max_depth from 1..10 by 10-fold cross-validation on the training data
depth_candidates = list(range(1, 11))
param_grid = {'max_depth': depth_candidates}
grid_search = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=10,
).fit(X_train, y_train)

best_max_depth = grid_search.best_params_['max_depth']
print("Best max_depth via GridSearchCV:", best_max_depth)

In [None]:
# Train a fresh tree at the cross-validated depth and evaluate it on the test season
dt_best = tree.DecisionTreeClassifier(max_depth=best_max_depth, random_state=42).fit(X_train, y_train)
print(f"Accuracy for Decision Tree (best max_depth={best_max_depth}): {dt_best.score(X_test, y_test):.4f}")
plot_conf_mat(dt_best, f"Decision Tree (max_depth={best_max_depth}) Confusion Matrix")

In [None]:
# Gaussian Naive Bayes with default settings: fit on train, score on the test season
nb_clf = GaussianNB().fit(X_train, y_train)
nb_acc = nb_clf.score(X_test, y_test)
print(f"Accuracy for Naive Bayes: {nb_acc:.4f}")
plot_conf_mat(nb_clf, "Naive Bayes Confusion Matrix")

In [None]:
# Save ML Predictions to Excel
# Decode each model's test-set predictions back to text labels
# (the decision-tree column comes from the tuned tree, dt_best)
knn_preds, dt_preds, nb_preds = (
    le.inverse_transform(fitted.predict(X_test))
    for fitted in (knn_clf, dt_best, nb_clf)
)

# One row per test-season player: the true label next to the three predictions
results_df = pd.DataFrame({
    'Player': players_test,
    'True Label': true_labels,
    'KNN Prediction': knn_preds,
    'Decision Tree Prediction': dt_preds,
    'Naive Bayes Prediction': nb_preds,
})

ml_results_out = ANALYSIS_DIR / "ML_Classification_analysis_dataset.xlsx"
results_df.to_excel(ml_results_out, index=False)
print(f"\nClassification results saved to: {ml_results_out}")