In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
import os
import sys
sys.path.insert(0, str(Path.cwd().resolve().parent))  # add repo root to sys.path

from project_paths import (
    RAW_DIR, EDITED_DIR, FINAL_DIR, ANALYSIS_DIR, TEMP_DIR, DATA_DIR, ROOKIES_PATH
)

# Alias for the data directory imported from project_paths.
BASE_DIR = DATA_DIR

# FINAL_DIR (season inputs), ANALYSIS_DIR (outputs) and ROOKIES_PATH
# (rookies list: Player, Season=rookie season) are used exactly as imported
# from project_paths; only the output directory needs to be created here.
ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load Data
# Prefer files whose name carries a "Season" token; if there are none,
# fall back to anything ending in "final.xlsx".
final_files = sorted(FINAL_DIR.glob("*Season*_final.xlsx"))
if not final_files:
    final_files = sorted(FINAL_DIR.glob("*final.xlsx"))
if not final_files:
    raise FileNotFoundError(f"No season files found in {FINAL_DIR}")

In [None]:
# Extract the season token (e.g. "2024-2025") from a file name
def parse_season_token(name: str) -> str:
    """Return the first ``YYYY-YYYY`` token found in ``name``.

    Returns the literal string "UNKNOWN" when ``name`` contains no such token.
    """
    match = re.search(r"(\d{4}-\d{4})", name)
    if match is None:
        return "UNKNOWN"
    return match.group(1)

# Get a season's starting year (e.g. 2024 for "2024-2025"); used to order seasons
def season_start(season: str) -> int:
    """Return the first year of a ``YYYY-YYYY`` season string as an int.

    The value is converted with ``str()`` and must begin with the pattern;
    anything else yields the sentinel ``-10**9`` so it sorts below every
    real season.
    """
    hit = re.match(r"(\d{4})-(\d{4})", str(season))
    if hit is None:
        return -10**9
    return int(hit.group(1))

# Determines the lookup table version: ST (starters and sixth men) or RP (role players and benchwarmers)
def player_role(mp: float) -> int:
    """Map minutes played to a role code.

    Returns 2 (starter) for ``mp >= 24.0``, 1 (rotation) for ``mp >= 14.4``,
    and 0 (bench) otherwise, including when ``mp`` is missing.
    """
    if pd.isna(mp):
        return 0
    # Tiers are checked from the highest cutoff down; the first match wins.
    for cutoff, role in ((24.0, 2), (14.4, 1)):
        if mp >= cutoff:
            return role
    return 0

In [None]:
# ---- Threshold tables----
# Score ("_RS") column names written by the scoring functions.
# BASE_RS_COLS receives the output of base_check_stat (one entry per stat, TOV included).
BASE_RS_COLS  = ['FG%_RS','3P_RS','FT%_RS','TRB_RS','AST_RS','STL_RS','BLK_RS','TOV_RS','PTS_RS']
# UPPER_RS_COLS receives the output of upper_check_stat; TOV is scored separately by check_tov.
UPPER_RS_COLS = ['FG%_RS','3P_RS','FT%_RS','TRB_RS','AST_RS','STL_RS','BLK_RS','PTS_RS']  # (no TOV)
# LOWER_RS_COLS receives the output of lower_check_stats (only the two percentage stats).
LOWER_RS_COLS = ['FG%_RS','FT%_RS']

# Base thresholds used by base_check_stat when Player_Role == 2.
# Key:   (Pos1, Pos2) tuple.
# Value: thresholds matched positionally to FG%, 3P, FT%, TRB, AST, STL, BLK, TOV, PTS;
#        a stat at or above its threshold scores 1, otherwise 0.
# The TOV entry is compared like the others, but the scoring cell later overwrites
# TOV_RS with the result of check_tov.
base_threshold_ST = {
    ('PG','PG'):[0.450,1.5,0.800,3.5,5.5,1.5,0.5,1.5,13.5],('PG','SG'):[0.450,1.5,0.800,3.5,3.5,1.5,0.5,1.5,13.5],
    ('PG','SF'):[0.450,1.5,0.800,3.5,4.5,1.5,0.5,1.5,13.5],('PG','PF'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,13.5],
    ('PG','C') :[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,13.5],
    ('SG','PG'):[0.450,1.5,0.800,3.5,3.5,1.5,0.5,1.5,13.5],('SG','SG'):[0.450,1.5,0.800,4.5,3.5,1.5,0.5,1.5,13.5],
    ('SG','SF'):[0.450,1.5,0.800,4.5,3.5,1.5,0.5,1.5,13.5],('SG','PF'):[0.450,0.5,0.775,4.5,2.5,0.5,0.5,1.5,13.5],
    ('SG','C') :[0.450,0.5,0.775,4.5,2.5,0.5,0.5,1.5,13.5],
    ('SF','PG'):[0.450,1.5,0.800,3.5,4.5,1.5,0.5,1.5,13.5],('SF','SG'):[0.450,1.5,0.800,4.5,3.5,1.5,0.5,1.5,13.5],
    ('SF','SF'):[0.450,1.5,0.800,5.5,4.5,1.5,0.5,1.5,13.5],('SF','PF'):[0.450,0.5,0.775,5.5,2.5,0.5,1.5,1.5,13.5],
    ('SF','C') :[0.450,0.5,0.775,5.5,2.5,0.5,1.5,1.5,13.5],
    ('PF','PG'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,13.5],('PF','SG'):[0.450,0.5,0.775,4.5,2.5,0.5,0.5,1.5,13.5],
    ('PF','SF'):[0.450,0.5,0.775,5.5,2.5,0.5,1.5,1.5,13.5],('PF','PF'):[0.475,0.5,0.775,6.5,2.5,0.5,1.5,0.5,13.5],
    ('PF','C') :[0.475,0.5,0.775,6.5,2.5,0.5,1.5,0.5,13.5],
    ('C','PG') :[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,13.5],('C','SG'):[0.450,0.5,0.775,4.5,2.5,0.5,0.5,1.5,13.5],
    ('C','SF') :[0.450,0.5,0.775,5.5,2.5,0.5,1.5,1.5,13.5],('C','PF'):[0.475,0.5,0.775,6.5,2.5,0.5,1.5,0.5,13.5],
    ('C','C')  :[0.475,0.5,0.775,6.5,2.5,0.5,1.5,0.5,13.5]
}
# Base thresholds used by base_check_stat when Player_Role is not 2.
# Same layout as base_threshold_ST: (Pos1, Pos2) -> thresholds matched positionally
# to FG%, 3P, FT%, TRB, AST, STL, BLK, TOV, PTS.
base_threshold_RP = {
    ('PG','PG'):[0.450,1.5,0.800,3.5,4.5,1.5,0.5,1.5,9.5], ('PG','SG'):[0.450,1.5,0.800,3.5,2.5,1.5,0.5,1.5,9.5],
    ('PG','SF'):[0.450,1.5,0.800,3.5,3.5,1.5,0.5,1.5,9.5], ('PG','PF'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5],
    ('PG','C') :[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5],
    ('SG','PG'):[0.450,1.5,0.800,3.5,2.5,1.5,0.5,1.5,9.5], ('SG','SG'):[0.450,1.5,0.800,3.5,2.5,1.5,0.5,1.5,9.5],
    ('SG','SF'):[0.450,1.5,0.800,3.5,2.5,1.5,0.5,1.5,9.5], ('SG','PF'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5],
    ('SG','C') :[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5],
    ('SF','PG'):[0.450,1.5,0.800,3.5,3.5,1.5,0.5,1.5,9.5], ('SF','SG'):[0.450,1.5,0.800,3.5,2.5,1.5,0.5,1.5,9.5],
    ('SF','SF'):[0.450,1.5,0.800,4.5,3.5,1.5,0.5,1.5,9.5], ('SF','PF'):[0.450,0.5,0.775,4.5,2.5,0.5,1.5,1.5,9.5],
    ('SF','C') :[0.450,0.5,0.775,4.5,2.5,0.5,1.5,1.5,9.5],
    ('PF','PG'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5], ('PF','SG'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5],
    ('PF','SF'):[0.450,0.5,0.775,4.5,2.5,0.5,1.5,1.5,9.5], ('PF','PF'):[0.475,0.5,0.775,5.5,2.5,0.5,1.5,0.5,9.5],
    ('PF','C') :[0.475,0.5,0.775,5.5,2.5,0.5,1.5,0.5,9.5],
    ('C','PG') :[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5], ('C','SG'):[0.450,0.5,0.775,3.5,2.5,0.5,0.5,1.5,9.5],
    ('C','SF') :[0.450,0.5,0.775,4.5,2.5,0.5,1.5,1.5,9.5], ('C','PF'):[0.475,0.5,0.775,5.5,2.5,0.5,1.5,0.5,9.5],
    ('C','C')  :[0.475,0.5,0.775,5.5,2.5,0.5,1.5,0.5,9.5]
}
# Upper thresholds used by upper_check_stat when Player_Role == 2.
# Key:   (Pos1, Pos2) tuple.
# Value: thresholds matched positionally to FG%, 3P, FT%, TRB, AST, STL, BLK, PTS (no TOV);
#        a stat at or above its threshold has its score raised to 2.
# NOTE(review): unlike the base tables and upper_threshold_RP, this table is not symmetric
# in (Pos1, Pos2) for TRB: ('SG','PF')=7.5 vs ('PF','SG')=6.5, ('SG','C')=7.5 vs
# ('C','SG')=6.5, and ('SF','PF')=8.5 vs ('PF','SF')=7.5. Confirm whether this is
# intentional or a transcription slip.
upper_threshold_ST = {
    ('PG','PG'):[0.475,2.5,0.850,6.5,7.5,2.5,1.5,17.5],('PG','SG'):[0.475,2.5,0.850,6.5,5.5,2.5,1.5,17.5],
    ('PG','SF'):[0.475,2.5,0.850,6.5,6.5,2.5,1.5,17.5],('PG','PF'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,17.5],
    ('PG','C') :[0.475,1.5,0.825,6.5,4.5,1.5,1.5,17.5],
    ('SG','PG'):[0.475,2.5,0.850,6.5,5.5,2.5,1.5,17.5],('SG','SG'):[0.475,2.5,0.850,7.5,5.5,2.5,1.5,17.5],
    ('SG','SF'):[0.475,2.5,0.850,7.5,5.5,2.5,1.5,17.5],('SG','PF'):[0.475,1.5,0.825,7.5,4.5,1.5,1.5,17.5],
    ('SG','C') :[0.475,1.5,0.825,7.5,4.5,1.5,1.5,17.5],
    ('SF','PG'):[0.475,2.5,0.850,6.5,6.5,2.5,1.5,17.5],('SF','SG'):[0.475,2.5,0.850,7.5,5.5,2.5,1.5,17.5],
    ('SF','SF'):[0.475,2.5,0.850,8.5,6.5,2.5,1.5,17.5],('SF','PF'):[0.475,1.5,0.825,8.5,4.5,1.5,2.5,17.5],
    ('SF','C') :[0.475,1.5,0.825,8.5,4.5,1.5,2.5,17.5],
    ('PF','PG'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,17.5],('PF','SG'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,17.5],
    ('PF','SF'):[0.475,1.5,0.825,7.5,4.5,1.5,2.5,17.5],('PF','PF'):[0.500,1.5,0.825,9.5,4.5,1.5,2.5,17.5],
    ('PF','C') :[0.500,1.5,0.825,9.5,4.5,1.5,2.5,17.5],
    ('C','PG') :[0.475,1.5,0.825,6.5,4.5,1.5,1.5,17.5],('C','SG'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,17.5],
    ('C','SF') :[0.475,1.5,0.825,8.5,4.5,1.5,2.5,17.5],('C','PF'):[0.500,1.5,0.825,9.5,4.5,1.5,2.5,17.5],
    ('C','C')  :[0.500,1.5,0.825,9.5,4.5,1.5,2.5,17.5]
}
# Upper thresholds used by upper_check_stat when Player_Role is not 2.
# Same layout as upper_threshold_ST: (Pos1, Pos2) -> thresholds matched positionally
# to FG%, 3P, FT%, TRB, AST, STL, BLK, PTS (no TOV).
upper_threshold_RP = {
    ('PG','PG'):[0.475,2.5,0.850,6.5,6.5,2.5,1.5,13.5],('PG','SG'):[0.475,2.5,0.850,6.5,4.5,2.5,1.5,13.5],
    ('PG','SF'):[0.475,2.5,0.850,6.5,5.5,2.5,1.5,13.5],('PG','PF'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],
    ('PG','C') :[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],
    ('SG','PG'):[0.475,2.5,0.850,6.5,4.5,2.5,1.5,13.5],('SG','SG'):[0.475,2.5,0.850,6.5,4.5,2.5,1.5,13.5],
    ('SG','SF'):[0.475,2.5,0.850,6.5,4.5,2.5,1.5,13.5],('SG','PF'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],
    ('SG','C') :[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],
    ('SF','PG'):[0.475,2.5,0.850,6.5,5.5,2.5,1.5,13.5],('SF','SG'):[0.475,2.5,0.850,6.5,4.5,2.5,1.5,13.5],
    ('SF','SF'):[0.475,2.5,0.850,7.5,5.5,2.5,1.5,13.5],('SF','PF'):[0.475,1.5,0.825,7.5,4.5,1.5,2.5,13.5],
    ('SF','C') :[0.475,1.5,0.825,7.5,4.5,1.5,2.5,13.5],
    ('PF','PG'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],('PF','SG'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],
    ('PF','SF'):[0.475,1.5,0.825,7.5,4.5,1.5,2.5,13.5],('PF','PF'):[0.500,1.5,0.825,8.5,4.5,1.5,2.5,13.5],
    ('PF','C') :[0.500,1.5,0.825,8.5,4.5,1.5,2.5,13.5],
    ('C','PG') :[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],('C','SG'):[0.475,1.5,0.825,6.5,4.5,1.5,1.5,13.5],
    ('C','SF') :[0.475,1.5,0.825,7.5,4.5,1.5,2.5,13.5],('C','PF'):[0.500,1.5,0.825,8.5,4.5,1.5,2.5,13.5],
    ('C','C')  :[0.500,1.5,0.825,8.5,4.5,1.5,2.5,13.5]
}
# Lower thresholds used by lower_check_stats.
# Key:   (Pos1, Pos2) tuple.
# Value: [FG% floor, FT% floor]; a stat strictly below its floor has its score set to -1.
lower_threshold_ST = {
    ('PG','PG'):[0.425,0.750],('PG','SG'):[0.425,0.750],('PG','SF'):[0.425,0.750],
    ('PG','PF'):[0.425,0.725],('PG','C'):[0.425,0.725],
    ('SG','PG'):[0.425,0.750],('SG','SG'):[0.425,0.750],('SG','SF'):[0.425,0.750],
    ('SG','PF'):[0.425,0.725],('SG','C'):[0.425,0.725],
    ('SF','PG'):[0.425,0.750],('SF','SG'):[0.425,0.750],('SF','SF'):[0.425,0.750],
    ('SF','PF'):[0.425,0.725],('SF','C'):[0.425,0.725],
    ('PF','PG'):[0.425,0.725],('PF','SG'):[0.425,0.725],('PF','SF'):[0.425,0.725],
    ('PF','PF'):[0.450,0.725],('PF','C'):[0.450,0.725],
    ('C','PG'):[0.425,0.725],('C','SG'):[0.425,0.725],('C','SF'):[0.425,0.725],
    ('C','PF'):[0.450,0.725],('C','C'):[0.450,0.725]
}
# The RP name is bound to the very same dict object as ST (an alias, not a copy),
# so mutating one would change the other.
lower_threshold_RP = lower_threshold_ST  # same

In [None]:
# Steps/Functions for scoring (and adjusting of scores) player averages in each categories
def base_check_stat(row):
    """Score one player row against the base thresholds.

    The ST table is used when ``row['Player_Role'] == 2``, the RP table
    otherwise; the entry is looked up by ``(Pos1, Pos2)``.

    Returns a list with one 0/1 flag per stat, in the order
    FG%, 3P, FT%, TRB, AST, STL, BLK, TOV, PTS: 1 when the stat is at or
    above its threshold, 0 when it is below or missing. When the position
    pair has no table entry, every flag is 0.
    """
    table = base_threshold_ST if row['Player_Role'] == 2 else base_threshold_RP
    limits = table.get((row.get('Pos1'), row.get('Pos2')))
    if limits is None:
        return [0]*len(BASE_RS_COLS)

    flags = []
    for name, limit in zip(('FG%','3P','FT%','TRB','AST','STL','BLK','TOV','PTS'), limits):
        value = row.get(name, np.nan)
        if pd.isna(value):
            flags.append(0)
        else:
            flags.append(int(value >= limit))
    return flags

def upper_check_stat(row):
    """Raise a row's scores to 2 where the stat clears the upper threshold.

    The ST table is used when ``row['Player_Role'] == 2``, the RP table
    otherwise; the entry is looked up by ``(Pos1, Pos2)``.

    Returns the values for UPPER_RS_COLS as a list, in the order
    FG%, 3P, FT%, TRB, AST, STL, BLK, PTS. A score becomes 2 when its stat
    is at or above the threshold; otherwise (below, missing, or no table
    entry for the position pair) the current score is kept.
    """
    table = upper_threshold_ST if row['Player_Role'] == 2 else upper_threshold_RP
    limits = table.get((row.get('Pos1'), row.get('Pos2')))
    current = list(row[UPPER_RS_COLS])
    if limits is None:
        return current

    upgraded = []
    for name, limit, score in zip(('FG%','3P','FT%','TRB','AST','STL','BLK','PTS'), limits, current):
        value = row.get(name, np.nan)
        if pd.notna(value) and value >= limit:
            upgraded.append(2)
        else:
            upgraded.append(score)
    return upgraded

def lower_check_stats(row):
    """Penalise FG% and FT% scores that fall below the lower thresholds.

    The ST table is used when ``row['Player_Role'] == 2``, the RP table
    otherwise; the entry is looked up by ``(Pos1, Pos2)``.

    Returns the values for LOWER_RS_COLS as a list ``[FG%, FT%]``. A score
    becomes -1 when its stat is present and strictly below the threshold;
    otherwise (at/above, missing, or no table entry) the current score is kept.
    """
    table = lower_threshold_ST if row['Player_Role'] == 2 else lower_threshold_RP
    floors = table.get((row.get('Pos1'), row.get('Pos2')))
    scores = list(row[LOWER_RS_COLS])
    if floors is None:
        return scores

    for position, (name, floor) in enumerate(zip(('FG%','FT%'), floors)):
        value = row.get(name, np.nan)
        if pd.isna(value):
            continue
        if value < floor:
            scores[position] = -1
    return scores

def check_tov(row):
    """Score turnovers for one player row (lower is better).

    Position pairs made up only of PF/C use the cutoffs 0.5 / 1.0 / 1.5;
    every other pair uses 1.5 / 2.0 / 2.5. The score is 2 below the first
    cutoff, 1 below the second, 0 below the third and -1 otherwise.
    A missing TOV value scores 0.
    """
    turnovers = row.get('TOV', np.nan)
    if pd.isna(turnovers):
        return 0

    big_pairs = {('PF','PF'),('PF','C'),('C','PF'),('C','C')}
    if (row.get('Pos1'), row.get('Pos2')) in big_pairs:
        cutoffs = (0.5, 1.0, 1.5)
    else:
        cutoffs = (1.5, 2.0, 2.5)

    # Walk the cutoffs from strictest to loosest; the first one cleared wins.
    for score, cutoff in zip((2, 1, 0), cutoffs):
        if turnovers < cutoff:
            return score
    return -1

In [None]:
# Score each player per category in each season file
scored_frames = []
for path in final_files:
    season = parse_season_token(path.name)
    df = pd.read_excel(path)

    # Stop early when the sheet lacks a column the scoring functions read
    required = ['Player','MP','Pos1','Pos2','FG%','3P','FT%','TRB','AST','STL','BLK','TOV','PTS']
    absent = [name for name in required if name not in df.columns]
    if absent:
        raise ValueError(f"{path.name} missing columns: {absent}")

    # Coerce stat columns to numbers; cells that cannot be parsed become NaN
    for name in ['MP','FG%','3P','FT%','TRB','AST','STL','BLK','TOV','PTS']:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    # A percentage column whose maximum exceeds 1.0 is treated as 0-100 and rescaled to 0-1
    for name in ['FG%','FT%']:
        if df[name].dropna().max() > 1.0:
            df[name] = df[name]/100.0

    # Order matters: base flags first, then upper (-> 2) and lower (-> -1) adjustments,
    # and finally TOV_RS is replaced by the dedicated turnover score.
    df['Player_Role'] = df['MP'].map(player_role)
    df[BASE_RS_COLS]  = df.apply(base_check_stat,  axis=1, result_type='expand')
    df[UPPER_RS_COLS] = df.apply(upper_check_stat, axis=1, result_type='expand')
    df[LOWER_RS_COLS] = df.apply(lower_check_stats, axis=1, result_type='expand')
    df['TOV_RS'] = df.apply(check_tov, axis=1)

    # Season total: BASE_RS_COLS lists exactly the nine per-stat score columns
    df['RANK_SCORE'] = df[BASE_RS_COLS].sum(axis=1)

    df.insert(0, 'Season', season)
    scored_frames.append(df)

# One long frame with every season stacked
scored = pd.concat(scored_frames, ignore_index=True)

In [None]:
# Builds full Player×Season grid
all_players = scored['Player'].dropna().unique()
# Most recent season first; tokens that do not parse sort last (season_start sentinel).
all_seasons = sorted({parse_season_token(p.name) for p in final_files},
                     key=season_start, reverse=True)

full_index = pd.MultiIndex.from_product([all_seasons, all_players], names=['Season','Player'])
grid = pd.DataFrame(index=full_index).reset_index()

# Merge scored data into full grid (left keeps full Season×Player)
grid = grid.merge(scored, on=['Season','Player'], how='left')

# Fill check columns with 0 where a player didn't play that season
for col in BASE_RS_COLS + ['RANK_SCORE']:
    if col in grid.columns:
        grid[col] = grid[col].fillna(0).astype(int)

# 'Age' is not one of the columns validated when the season files are read, so it
# may be absent from every file. Create it as all-missing in that case so the
# imputation below (and any later grid['Age'] access) does not raise KeyError.
if 'Age' not in grid.columns:
    grid['Age'] = np.nan

# ---- Age imputation for latest season only (generalized) ----
latest_season = all_seasons[0]  # most recent by start-year desc
start_map = {s: season_start(s) for s in all_seasons}
latest_year = start_map[latest_season]

# Build most recent known age before the latest season for each player:
# player -> (age, start year of the season that age was recorded in)
age_lookup = {}
for pl, sub in grid[grid['Age'].notna()].groupby('Player', dropna=False):
    sub_years = sub['Season'].map(start_map)
    # Keep the row with the max start-year strictly less than latest
    prior = sub[sub_years < latest_year]
    if not prior.empty:
        row = prior.iloc[prior['Season'].map(start_map).argmax()]
        age_lookup[pl] = (row['Age'], start_map[row['Season']])

# Fill missing Age in latest season: last known age plus the seasons elapsed since
mask_latest = (grid['Season'] == latest_season) & (grid['Age'].isna())
for idx in grid[mask_latest].index:
    pl = grid.at[idx, 'Player']
    if pl in age_lookup:
        prev_age, prev_year = age_lookup[pl]
        est = pd.to_numeric(prev_age, errors='coerce')
        if pd.notna(est):
            grid.at[idx, 'Age'] = est + (latest_year - prev_year)

In [None]:
# Composite per-player scoring
# Base recency weights: top 5 most-recent seasons -> 5/15,4/15,3/15,2/15,1/15; others 0
# NOTE(review): with fewer than five season files these weights sum to less than 1
# (e.g. three seasons -> 12/15) — confirm that is acceptable for the score cutoffs.
top5 = all_seasons[:5]
base_weights = {s: w/15 for s, w in zip(top5, [5,4,3,2,1])}

# Build weight_frame over all (Player, Season) in the grid
wf = grid[['Player','Season']].drop_duplicates().copy()
wf['BaseWeight'] = wf['Season'].map(base_weights).fillna(0.0)
wf['SeasonStartYear'] = wf['Season'].map(season_start)

# Load rookies list (optional; only .xlsx/.xls/.csv are read, anything else is ignored)
rookies = None
if ROOKIES_PATH.exists():
    rookies_suffix = ROOKIES_PATH.suffix.lower()
    if rookies_suffix in ('.xlsx','.xls'):
        rookies = pd.read_excel(ROOKIES_PATH)
    elif rookies_suffix == '.csv':
        rookies = pd.read_csv(ROOKIES_PATH)

if rookies is not None and {'Player','Season'}.issubset(rookies.columns):
    rookies = rookies.rename(columns={'Season':'RookieSeason'})
    rookies['RookieSeason'] = rookies['RookieSeason'].astype(str).str.strip()
    # Coerce rather than astype(int): a blank or malformed season becomes NaN
    # (and the row is dropped) instead of aborting the whole notebook.
    rookies['RookieStartYear'] = pd.to_numeric(
        rookies['RookieSeason'].str.split('-').str[0], errors='coerce'
    )
    # One row per player (earliest listed season wins). A duplicated player would
    # otherwise fan out the merge below and double-count that player's weights.
    rookies = (rookies.dropna(subset=['Player','RookieStartYear'])
                      .sort_values('RookieStartYear')
                      .drop_duplicates(subset='Player', keep='first'))
    wf = wf.merge(rookies[['Player','RookieStartYear']], on='Player', how='left',
                  validate='many_to_one')
else:
    wf['RookieStartYear'] = np.nan  # no rookie overrides available

# Rank within rookie-and-later seasons (pre-rookie -> 0); NaN for players not on the list
is_listed = wf['RookieStartYear'].notna()
wf['Rank'] = np.where(
    is_listed,
    np.maximum(0, wf['SeasonStartYear'] - wf['RookieStartYear'] + 1),
    np.nan
)

# Finalize weights: for listed rookies use triangular normalized ranks
# (Rank / (N*(N+1)/2), N = the player's highest rank); others use BaseWeight.
# transform() returns a result aligned to wf's rows regardless of how many players
# there are, which groupby.apply with a Series-returning function does not guarantee.
max_rank = wf.groupby('Player')['Rank'].transform('max')
use_rank = max_rank.notna() & (max_rank > 0)
rank_denom = (max_rank*(max_rank + 1)/2.0).where(use_rank)
wf['Weight'] = (wf['Rank'].fillna(0)/rank_denom).where(use_rank, wf['BaseWeight'])
wf['Weight'] = wf['Weight'].fillna(0.0)

# Compute weighted composite per metric
rs_cols = ['FG%_RS','3P_RS','FT%_RS','TRB_RS','AST_RS',
              'STL_RS','BLK_RS','TOV_RS','PTS_RS']

# Attach each (Player, Season) weight once, scale every score column by it,
# then sum per player (min_count=1 keeps an all-missing player as NaN, not 0).
weighted = grid[['Player','Season'] + rs_cols].merge(
    wf[['Player','Season','Weight']], on=['Player','Season'], how='left',
    validate='many_to_one'
)
contributions = weighted[rs_cols].mul(weighted['Weight'], axis=0)
analysis = (contributions.groupby(weighted['Player'], dropna=False)
                         .sum(min_count=1)
                         .rename_axis('Player')
                         .reset_index())

# Attach Age in latest season (after imputation above)
age_latest = (grid[grid['Season'] == latest_season][['Player','Age']]
              .drop_duplicates(subset='Player'))
analysis = analysis.merge(age_latest, on='Player', how='left')

# Composite total (sum across checks)
analysis['RANK_SCORE'] = analysis[rs_cols].sum(axis=1).round(3)

In [None]:
# Classification based on RANK_SCORE
# Uses the (already rounded) RANK_SCORE: >=5 STARTER, <4 BENCHWARMER, else ROLEPLAYER
rank_score = analysis['RANK_SCORE']
analysis['Classification'] = np.where(
    rank_score >= 5,
    'STARTER',
    np.where(rank_score < 4, 'BENCHWARMER', 'ROLEPLAYER')
)

In [None]:
# Save outputs
panel_path   = ANALYSIS_DIR / "RANK_SCORE_full_grid.xlsx"
summary_path = ANALYSIS_DIR / "RANK_SCORE_analysis_dataset.xlsx"

# Full Season×Player panel, written as it stands
grid.to_excel(panel_path, index=False)

# Per-player summary: identifying columns, per-stat composites, total and label
summary_cols = ['Player','Age'] + rs_cols + ['RANK_SCORE','Classification']
summary = analysis[summary_cols].sort_values('Player')
summary.to_excel(summary_path, index=False)

print(f"Saved full grid  -> {panel_path}")
print(f"Saved per-player -> {summary_path}")

In [None]:
# Full Grid Quick view
from IPython.display import display

# First 12 grid rows, restricted to the identifying and score columns
preview_cols = ['Season','Player','Pos1','Pos2','MP'] + rs_cols + ['RANK_SCORE']
display(grid[preview_cols].head(12))
# First 10 rows of the per-player summary
display(analysis.head(10))

In [None]:
# Check a player's composite score computation
who = "CUNNINGHAM, Cade"  # change me

# That player's per-season scores next to the weight each season received
player_scores  = grid.loc[grid['Player'] == who, ['Season','MP'] + rs_cols]
player_weights = wf.loc[wf['Player'] == who, ['Season','Weight']]
dbg = player_scores.merge(player_weights, on='Season', how='left').sort_values('Season')

# Example contribution for one stat: season score × season weight
dbg['FG%_Weighted'] = dbg['FG%_RS'] * dbg['Weight']
dbg