# 01 — Combine per-season Advanced/Misc Team CSVs
Input files (in `../data`): `nba_misc_2015.csv ... nba_misc_2024.csv`  
Output: `../data/combined_team_stats.csv`

In [14]:
from pathlib import Path

# Find the data folder automatically
if Path("data").exists():
    DATA = Path("data")              # notebook at repo root
elif Path("../data").exists():
    DATA = Path("../data")           # notebook inside notebooks/
else:
    raise FileNotFoundError("Could not find a 'data' folder.")

print("Using data folder:", DATA.resolve())
files = sorted(DATA.glob("nba_misc_*.csv"))
print("Found files:", [f.name for f in files])


Using data folder: C:\Users\jeff\Documents\nba-win-predictor\data
Found files: ['nba_misc_2014.csv', 'nba_misc_2015.csv', 'nba_misc_2016.csv', 'nba_misc_2017.csv', 'nba_misc_2018.csv', 'nba_misc_2019.csv', 'nba_misc_2020.csv', 'nba_misc_2021.csv', 'nba_misc_2022.csv', 'nba_misc_2023.csv', 'nba_misc_2024.csv', 'nba_misc_2025.csv']


In [15]:
import re
import pandas as pd

def read_br_csv(path):
    # Basketball-Reference CSVs have a banner row; the real header is row 2
    return pd.read_csv(path, header=1)

frames = []
for f in files:
    season = int(re.search(r"(\d{4})", f.name).group(1))
    df = read_br_csv(f)
    df["Season"] = season
    frames.append(df)

raw = pd.concat(frames, ignore_index=True)
print(raw.shape)
print(raw.columns.tolist()[:25])  # quick peek at columns


(372, 32)
['Rk', 'Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'Unnamed: 17', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'Unnamed: 22', 'eFG%.1', 'TOV%.1']


In [16]:
def snakeify(s: str) -> str:
    s = s.strip().lower().replace('%','pct')
    for ch in [' ', '/', '-', '(', ')', '.']:
        s = s.replace(ch, '_')
    return '_'.join(t for t in s.split('_') if t)

raw.columns = [snakeify(c) for c in raw.columns]

# small fixes for common variations
if "3par" in raw.columns: 
    raw = raw.rename(columns={"3par": "threepar"})
if "ft_per_fga" in raw.columns:
    raw = raw.rename(columns={"ft_per_fga": "ft_fga"})

raw.columns[:20]

Index(['rk', 'team', 'age', 'w', 'l', 'pw', 'pl', 'mov', 'sos', 'srs', 'ortg',
       'drtg', 'nrtg', 'pace', 'ftr', 'threepar', 'tspct', 'unnamed:_17',
       'efgpct', 'tovpct'],
      dtype='object')

In [17]:
rename_map = {
    "team":"team","w":"wins","l":"losses","pace":"pace",
    "ortg":"ortg","off_rtg":"ortg","offensive_rating":"ortg",
    "drtg":"drtg","def_rtg":"drtg","defensive_rating":"drtg",
    "nrtg":"net_rating","net_rtg":"net_rating","netrating":"net_rating","net_rating":"net_rating",
    "srs":"srs","ts_pct":"ts_pct","efg_pct":"efg_pct","threepar":"threepar","ftr":"ftr",
    "tov_pct":"tov_pct","orb_pct":"orb_pct","drb_pct":"drb_pct",
    "opp_efg_pct":"opp_efg_pct","opp_tov_pct":"opp_tov_pct","opp_ft_fga":"opp_ft_fga","ft_fga":"ft_fga"
}
for k, v in list(rename_map.items()):
    if k in raw.columns:
        raw = raw.rename(columns={k: v})

df = raw.copy()
if "team" in df.columns:
    df = df[~df["team"].str.contains("league average|playoffs", case=False, na=False)]

df.shape, df.head(3)


((360, 32),
     rk                    team   age  wins  losses  pw  pl   mov   sos   srs  \
 0  1.0      San Antonio Spurs*  28.9  62.0    20.0  61  21  7.72  0.28  8.00   
 1  2.0   Los Angeles Clippers*  28.1  57.0    25.0  59  23  6.98  0.30  7.27   
 2  3.0  Oklahoma City Thunder*  26.2  59.0    23.0  58  24  6.34  0.32  6.66   
 
    ...  unnamed:_22  efgpct_1  tovpct_1  drbpct  ft_fga_1  unnamed:_27  \
 0  ...          NaN     0.482      12.8    76.4     0.184          NaN   
 1  ...          NaN     0.484      13.8    72.5     0.222          NaN   
 2  ...          NaN     0.488      13.9    75.6     0.221          NaN   
 
                      arena  attend  attend_g  season  
 0              AT&T Center  755031     18415    2014  
 1           STAPLES Center  787692     19212    2014  
 2  Chesapeake Energy Arena  746323     18203    2014  
 
 [3 rows x 32 columns])

In [18]:
# choose columns that exist in your data
keep = [
    "team","season","wins","losses",
    "pace","ortg","drtg","net_rating","srs",
    "ts_pct","efg_pct","threepar","ftr","tov_pct","orb_pct","drb_pct",
    "opp_efg_pct","opp_tov_pct","opp_ft_fga","ft_fga"
]
keep = [c for c in keep if c in df.columns]
print("Keeping:", keep)

df = df[keep].copy()
df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
df = df.sort_values(["season","team"]).reset_index(drop=True)

# SAVE here (uses the DATA folder we detected earlier)
out_path = DATA / "combined_team_stats.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path.resolve())
df.tail(3)


Keeping: ['team', 'season', 'wins', 'losses', 'pace', 'ortg', 'drtg', 'net_rating', 'srs', 'threepar', 'ftr', 'ft_fga']
Saved: C:\Users\jeff\Documents\nba-win-predictor\data\combined_team_stats.csv


Unnamed: 0,team,season,wins,losses,pace,ortg,drtg,net_rating,srs,threepar,ftr,ft_fga
357,Toronto Raptors,2025,30.0,52.0,99.7,110.5,114.8,-4.3,-4.4,0.373,0.232,0.173
358,Utah Jazz,2025,17.0,65.0,100.0,111.2,120.4,-9.2,-8.51,0.449,0.252,0.195
359,Washington Wizards,2025,18.0,64.0,100.9,106.8,119.1,-12.3,-12.14,0.435,0.229,0.178
