In [29]:
#all the imports
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [30]:
#loading the csv file from my directory
df = pd.read_csv("/Users/hamimc/Documents/Datasets/NBA-Champions-ONLY/nba_championships_since46.csv")

#had to clean column names bc i had the extra whitespace which was causing issues
df.columns = df.columns.str.strip()

#printing out and visualizing
print("Raw columns:", df.columns)
print("Data sample:")
print(df.head())

Raw columns: Index(['id', 'matchup', 'year', 'winner', 'runnerup', 'finalscore',
       'easternConfRunnerUp', 'westernConfRunnerUp'],
      dtype='object')
Data sample:
   id                                           matchup     year  \
0   0       2023/24 — Boston Celtics v Dallas Mavericks  2023/24   
1   1             2022/23 — Denver Nuggets v Miami Heat  2022/23   
2   2  2021/22 — Golden State Warriors v Boston Celtics  2021/22   
3   3          2020/21 — Milwaukee Bucks v Phoenix Suns  2020/21   
4   4         2019/20 — Los Angeles Lakers v Miami Heat  2019/20   

                  winner          runnerup finalscore easternConfRunnerUp  \
0         Boston Celtics  Dallas Mavericks        4/1      Indiana Pacers   
1         Denver Nuggets        Miami Heat        4/1      Boston Celtics   
2  Golden State Warriors    Boston Celtics        4/2          Miami Heat   
3        Milwaukee Bucks      Phoenix Suns        4/2       Atlanta Hawks   
4     Los Angeles Lakers        Miam

In [31]:
#the years on this csv file are written weirdly (2023/24) and it would be hard to train a model like that, so I made a function that would
#convert the year into the season_end. For example the 2023/24 season would just be 2024 since the season ended in June 2024. 
def fix_year(x):
    last = int(str(x).split('/')[-1])
    return 2000 + last if last < 50 else 1900 + last

df['season_end'] = df['year'].apply(fix_year)

#better to keep 1990+ years
df = df[df['season_end'] >= 1990].reset_index(drop=True)
print(df[['year','season_end']].head(10))

      year  season_end
0  2023/24        2024
1  2022/23        2023
2  2021/22        2022
3  2020/21        2021
4  2019/20        2020
5  2018/19        2019
6  2017/18        2018
7  2016/17        2017
8  2015/16        2016
9  2014/15        2015


In [32]:
#at first I did not have this, but when I ran the ML model first, it had the Syracuse Nationals and the New Jersey Nets winning during some
#years and that didn't make sense because some of these teams a) don't exist anymore or b) renamed. So I had to remove them and not include
#them in the ML model. 
outdated_teams = [
    "St.Louis Hawks", "Washington Bullets", "Syracuse Nationals", "Indianapolis Olympians",
    "Anderson Packers", "Rochester Royals", "Chicago Stags", "St.Louis Bombers",
    "Ft.Wayne Pistons", "Baltimore Bullets", "Minneapolis Lakers", "San Francisco Warriors",
    "Seattle Supersonics", "Kansas City Kings", "New Jersey Nets"
]

mask = ~df[['winner','runnerup','easternConfRunnerUp','westernConfRunnerUp']].isin(outdated_teams).any(axis=1)
df = df[mask].reset_index(drop=True)

print("Filtered data (modern teams only):")
print(df.head())

Filtered data (modern teams only):
   id                                           matchup     year  \
0   0       2023/24 — Boston Celtics v Dallas Mavericks  2023/24   
1   1             2022/23 — Denver Nuggets v Miami Heat  2022/23   
2   2  2021/22 — Golden State Warriors v Boston Celtics  2021/22   
3   3          2020/21 — Milwaukee Bucks v Phoenix Suns  2020/21   
4   4         2019/20 — Los Angeles Lakers v Miami Heat  2019/20   

                  winner          runnerup finalscore easternConfRunnerUp  \
0         Boston Celtics  Dallas Mavericks        4/1      Indiana Pacers   
1         Denver Nuggets        Miami Heat        4/1      Boston Celtics   
2  Golden State Warriors    Boston Celtics        4/2          Miami Heat   
3        Milwaukee Bucks      Phoenix Suns        4/2       Atlanta Hawks   
4     Los Angeles Lakers        Miami Heat        4/2      Boston Celtics   

      westernConfRunnerUp  season_end  
0  Minnesota Timberwolves        2024  
1      Los An

In [33]:
#the csv file that i found on Kaggle and edited is too bland to actually make solid predictions, there aren't enough csv files with other
#pieces of information such as player ratings, impact, injuries, etc. this is just a csv file of past championship matchups + eastern and 
#western conference runner ups. so these are just some features I added to help with the prediction model. 
championships = defaultdict(int)
finals_appearances = defaultdict(int)
conference_appearances = defaultdict(int)

records = []

for _, row in df.iterrows():
    year = row['season_end']
    winner = row['winner']
    runnerup = row['runnerup']
    east_ru = row['easternConfRunnerUp']
    west_ru = row['westernConfRunnerUp']
    
    for team in [winner, runnerup, east_ru, west_ru]:
        records.append({
            'year': year,
            'team': team,
            'past_championships': championships[team],
            'past_finals': finals_appearances[team],
            'past_conference_finals': conference_appearances[team],
            'champion': 1 if team == winner else 0
        })
    
    #update counters for champs and appearances
    championships[winner] += 1
    finals_appearances[winner] += 1
    finals_appearances[runnerup] += 1
    conference_appearances[winner] += 1
    conference_appearances[runnerup] += 1
    conference_appearances[east_ru] += 1
    conference_appearances[west_ru] += 1

features_df = pd.DataFrame(records)

print("Feature sample:")
print(features_df.head(10))

Feature sample:
   year                    team  past_championships  past_finals  \
0  2024          Boston Celtics                   0            0   
1  2024        Dallas Mavericks                   0            0   
2  2024          Indiana Pacers                   0            0   
3  2024  Minnesota Timberwolves                   0            0   
4  2023          Denver Nuggets                   0            0   
5  2023              Miami Heat                   0            0   
6  2023          Boston Celtics                   1            1   
7  2023      Los Angeles Lakers                   0            0   
8  2022   Golden State Warriors                   0            0   
9  2022          Boston Celtics                   1            1   

   past_conference_finals  champion  
0                       0         1  
1                       0         0  
2                       0         0  
3                       0         0  
4                       0         1  
5      

In [34]:
X = features_df[['past_championships','past_finals','past_conference_finals']]
y = features_df['champion']

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())

X shape: (124, 3)
y distribution:
 champion
0    93
1    31
Name: count, dtype: int64


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy on holdout set:", accuracy_score(y_test, y_pred))

Accuracy on holdout set: 0.68


In [36]:
#to fix the model and make it more realistic (because when I ran this the second time, it said the Los Angeles Lakers would win every
#single championship from 2027-2050). I needed to make a weighting factor, in this case the years. I chose 10. 
RECENCY_YEARS = 10

latest_year = features_df['year'].max()
teams = [t for t in features_df['team'].unique() if t not in outdated_teams]

team_champ_years = defaultdict(list)
for _, row in df.iterrows():
    team_champ_years[row['winner']].append(row['season_end'])

future_predictions = []

for i in range(2, 27):
    year = latest_year + i
    season_data = []
    
    for team in teams:
        past_years = team_champ_years[team]
        recency_score = sum([max(0, 1 - (year - y)/RECENCY_YEARS) for y in past_years])
        
        season_data.append({
            'year': year,
            'team': team,
            'past_championships': recency_score,
            'past_finals': finals_appearances[team],
            'past_conference_finals': conference_appearances[team]
        })
    
    season_df = pd.DataFrame(season_data)
    X_future = season_df[['past_championships','past_finals','past_conference_finals']]
    probs = model.predict_proba(X_future)[:,1]
    
    winner = np.random.choice(season_df['team'], p=probs/probs.sum())
    future_predictions.append((year, winner))
    
    championships[winner] += 1
    finals_appearances[winner] += 1
    conference_appearances[winner] += 1
    team_champ_years[winner].append(year)

print("🏆 Predicted NBA Champions from 2026-2050:")
for year, champ in future_predictions:
    print(year, "->", champ)

🏆 Predicted NBA Champions from 2026-2050:
2026 -> Miami Heat
2027 -> Memphis Grizzlies
2028 -> Los Angeles Lakers
2029 -> Orlando Magic
2030 -> Cleveland Cavaliers
2031 -> Oklahoma City Thunder
2032 -> Detroit Pistons
2033 -> Cleveland Cavaliers
2034 -> Utah Jazz
2035 -> Portland Trailblazers
2036 -> Boston Celtics
2037 -> Golden State Warriors
2038 -> Memphis Grizzlies
2039 -> Portland Trailblazers
2040 -> Phoenix Suns
2041 -> Dallas Mavericks
2042 -> Orlando Magic
2043 -> San Antonio Spurs
2044 -> Detroit Pistons
2045 -> Toronto Raptors
2046 -> Detroit Pistons
2047 -> Utah Jazz
2048 -> Indiana Pacers
2049 -> Boston Celtics
2050 -> Houston Rockets
