In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

team_data = pd.read_csv('../data/team_data.csv')
game_data = pd.read_csv('../data/game_data.csv')

In [79]:
team_data.head()

Unnamed: 0,TEAM_ID,TEAM_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,...,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK
0,1610612737,Atlanta Hawks,36,18,18,0.5,1738.0,1537,3314,0.464,...,4,3,26,2,10,23,13,3,8,21
1,1610612738,Boston Celtics,36,26,10,0.722,1743.0,1504,3270,0.46,...,7,15,2,20,6,2,3,14,5,3
2,1610612751,Brooklyn Nets,36,13,23,0.361,1738.0,1372,3049,0.45,...,29,16,22,28,29,26,28,7,20,24
3,1610612766,Charlotte Hornets,34,7,27,0.206,1647.0,1299,3050,0.426,...,15,28,20,22,20,21,22,21,29,25
4,1610612741,Chicago Bulls,36,17,19,0.472,1733.0,1554,3325,0.467,...,6,4,19,20,21,17,12,30,7,23


In [80]:
game_data.head()

Unnamed: 0,GAME_ID,TEAM_ID_HOME,PTS_HOME,REB_HOME,AST_HOME,STL_HOME,BLK_HOME,TOV_HOME,TEAM_ID_AWAY,PTS_AWAY,REB_AWAY,AST_AWAY,STL_AWAY,BLK_AWAY,TOV_AWAY,HOME_WIN
0,22400062,1610612747,110,46,22,7,8,7,1610612750,103,47,17,4,1,16,1
1,22400061,1610612738,132,40,33,6,3,4,1610612752,109,34,20,2,3,12,1
2,22400065,1610612748,97,41,24,5,8,11,1610612753,116,57,28,8,8,14,0
3,22400072,1610612757,104,42,21,10,4,18,1610612744,140,57,38,13,5,18,0
4,22400071,1610612746,113,51,27,9,1,22,1610612756,116,42,25,12,9,22,0


In [81]:
# check for duplicates in game_data
print("Duplicates in game_data:", game_data.duplicated().sum())

# check for duplicates in team_data
print("Duplicates in team_data:", team_data.duplicated().sum())

# drop duplicates if any exist
game_data = game_data.drop_duplicates()
team_data = team_data.drop_duplicates()

# check for duplicate columns after merging
merged_game_data = game_data.merge(
    team_data, left_on='TEAM_ID_HOME', right_on='TEAM_ID', suffixes=('', '_HOME')
)
merged_game_data = merged_game_data.merge(
    team_data, left_on='TEAM_ID_AWAY', right_on='TEAM_ID', suffixes=('', '_AWAY')
)

# ensure no duplicate column names
print("Duplicate columns after merging:", merged_game_data.columns.duplicated().sum())

# if duplicate columns exist, drop them
if merged_game_data.columns.duplicated().sum() > 0:
    merged_game_data = merged_game_data.loc[:, ~merged_game_data.columns.duplicated()]


Duplicates in game_data: 0
Duplicates in team_data: 0
Duplicate columns after merging: 7


In [82]:
# remove duplicate rows from datasets
game_data = game_data.drop_duplicates()
team_data = team_data.drop_duplicates()

# merge team stats with game data
game_data = game_data.merge(team_data, left_on='TEAM_ID_HOME', right_on='TEAM_ID', suffixes=('', '_HOME'))
game_data = game_data.merge(team_data, left_on='TEAM_ID_AWAY', right_on='TEAM_ID', suffixes=('', '_AWAY'))

# remove duplicate columns
if game_data.columns.duplicated().sum() > 0:
    game_data = game_data.loc[:, ~game_data.columns.duplicated()]

# stat differences
game_data['PTS_diff'] = game_data['PTS_HOME'] - game_data['PTS_AWAY']
game_data['REB_diff'] = game_data['REB_HOME'] - game_data['REB_AWAY']
game_data['AST_diff'] = game_data['AST_HOME'] - game_data['AST_AWAY']
game_data['STL_diff'] = game_data['STL_HOME'] - game_data['STL_AWAY']
game_data['BLK_diff'] = game_data['BLK_HOME'] - game_data['BLK_AWAY']
game_data['TOV_diff'] = game_data['TOV_HOME'] - game_data['TOV_AWAY']

# prepare features
X = game_data[['PTS_diff', 'REB_diff', 'AST_diff', 'STL_diff', 'BLK_diff', 'TOV_diff']]
y = game_data['HOME_WIN']

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       1.00      1.00      1.00        56

    accuracy                           1.00       107
   macro avg       1.00      1.00      1.00       107
weighted avg       1.00      1.00      1.00       107



In [83]:
# function to predict winner between two teams
def predict_winner(team1, team2, team_stats):
    """
    predict the winner between two teams using the pre-trained model.
    
    :param team1: name of the first team (str)
    :param team2: name of the second team (str)
    :param team_stats: DataFrame containing stats for all teams
    
    :return: predicted winner (str)
    """
    # make sure team exists in dataset
    if team1 not in team_stats['TEAM_NAME'].values:
        raise ValueError(f"Team {team1} not found in the dataset.")
    if team2 not in team_stats['TEAM_NAME'].values:
        raise ValueError(f"Team {team2} not found in the dataset.")
    
    # stats for both teams
    team1_stats = team_stats.loc[team_stats['TEAM_NAME'] == team1].iloc[0]
    team2_stats = team_stats.loc[team_stats['TEAM_NAME'] == team2].iloc[0]
    
    # stat differences
    input_data = pd.DataFrame([{
        'PTS_diff': team1_stats['PTS'] - team2_stats['PTS'],
        'REB_diff': team1_stats['REB'] - team2_stats['REB'],
        'AST_diff': team1_stats['AST'] - team2_stats['AST'],
        'STL_diff': team1_stats['STL'] - team2_stats['STL'],
        'BLK_diff': team1_stats['BLK'] - team2_stats['BLK'],
        'TOV_diff': team1_stats['TOV'] - team2_stats['TOV']
    }])

    # predict outcome
    prediction = model.predict(input_data)[0]
    return team1 if prediction == 1 else team2

team1 = "Boston Celtics"
team2 = "Washington Wizards"
try:
    winner = predict_winner(team1, team2, team_data)
    print(f"the predicted winner is: {winner}")
except ValueError as e:
    print(e)

the predicted winner is: Boston Celtics


In [84]:
# First-round matches with full team names
first_round_matches = [
    ("Oklahoma City Thunder", "Dallas Mavericks"),  # Match 1
    ("Houston Rockets", "Golden State Warriors"),   # Match 2
    ("Milwaukee Bucks", "Orlando Magic"),           # Match 3
    ("New York Knicks", "Atlanta Hawks")            # Match 4
]

# Recursive function to simulate the bracket
def simulate_bracket(matches, team_stats, model, round_name="Quarterfinals"):
    """
    Simulate the entire bracket recursively and print results for each round.

    :param matches: List of tuples, where each tuple is a matchup (team1, team2).
    :param team_stats: DataFrame containing stats for all teams.
    :param model: Trained machine learning model.
    :param round_name: Name of the current round (str).
    :return: Winner of the tournament.
    """
    print(f"\n--- {round_name} ---")
    next_round = []
    
    # Predict each match and print the results
    for team1, team2 in matches:
        winner = predict_winner(team1, team2, team_stats)
        print(f"{team1} vs. {team2} -> {winner} wins")
        next_round.append(winner)
    
    # If this is the final round, return the champion
    if len(next_round) == 1:
        return next_round[0]
    
    # Generate next round matches by pairing winners
    next_round_matches = [(next_round[i], next_round[i+1]) for i in range(0, len(next_round), 2)]
    
    # Determine the next round's name
    next_round_name = {
        4: "Semifinals",
        2: "Final",
    }.get(len(next_round), f"Round of {len(next_round)}")
    
    # Recursively simulate the next round
    return simulate_bracket(next_round_matches, team_stats, model, next_round_name)

# Predict the NBA Cup winner and output each round
champion = simulate_bracket(first_round_matches, team_data, model)
print(f"\nThe predicted NBA Cup Champion is: {champion}")


--- Quarterfinals ---
Oklahoma City Thunder vs. Dallas Mavericks -> Dallas Mavericks wins
Houston Rockets vs. Golden State Warriors -> Houston Rockets wins
Milwaukee Bucks vs. Orlando Magic -> Orlando Magic wins
New York Knicks vs. Atlanta Hawks -> New York Knicks wins

--- Semifinals ---
Dallas Mavericks vs. Houston Rockets -> Dallas Mavericks wins
Orlando Magic vs. New York Knicks -> New York Knicks wins

--- Final ---
Dallas Mavericks vs. New York Knicks -> New York Knicks wins

The predicted NBA Cup Champion is: New York Knicks


In [85]:
team1 = "Houston Rockets"
team2 = "Oklahoma City Thunder"
try:
    winner = predict_winner(team1, team2, team_data)
    print(f"the predicted winner is: {winner}")
except ValueError as e:
    print(e)

the predicted winner is: Oklahoma City Thunder
