In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib


In [55]:
import requests
from bs4 import BeautifulSoup
from io import StringIO

website_url = "https://www.basketball-reference.com/leagues/NBA_2025.html"
season = "2024-2025"

def get_teams_table(team1_link,team2_link):
    #getting the data from the website
    team1_data = requests.get(team1_link)
    team2_data = requests.get(team2_link)


    #getting the tables
    team_1_table = pd.read_html(StringIO(team1_data.text), match ="2024-25 Regular Season",header=1)[0]
    team_2_table = pd.read_html(StringIO(team2_data.text), match ="2024-25 Regular Season",header=1)[0]

    team_1_table["Won"] = (team_1_table["Rslt"] == "W").astype(int)
    team_2_table["Won"] = (team_2_table["Rslt"] == "W").astype(int)

    
    team_1_table = clean_data(team_1_table)
    team_2_table = clean_data(team_2_table)


    return team_1_table, team_2_table

def convert_to_int(team1_table):
    columns = team1_table.columns.tolist()[9:51]
    for i in columns: 
        try:
            team1_table[i] = team1_table[i].to_numpy().astype(int)
        except:
            team1_table[i] = team1_table[i].to_numpy().astype(float)

    return team1_table
    
def clean_data(team_table):
    rows_to_drop = []

    Rslt = team_table["Rslt"].to_numpy()

    for i in range(len(Rslt)):
        if Rslt[i] == "Rslt" or "-" in Rslt[i]:
            rows_to_drop.append(i)

    table = team_table.drop(rows_to_drop, inplace=False)

    columns = table.columns.tolist()
    pf = False
    for i in range(len(columns)):
        if columns[i] == "Won":
            pf = False
        if pf:
            columns[i] = "OPP-" + columns[i]
        if columns[i] == "PF":
            pf = True

    for i in range(len(columns)):
        if ".1" in columns[i]:
            columns[i] = columns[i].replace(".1", "")
    table.columns = columns



    
    return table

def get_rolling_averages(team_table):
    all_columns = team_table.columns.to_list()
    short_rolling = all_columns[10:53]
    for i in short_rolling:
        team_table[f"avg-{i}"] = team_table[i].rolling(window=7).mean()
    return team_table

def opp_rolling_averages(team_table,opp_table):
    columns_to_add = opp_table.columns.to_list()[52:94]
    for i in columns_to_add:
        team_table[f"opp-{i}"] = opp_table[i]
    return team_table
    


def get_season_averages(team_table):
    all_columns = team_table.columns.to_list()
    short_rolling = all_columns[10:52]


    for i in short_rolling:
    
        team_table[f"season-avg{i}"] = team_table[i].mean()
    return team_table

def clean_csv(precsv):
    # Get the indices that actually exist in the DataFrame
    existing_indices = precsv.index.intersection(range(7))
    
    # Only drop indices that exist
    if len(existing_indices) > 0:
        precsv = precsv.drop(existing_indices, inplace=False)
    
    return precsv
    
def get_necessary_data(team1, team2):
    global website_url
    #getting website html and data
    data = requests.get(website_url)
    soup = BeautifulSoup(data.text)

    #getting the caption of the table I want
    caption = soup.find("caption", string="Per Game Stats Table")

    #getting the table that the caption is in
    teams = caption.find_parent("table")

    #getting all links
    links = teams.find_all("a")

    #going through all links and finding hte href value for each teams data we need
    for i in links:
        if team1 in str(i.text):
            team1_link = i["href"]
        if team2 in str(i.text):
            team2_link = i["href"]

    #returning -1 if there is no team found
    if team1_link == None or team2_link == None:
        return -1
    
    
    #converting the link to the full link and the game log link
    team1_link = f"https://www.basketball-reference.com{team1_link}"
    team1_link = team1_link.replace(".html", "/gamelog/")

    team2_link = f"https://www.basketball-reference.com{team2_link}"
    team2_link = team2_link.replace(".html", "/gamelog/")


    #getting the data for each team
    team1_table, team2_table = get_teams_table(team1_link, team2_link)

    #converting the data to integers
    team1_table = convert_to_int(team1_table)
    team2_table = convert_to_int(team2_table)
    
    #getting the rolling averages for each team
    team1_table = get_rolling_averages(team1_table)
    team2_table = get_rolling_averages(team2_table)


    team1_table = opp_rolling_averages(team1_table, team2_table)
    

    team1_table = get_season_averages(team1_table)

    team_win_rates = team1_table['Won'].rolling(window=10).mean().reset_index(0, drop=True).fillna(0.5)
    
    
    team1_table = clean_csv(team1_table)

    team1_table["Date"] = pd.to_datetime(team1_table["Date"])

    team1_table['DayOfWeek'] = team1_table['Date'].dt.dayofweek
    team1_table['Month'] = team1_table['Date'].dt.month
    team1_table['Season_Progress'] = team1_table['Date'].dt.dayofyear % 365 / 365

    team1_table['Team_Win_Rate_Last10'] = team_win_rates

    all_columns = team1_table.columns.to_list()
    short_rolling = all_columns[9:52]
    for i in short_rolling:
        team1_table[f"short-rolling-{i}"] = team1_table[i].rolling(window=3).mean()
    team1_table[f"short-rolling-{i}"] = team1_table[f"short-rolling-{i}"].iloc[-1]


    team1_table["Home"] = 1


    team_name = team1

  

    return team1_table,team_name



team1_table,team_name = get_necessary_data("Golden State Warriors", "Sacramento Kings")

        


  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]
  team_table[f"opp-{i}"] = opp_table[i]


In [272]:
team1_table.to_csv("bug.csv")

In [58]:
improved_results = joblib.load("improved_nba_model_package.pkl")

model = improved_results['model']
optimal_threshold = improved_results['optimal_threshold']
selected_features = improved_results['selected_features']
best_params = improved_results['best_params']

team1_table = team1_table[selected_features]


In [59]:
team1_table = team1_table[selected_features]
predictors = team1_table.iloc[-1]
if len(np.array(predictors).shape) == 1:
    # Reshape 1D array to 2D (one sample, multiple features)
    predictors = np.array(predictors).reshape(1, -1)
elif isinstance(predictors, list) and not isinstance(predictors[0], (list, np.ndarray)):
    # Handle case where predictors is a simple list
    predictors = [predictors]

feature_names = model.feature_names_in_
predictors = pd.DataFrame(predictors, columns=feature_names)
predictors.to_csv("bug.csv")

# Get probability predictions
y_prob = model.predict_proba(predictors)

# Apply optimal threshold
y_pred = (y_prob >= optimal_threshold).astype(int)

if (y_pred[0][0] ==1):
    print(f"{team_name} will lose")
else:
    print(f"{team_name} will win")

Golden State Warriors will win


In [26]:
predictors

Unnamed: 0,short-rolling-Won,avg-Won,Home,opp-avg-Won,season-avgWon,season-avgeFG%,season-avgOPP-3PA,opp-avg-2P%,season-avgOPP-2P%,opp-avg-OPP-BLK,...,avg-OPP-FT,short-rolling-BLK,season-avgORB,avg-OPP-BLK,short-rolling-OPP-FT%,avg-OPP-DRB,short-rolling-TRB,short-rolling-OPP-ORB,opp-avg-2P,season-avgOPP-ORB
0,0.666667,0.571429,1.0,,0.560606,0.548288,37.439394,,0.531864,,...,25.0,5.666667,11.409091,5.0,0.795333,32.0,53.0,11.0,,10.121212
