In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# Initialize variables
cutoff_year = 2000

test_data_cutoff_year = 2023
prediction_data_cutoff_year = test_data_cutoff_year + 1

# DATA CLEANUP

In [3]:
# Read in CSV file 
team_summaries = pd.read_csv("Resources\Team Summaries.csv")
team_summaries = team_summaries[['season','abbreviation','playoffs','w','l']]
team_summaries.rename(columns={'abbreviation':'tm'}, inplace=True)
team_summaries['playoffs'] = team_summaries['playoffs'].astype(int)
# team_summaries

In [4]:
# Read in CSV file
all_nba_df = pd.read_csv("Resources\End of Season Teams.csv")
# all_nba_df.head()

In [5]:
# Filter All-NBA players after year 2010. Keep only columns: all_nba, seas_id
modern_all_nba = all_nba_df[(all_nba_df['season'] >= cutoff_year) & (all_nba_df['type'] == 'All-NBA')][['type','seas_id']]

# print(modern_all_nba.info())
# modern_all_nba.head()

In [6]:
# Read in CSV file
player_totals_df = pd.read_csv("Resources\Player Totals.csv")

# Filter data after cutoff_year
player_totals_df =  player_totals_df[player_totals_df['season'] >= cutoff_year]

# player_totals_df.head()

In [7]:
# Find traded players current team
traded_players = player_totals_df.sort_values(by='seas_id', ascending=False).drop_duplicates(['player','season'])[['seas_id','season','player','tm']]

player_totals = pd.merge(player_totals_df, traded_players, on=['player','season'], how='left')

player_totals.rename(columns={'seas_id_x':'seas_id','tm_y':'tm'}, inplace=True)
player_totals.drop('seas_id_y', axis=1, inplace=True)

player_totals = pd.merge(player_totals, team_summaries, on=['season','tm'], how='left')

player_totals = player_totals.sort_values(by='seas_id').drop_duplicates(['season','player'])
#player_totals

In [8]:
# Filter data after 2010
modern_player_totals = player_totals.copy()

# Categorize positions into 3 categories
modern_player_totals['forward'] = modern_player_totals['pos'].str.contains('f', case=False).astype(int)
modern_player_totals['guard'] = modern_player_totals['pos'].str.contains('g', case=False).astype(int)
modern_player_totals['center'] = modern_player_totals['pos'].str.contains('c', case=False).astype(int)

# Drop unnecessary columns
modern_player_totals.drop(['birth_year','player_id','player','lg','tm_x','pos','fg_percent','x3p_percent','x2p_percent','ft_percent','e_fg_percent','tm'], axis=1, inplace=True)


# print(modern_player_totals.info())
# modern_player_totals.head()

In [9]:
# Merge ALL NBA members with players
merged_nba = pd.merge(modern_all_nba, modern_player_totals, on='seas_id', how='right')
merged_nba['all_nba'] = merged_nba['type'].notnull().astype(int)

merged_nba.drop('type', axis=1, inplace=True)

# print(merged_nba.info())
# merged_nba.head()



In [10]:
print(merged_nba.columns)  # Verify column names in merged_nba
new_merged_df = pd.read_csv("Resources/2023_2024.csv", encoding='latin1')
new_merged_nba = pd.merge(merged_nba, new_merged_df, left_index=True, right_index=True)
new_merged_nba['all_nba'] = merged_nba.any(axis=1).astype(int)

# Check if 'type' exists in merged_nba before attempting to drop
if 'type' in merged_nba.columns:
    new_merged_nba.drop('type', axis=1, inplace=True)

Index(['seas_id', 'season', 'age', 'experience', 'g', 'gs', 'mp', 'fg', 'fga',
       'x3p', 'x3pa', 'x2p', 'x2pa', 'ft', 'fta', 'orb', 'drb', 'trb', 'ast',
       'stl', 'blk', 'tov', 'pf', 'pts', 'playoffs', 'w', 'l', 'forward',
       'guard', 'center', 'all_nba'],
      dtype='object')


# MACHINE LEARNING

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [12]:
test_data = new_merged_nba[new_merged_nba['season'] <= test_data_cutoff_year]

# Split the data into X_train, X_test, y_train, y_test
X = test_data.drop("all_nba", axis=1)
y = test_data["all_nba"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scale the features using a standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


ValueError: could not convert string to float: 'Trae Young'

In [None]:
# Train a Logistic Regression model and print the model score
classifier = LogisticRegression(max_iter=10000)

classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")
print('-------------------------------------')

In [None]:
# Make predictions on new data
prediction_data = merged_nba[merged_nba['season'] == prediction_data_cutoff_year].drop('all_nba', axis=1)
prediction_data_scaled = scaler.transform(prediction_data)
proba = classifier.predict_proba(prediction_data_scaled)

# Add the All-NBA probability to the prediction_data dataframe as percentage
prediction_data['All-NBA Probability'] = np.around(proba[:, 1] * 100 , 2)
prediction_data.sort_values(by='All-NBA Probability', ascending=False, inplace=True)

In [None]:
# Merge in player names with correct Team name.  Correcting erroneous players with TOT as team name
player_info = player_totals.sort_values(by='seas_id', ascending=False)[['seas_id','player','tm']]

predicted_players = pd.merge(player_info, prediction_data, on='seas_id', how='right')

In [None]:
# Separate players by positions
top_guards = predicted_players[predicted_players['guard'] == 1].head(20)
top_forwards = predicted_players[predicted_players['forward'] == 1].head(10)
top_centers = predicted_players[predicted_players['center'] == 1].head(10)

# OUTPUT

In [None]:
print("Top Guards")
print(top_guards.head(10).to_markdown())
print('-----------------------------------------------------------------------------------------------------')
print("Top Forwards")
print(top_forwards.head(10).to_markdown())
print('-----------------------------------------------------------------------------------------------------')
print("Top Centers")
print(top_centers.head(6).to_markdown())

In [None]:
# predicted_players.to_csv("Resources/2023 All-NBA Predictions.csv")