In [1]:
from ipywidgets import interact, interact_manual
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('player_stats_1976-2019.csv', encoding = "UTF-8")

Basketball Reference includes multiple rows for the same player's season if that player was traded to another team during the season. The following cell pares down the data to just one row per player per season, only including the rows that span a player's whole season.

In [3]:
idx = []
for i in range(1, len(df)):
    if df.at[i,'Player'] == df.at[(i-1),'Player']:
        idx.append(i)
df = df.drop(idx)

In [4]:
# creating a dataframe of just players that played last season
players_next_season = df.loc[df['year'] == 2019]

The following function takes in a player's name, then returns a dataframe of player seasons from the dataset in which the player was the same age as the input player was last season. This helps us ensure that we're comparing to player's at a similar point in their careers.

In [5]:
def find_peers(player):
    player_row = players_next_season.loc[players_next_season['Player'] == player]
    age = player_row['Age'].item()
    peers = df.loc[df['Age'] == age].reset_index(drop=True)
    return peers

This next function takes in a player, then uses cosine similarity to generate the 300 most similar players in the dataset.

In [6]:
def find_comps(player):
    df = find_peers(player)
    # saving each player season with its index to refer to later
    ids_ = df[['Player', 'year']]
    
    # Removing unwanted columns and creating a cosine similarity matrix
    df = df.drop(['Player', 'Tm', 'year'], axis = 1)
    df = pd.get_dummies(df, columns = ['Pos'])
    cs = cosine_similarity(df)
    
    # Extracting the top 300 similarity scores for the player in question
    comps = []
    idx = ids_.loc[ids_['Player'] == player].index[0]
    scores = pd.Series(cs[idx]).sort_values(ascending = False)
    top_300 = list(scores.iloc[1:301].index)
    
    # Finding the name, year and score for each of the top 300 and adding to list
    for i in top_300:
        if ids_.loc[i][0] != player:
            comps.append((ids_.loc[i][0], ids_.loc[i][1], round(scores.loc[i], 5)))
    return comps

This next function takes each player in the list of comparative players and collects both their box plus minus and their box plus minus from the next season (if they've played a next season, that is. 

Then, I've fit a linear regression model to this data, and used it to predict our player of interest's box plus minus next season. 

In terms of accuracy, there's a lot left to be desired here, but that's why this is a work in progress. I'm sure there are hours of tinkering around the edges in my future in order to find a better method of prediction. First, however, I'm focused on developing a working system.

In [13]:
def predict(player):
    ox, oy = [], []
    dx, dy = [], []
    
    # collecting the BPM for each comp, as well as the BPM for the next year (if possible)
    for comp in find_comps(player):
        try:
            year2 = df.loc[(df['Player'] == comp[0]) & (df['year'] == (comp[1]+1))]
            year1 = df.loc[(df['Player'] == comp[0]) & (df['year'] == comp[1])]
            oy.append(year2['OBPM'].item())
            ox.append(year1['OBPM'].item())
            dy.append(year2['DBPM'].item())
            dx.append(year1['DBPM'].item())
        except:
            pass
    
    # converting from a list to a numpy array
    ox = np.array(ox)
    ox = ox.reshape(-1, 1)
    oy = np.array(oy)
    
    # fitting a regression model and predicting next year's BPMs
    # These predictions are just for the purpose of visualization
    oreg = LinearRegression().fit(ox,oy)
    oy_pred = oreg.predict(ox)
    
    dx = np.array(dx)
    dx = dx.reshape(-1, 1)
    dy = np.array(dy)
    
    # fitting a regression model and predicting next year's BPMs
    # These predictions are just for the purpose of visualization
    dreg = LinearRegression().fit(dx,dy)
    dy_pred = dreg.predict(dx)
    
    # Predicting next year's BPM for the player in question
    previous_row = df.loc[(df['Player'] == 'Derrick Rose') & (df['year'] == 2019)]
    previous_bpms = [previous_row['OBPM'].item(), previous_row['DBPM'].item()]
    minutes = previous_row['MP'].item()
    bpm_preds = [round(oreg.predict(np.array(previous_bpms[0]).reshape(-1,1)).item(),5),
                 round(dreg.predict(np.array(previous_bpms[1]).reshape(-1,1)).item(),5)]
    
    return ox, oy, oy_pred, bpm_preds, minutes

In [42]:
def display(player):
    ox, oy, oy_pred, bpm_preds, minutes = predict(player)
    #war = (bpm_preds * minutes * 2.18) / (48 * 82)
    print(f'We predict that {player} will have a Box Plus Minus of {bpm_preds} next year.')
    print('Top Comparisons:')
    for comp in find_comps(player)[1:6]:
        print(f'Name: {comp[0]}    Year: {comp[1]}    Sim Score: {comp[2]} \n')
        
    plt.scatter(ox, oy)
    plt.plot(ox, oy_pred, color = 'black')
    plt.show()

In [41]:
def create_a_team():
    team = np.random.choice(players_next_season['Player'], 15)
    opms, dpms = [], []
    for p in team:
        ox, oy, oy_pred, bpm_preds, minutes = predict(p)
        opms.append(bpm_preds[0])
        dpms.append(bpm_preds[1])
    team_opm = sum([opm * 0.1417 for opm in opms[:5]]) + sum([opm * 0.0417 for opm in opms[5:10]])
    team_opm += + sum([opm * 0.0167 for opm in opms[11:]])
    team_dpm = sum([dpm * 0.1417 for dpm in dpms[:5]]) + sum([dpm * 0.0417 for dpm in dpms[5:10]])
    team_dpm += + sum([dpm * 0.0167 for dpm in dpms[11:]])

    win_percent = (108 + team_opm)**14 / ((108 + team_opm)**14 + (108 - team_dpm)**14)
    win_percent

    wins = []
    for _ in range(50000):
        record = 0
        for g in range(82):
            toss = np.random.rand()
            if toss <= win_percent:
                record += 1
        wins.append(record)
    win_total = sum(wins) // len(wins)
    
    message = f'Your starting lineup is {team[:5]}.\n Your bench is {team[5:]}'
    print(message)
    print(f'We predict your team will win {win_total} games')

In [43]:
create_a_team()

Your starting lineup is ['Nikola Vučević' 'Guerschon Yabusele' 'Danuel House' 'Evan Fournier'
 'Yuta Watanabe'].
 Your bench is ['Quincy Pondexter' 'Markelle Fultz' 'Luke Kornet' 'Jeremy Lin'
 'Zach Collins' 'Lonnie Walker' 'Marcus Morris' 'Matthew Dellavedova'
 'Rodney Hood' 'Bobby Portis']
We predict your team will win 38 games
