In [1]:
import sys
import random
import pandas as pd
import numpy as np
from numpy.linalg import svd, matrix_rank, norm
from sklearn import linear_model

import matplotlib.pyplot as plt
import pickle

plt.rcParams.update({'font.size': 14})

In [4]:
# data prep

def removeDuplicated(players, stats):
    """
    Attach career metadata to season stats, dropping players with ambiguous names.

    A name that appears on more than one row of `players` cannot be matched
    to a single career, so every season row carrying such a name is discarded.
    The remaining season rows are left-joined with `players` on the player
    name and restricted to seasons within [year_start, year_end].

    players: (dataframe) player table, e.g. read from
             "../data/nba-players-stats/player_data.csv";
             uses the columns name, year_start, year_end.
    stats:   (dataframe) season table, e.g. read from
             "../data/nba-players-stats/Seasons_Stats.csv";
             uses the columns Player, Year.

    returns: (dataframe) retained season rows with the columns of `players`
             merged in, plus "year_count" = Year - year_start.
    """
    # names shared by more than one player (counted once, instead of
    # re-scanning the whole table for every unique name)
    name_counts = players.name.value_counts()
    duplicated = name_counts.index[name_counts > 1]

    start_year = players.rename(columns={"name": "Player"})

    # for non-duplicated players
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the seasons that fall inside the player's career span
    in_career = (
        (stats_not_duplicated.Year >= stats_not_duplicated.year_start)
        & (stats_not_duplicated.Year <= stats_not_duplicated.year_end)
    )
    # .copy() so the new column is written to an independent frame rather
    # than to a filtered slice (avoids SettingWithCopyWarning)
    stats_not_duplicated = stats_not_duplicated[in_career].copy()
    stats_not_duplicated["year_count"] = stats_not_duplicated.Year - stats_not_duplicated.year_start

    return stats_not_duplicated

def slidingWindowDF(df, window, p=0, save=False, metric=None):
    """
    Stack every length-`window` column slice of `df` into one dataframe.

    df = (dataframe) pivoted df with players in rows, year_count in columns.
    window = (int) sliding window size
    p = (float, 0<=p<=1) fraction of NaN's allowed in each row
    save = (bool) if true, pickle the result under ../data/nba-players-stats/
    metric = (string) metric name used in the pickle file name; only needed
             when save is true

    returns: (dataframe) one row per (player, window position) that passes the
             NaN filter, columns relabelled 0..window-1, sorted by index.
             Empty (with those columns) when df has fewer than `window` columns.
    """
    frames = []
    for i in range(df.shape[1] - window + 1):
        df_window = df.iloc[:, i:(i + window)]
        # keep rows whose share of missing values is within the allowance
        df_window = df_window[df_window.isna().sum(axis=1) / window <= p].copy()
        df_window.columns = range(window)
        frames.append(df_window)

    if frames:
        # single concat instead of DataFrame.append inside the loop
        # (append was removed in pandas 2.0 and was quadratic anyway)
        df_final = pd.concat(frames).sort_index()
    else:
        df_final = pd.DataFrame(columns=range(window))

    if save:
        if metric is None:
            # earlier versions read a notebook-level `metric` variable here;
            # keep honouring it so existing calls continue to work
            metric = globals().get("metric")
        if metric is None:
            raise ValueError("metric must be provided when save=True")
        df_final.to_pickle("../data/nba-players-stats/sliding_window_{}_{}.pkl".format(window,metric))
    return df_final

def getDonorTargetDf(stats_not_duplicated, metric, pred_year, target_id):
    """
    Build the donor and target matrices for one target player.

    stats_not_duplicated = (dataframe) stats df; uses the columns Player,
                           Year, player_id, year_count and `metric`
    metric = (string) metric of interest (column name of stats_not_duplicated)
    pred_year = (number) last season to include; rows with a later Year are ignored
    target_id = player_id of the target player

    returns: (donor_pivot, target_pivot), both with players in rows and
             year_count in columns. Donors are the other players that have a
             value at the target's latest year_count; their remaining gaps
             are filled with the player's own row mean.
    raises:  IndexError if the target has no rows up to pred_year,
             ValueError if the target pivot contains NaN.
    """

    # data up to pred_year
    stats_this_year = stats_not_duplicated[stats_not_duplicated.Year <= pred_year]

    # target
    stats_target = stats_this_year[stats_this_year.player_id == target_id]
    if stats_target.empty:
        # same exception type the positional lookup used to raise, with a
        # message that says what is actually wrong
        raise IndexError("no rows for target_id {} up to year {}".format(target_id, pred_year))
    # year_count of the target's last row, looked up by name instead of
    # relying on year_count being the last column
    # (assumes rows are in chronological order -- TODO confirm)
    num_years = stats_target["year_count"].iloc[-1]
    target_pivot = pd.pivot_table(stats_target, values=metric, columns=['year_count'],index=['Player'])
    if target_pivot.isna().to_numpy().any():
        # raising a bare string is itself a TypeError; raise a real exception
        raise ValueError("NaN value in target")

    # donor
    # only seasons up to the target's current year_count, target excluded
    stats_donor = stats_this_year[stats_this_year.year_count <= num_years]
    stats_donor = stats_donor[stats_donor.player_id != target_id]
    donor_pivot = pd.pivot_table(stats_donor, values=metric, columns=['year_count'],index=['Player'])
    # keep only donors that have a value at year_count == num_years
    donor_pivot = donor_pivot[~donor_pivot[num_years].isnull()]
    # fill each donor's missing years with that donor's own mean
    donor_pivot = donor_pivot.T.fillna(donor_pivot.mean(axis=1)).T

    return donor_pivot, target_pivot

def topPlayers(stats, year, metric, n):
    """
    Return the n players with the highest mean `metric` in a given season.

    stats = (dataframe) stats df; uses the columns Player, Year, player_id
            and `metric`
    year = season to rank
    metric = (string) column to rank by
    n = (int) number of players to return

    returns: (dataframe) columns ["Player", "player_id"], best player first.
             A player with several rows in the season is ranked by the mean
             of those rows.
    """
    season = stats[stats.Year == year]
    # numeric_only=True: non-numeric columns cannot be averaged and make
    # GroupBy.mean raise on pandas >= 2.0
    season_means = season.groupby('Player').mean(numeric_only=True).reset_index()
    # every row already belongs to `year`, so no second Year filter is needed
    stats_sorted = season_means.sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player","player_id"]][:n]

# plots
def plotPrediction(df_true, df_pred, metric, name, num_sv = 0):
    """
    Draw prediction (blue) and ground truth (red) over the years for one player.

    A marker is drawn only on the last datapoint, which is the test point;
    everything before it is training data.

    df_true / df_pred = series to plot; df_true.shape[0] fixes the marker position
    metric = (string) y-axis label
    name = (string) player name shown in the title
    num_sv = value shown after "HSVT: " in the title; 0 means "no HSVT"
    """
    hsvt_suffix = "; no HSVT" if num_sv == 0 else "; HSVT: "+ str(num_sv)
    title = "Target Player: "+ name + hsvt_suffix

    # mark only the final (test) point on each curve
    last_point = [df_true.shape[0]-1]
    for series, colour in ((df_pred, "blue"), (df_true, "red")):
        plt.plot(series, colour, marker = 'o', markevery=last_point)

    plt.xlabel("year")
    plt.ylabel(metric)
    plt.legend(["Prediction","Truth"])
    plt.title(title)
    plt.show()
    
def plotPredictionEachYear(df_true, df_pred, metric, name, num_sv):
    """
    Draw prediction (blue) and ground truth (red) with a marker on every year.

    Each dot stands for a separate prediction.

    df_true / df_pred = series to plot
    metric = (string) y-axis label
    name = (string) player name shown in the title
    num_sv = value shown after "HSVT: " in the title; 0 means "no HSVT"
    """
    hsvt_suffix = "; no HSVT" if num_sv == 0 else "; HSVT: "+ str(num_sv)
    title = "Target Player: "+ name + hsvt_suffix

    for series, colour in ((df_pred, "blue"), (df_true, "red")):
        plt.plot(series, colour, marker = 'o')

    plt.xlabel("year")
    plt.ylabel(metric)
    plt.legend(["Prediction","Truth"])
    plt.title(title)
    plt.show()

def plotMape(df_true, df_pred, name, num_sv):
    """
    Plot the per-year MAPE between ground truth and prediction for one player.

    df_true / df_pred = dataframes/series exposing .values; the index of
                        df_true is used as the x-axis
    name = (string) player name shown in the title
    num_sv = value shown after "HSVT: " in the title; 0 means "no HSVT"

    Relies on a `mape(y_true, y_pred)` helper that is not defined in this
    part of the notebook -- presumably it returns one error per entry;
    verify against its definition.
    """
    df_mape = pd.DataFrame(mape(df_true.values, df_pred.values), index = df_true.index)

    title = "Target Player: "+ name
    if num_sv != 0:
        title = title + "; HSVT: "+ str(num_sv)
    else:
        title = title + "; no HSVT"

    plt.plot(df_mape, "gold", marker = 'o')
    plt.xlabel("year")
    plt.ylabel("MAPE")
    # only the MAPE curve is drawn here, so the legend has a single entry
    plt.legend(["MAPE"])
    plt.title(title)
    plt.show()

def plotDonorSize(df_numdonor, name, num_sv):
    """
    Plot the size of the donor pool per year as a dashed grey line.

    df_numdonor = series of donor-pool sizes to plot
    name = (string) player name shown in the title
    num_sv = value shown after "HSVT: " in the title; 0 means "no HSVT"
    """
    # The MAPE computation that used to sit here was a copy-paste leftover:
    # it read df_true/df_pred, which are not parameters of this function,
    # and its result was never used.
    title = "Target Player: "+ name
    if num_sv != 0:
        title = title + "; HSVT: "+ str(num_sv)
    else:
        title = title + "; no HSVT"

    plt.plot(df_numdonor, "grey", linestyle = "dashed", marker = 'o')
    plt.xlabel("year")
    plt.ylabel("size")
    plt.legend(["Doner Pool Size"])
    plt.title(title)
    plt.show()

# Data Prep

In [3]:
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
# only keep players whose first season is 1980 or later; .copy() gives an
# independent frame so the id column below is not written to a slice
# (avoids SettingWithCopyWarning)
players = players[players.year_start >= 1980].copy()
players["player_id"] = range(len(players)) # assign id: 0..n-1 in row order

players.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,player_id
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,0
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,1
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,2
5,Shareef Abdur-Rahim,1997,2008,F,6-9,225.0,"December 11, 1976",University of California,3
9,Alex Abrines,2017,2018,G-F,6-6,190.0,"August 1, 1993",,4


In [99]:
stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")

# keep only seasons from 1980 onwards that belong to a player listed in `players`
known_player = stats.Player.isin(players.name)
from_1980 = stats.Year >= 1980
stats = stats[known_player & from_1980]

# drop players whose name is shared by several people
# TODO: find a way to tell apart multiple players with the same name
stats = removeDuplicated(players, stats)
stats.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,PTS,year_start,year_end,position,height,weight,birth_date,college,player_id,year_count
0,5733,1980.0,James Bailey,PF,22.0,SEA,67.0,,726.0,12.3,...,312.0,1980,1988,F-C,6-9,220.0,"May 21, 1957",Rutgers University,119,0.0
1,5753,1980.0,Lawrence Boston,PF,23.0,WSB,13.0,,125.0,12.6,...,56.0,1980,1980,F,6-8,225.0,"May 18, 1956",University of Maryland,252,0.0
2,5759,1980.0,Dudley Bradley,SG,22.0,IND,82.0,,2027.0,13.5,...,688.0,1980,1989,G-F,6-6,195.0,"March 19, 1957",University of North Carolina,267,0.0
3,5784,1980.0,Bill Cartwright,C,22.0,NYK,82.0,,3150.0,17.9,...,1781.0,1980,1995,C,7-1,245.0,"July 30, 1957",University of San Francisco,399,0.0
4,5796,1980.0,Jeff Cook,PF,23.0,PHO,66.0,,904.0,14.4,...,362.0,1980,1988,F-C,6-10,215.0,"October 21, 1956",Idaho State University,481,0.0


In [193]:
# NOTE: the getMetrics*Dict helpers used below are defined in a later cell of
# this notebook; that cell has to be run first.

# raw stat columns, grouped by how they are normalised
metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerCentColNames = ["FG","FT","3P"]
metricsWeightedColNames = ["PER"]

metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

# one lookup table: derived metric name -> dataframe
allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}

# derived metric names, split by side of the ball
offenseMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
defenseMetrics = ["TRB_G","STL_G","BLK_G"]


In [196]:
# Stat columns for which a made/attempted percentage is computed
# (same list that the metrics cell above assigns to this name).
metricsPerCentColNames = ["FG","FT","3P"]

In [200]:
def doThis(stats, metrics):
    """
    Map "<metric>%" to the table returned by getMetricPerCent for each metric.

    stats = (dataframe) stats df, passed through to getMetricPerCent
    metrics = (iterable of strings) stat column names, e.g. "FG"

    returns: (dict) one entry per metric, keyed by the metric name plus "%".
    """
    return {metric+"%": getMetricPerCent(stats, metric) for metric in metrics}

In [201]:
# Build the percentage tables for the columns in metricsPerCentColNames;
# the returned dict is this cell's displayed output.
doThis(stats,metricsPerCentColNames)

{'3P%':                                 3P%
 Player             Year            
 A.C. Green         1986.0  0.166667
                    1987.0  0.000000
                    1988.0  0.000000
                    1989.0  0.235294
                    1990.0  0.282609
                    1991.0  0.200000
                    1992.0  0.214286
                    1993.0  0.347826
                    1994.0  0.228571
                    1995.0  0.338583
                    1996.0  0.269231
                    1997.0  0.050000
                    1998.0  0.000000
                    1999.0  0.000000
                    2000.0  0.250000
                    2001.0  0.000000
 A.J. Bramlett      2000.0  0.000000
 A.J. English       1991.0  0.096774
                    1992.0  0.176471
 A.J. Guyton        2001.0  0.391304
                    2002.0  0.373984
                    2003.0  0.000000
 A.J. Hammons       2017.0  0.500000
 A.J. Price         2010.0  0.344828
                    2011.0  0.2

In [199]:
def getMetricPerGame(stats, metric):
    """
    Per-game value of `metric` for every (Player, Year).

    Rows of the same player and year are summed first, then the summed
    metric is divided by the summed games column "G".

    stats = (dataframe) stats df; uses the columns Year, Player, G and `metric`
    metric = (string) column to normalise

    returns: (dataframe) indexed by (Player, Year) with the single column
             "<metric>_G".
    """
    per_game_col = metric+"_G"
    season_totals = (
        stats.loc[:, ["Year", "Player", "G", metric]]
        .groupby(["Player","Year"])
        .sum()
    )
    season_totals[per_game_col] = season_totals[metric]/season_totals["G"]
    return season_totals[[per_game_col]]

def getMetricsPerGameDict(stats, metrics):
    """
    Map "<metric>_G" to its per-game table for every metric in `metrics`.

    stats = (dataframe) stats df, passed through to getMetricPerGame
    metrics = (iterable of strings) stat column names, e.g. "PTS"

    returns: (dict) one entry per metric; an empty dict for no metrics.
    """
    # The return used to sit inside the loop, so only the first metric was
    # ever computed and an empty `metrics` produced None.
    return {metric+"_G": getMetricPerGame(stats, metric) for metric in metrics}

def getMetricPerCent(stats, metric):
    """
    Success ratio of `metric` for every (Player, Year).

    Rows of the same player and year are summed first; the ratio is the
    summed `metric` column over the summed "<metric>A" (attempts) column,
    and is set to 0 where the summed attempts are 0.

    stats = (dataframe) stats df; uses the columns Year, Player, `metric`
            and "<metric>A"
    metric = (string) made-shots column, e.g. "FG"

    returns: (dataframe) indexed by (Player, Year) with the single column
             "<metric>%".
    """
    attempts_col = metric+"A"
    ratio_col = metric+"%"

    season_totals = (
        stats.loc[:, ["Year", "Player", metric, attempts_col]]
        .groupby(["Player","Year"])
        .sum()
    )
    made = season_totals[metric]
    attempted = season_totals[attempts_col]
    # 0 if no attempt
    season_totals[ratio_col] = (made/attempted).mask(attempted == 0, 0)
    return season_totals[[ratio_col]]

def getMetricsPerCentDict(stats, metrics):
    """
    Map "<metric>%" to its success-ratio table for every metric in `metrics`.

    stats = (dataframe) stats df, passed through to getMetricPerCent
    metrics = (iterable of strings) made-shots column names, e.g. "FG"

    returns: (dict) one entry per metric; an empty dict for no metrics.
    """
    # The return used to sit inside the loop, so only the first metric was
    # ever computed and an empty `metrics` produced None.
    return {metric+"%": getMetricPerCent(stats, metric) for metric in metrics}

def getMetricsWeighted(stats, metric):
    """
    Games-weighted value of `metric` for every (Player, Year).

    Each row contributes metric * G / (sum of G over that player's rows in
    that year); the contributions are then summed per (Player, Year).

    stats = (dataframe) stats df; uses the columns Year, Player, G and `metric`
    metric = (string) column to average, e.g. "PER"

    returns: (dataframe) indexed by (Player, Year) with the single column
             "<metric>_w".
    """
    weighted_col = metric+"_w"
    keys = ["Player","Year"]

    rows = stats.loc[:, ["Year", "Player", "G", metric]]
    # total games per player and season, joined back onto every row
    games_total = stats.loc[:, keys + ["G"]].groupby(keys).sum().reset_index()
    rows = rows.merge(
        games_total, on=["Year","Player"], how="left", suffixes=("_row", "_total")
    )

    rows[weighted_col] = rows[metric]*rows["G_row"]/rows["G_total"]
    return rows.groupby(keys)[[weighted_col]].sum()

def getMetricsWeightedDict(stats, metrics):
    """
    Map "<metric>_w" to its games-weighted table for every metric in `metrics`.

    stats = (dataframe) stats df, passed through to getMetricsWeighted
    metrics = (iterable of strings) stat column names, e.g. "PER"

    returns: (dict) one entry per metric; an empty dict for no metrics.
    """
    # The return used to sit inside the loop, so only the first metric was
    # ever computed and an empty `metrics` produced None.
    return {metric+"_w": getMetricsWeighted(stats, metric) for metric in metrics}
