In [1]:
import io
import os
import tempfile
from glob import glob

import numpy as np
import pandas as pd
import scipy.sparse as sp
import xlearn
from sklearn.datasets import dump_svmlight_file
from tqdm import tqdm
from wurlitzer import pipes, STDOUT

In [2]:
# Sampling grid. `epsilon` is added to the sampling weights and `sample_prct`
# is the fraction of rows drawn for each training sample (see process_season).
epsilons = [0.001, 0.0001]
sample_prcts = [0.7]  # wider grid tried previously: [0.7, 0.75, 0.8]
sample_params = [
    dict(epsilon=eps, sample_prct=prct)
    for eps in epsilons
    for prct in sample_prcts
]

In [3]:
# Factorization-machine hyper-parameter grid; every combination becomes one
# parameter dict handed to xlearn's cv/fit calls in process_season.
lrs = [0.01]     # wider grid tried previously: [0.01, 0.02, 0.05, 0.075, 0.1, 0.15, 0.2]
lmbs = [0.0001]  # wider grid tried previously: [0.00002, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
ks = [3]         # wider grid tried previously: [2, 3, 4]
train_params = [
    {'task': 'reg', 'init': 0.1, 'lr': rate, 'lambda': reg, 'k': rank}
    for rate in lrs
    for reg in lmbs
    for rank in ks
]

In [6]:
def process_season(season, base_folder = 'data/lineup', random_state = None):
    """Estimate per-player offensive/defensive ratings for one season.

    For every entry of the module-level ``sample_params`` grid a weighted
    sample of lineup rows is drawn, and for every entry of ``train_params`` a
    factorization machine is cross-validated, fitted and then queried with an
    identity matrix (one row per latent offense/defense id) to obtain a
    prediction per player role. Predictions are z-scored per run and averaged
    per player across runs, weighted by the inverse cross-validation MSE.

    Parameters
    ----------
    season : str
        Season folder name, e.g. ``'1997-98'``.
    base_folder : str
        Folder that contains one sub-folder of lineup CSVs per season.
    random_state : int or None
        Seed for the row sampling. ``None`` (default) keeps the previous
        behaviour of drawing from numpy's global random state. With an int,
        one generator is created and shared by all draws so the whole run is
        reproducible.

    Returns
    -------
    pandas.DataFrame
        Player info from ``_get_player_info`` inner-joined on ``person_id``
        with the columns ``time``, ``off_norm`` and ``def_norm``.

    Raises
    ------
    ValueError
        If the cross-validation output does not contain the expected
        ``'Average mse_loss:'`` line.
    """
    print(season)

    # load all season lineups
    mat, target, player_df = _create_sparse_data(season, base_folder)
    n_latent = len(player_df) * 2

    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        # None -> pandas uses numpy's global state; anything else is passed through.
        rng = random_state

    all_preds = []

    # A private scratch directory avoids clobbering (or deleting) unrelated
    # /tmp/nba* files and is removed even when training raises.
    with tempfile.TemporaryDirectory(prefix='nba_') as work_dir:

        # Identity design matrix: row i activates only latent feature i, so
        # prediction i is the model output for that single player role.
        # It does not depend on the sample, so it is written once.
        predict_file = os.path.join(work_dir, 'nba_predict.txt')
        dump_svmlight_file(sp.eye(n_latent), [1] * n_latent, predict_file)

        train_file = os.path.join(work_dir, 'nba_train.txt')

        # sample data
        for sample_param in tqdm(sample_params):

            sample_prct = sample_param['sample_prct']
            epsilon = sample_param['epsilon']

            # Rows are drawn with probability proportional to target + epsilon;
            # epsilon keeps zero-valued rows selectable.
            rand_idx = target.sample(int(sample_prct * len(target)),
                                     replace=False,
                                     weights=target + epsilon,
                                     random_state=rng).index.values

            dump_svmlight_file(mat[rand_idx], target[rand_idx], train_file)

            for train_param in train_params:

                cv_mse, pred_values = _cv_fit_predict(train_file, predict_file,
                                                      work_dir, train_param)

                # Line i of the prediction file corresponds to latent id i.
                pred_df = pd.DataFrame({'predict': pred_values,
                                        'LAT_ID': np.arange(len(pred_values))})

                player_pred = player_df.merge(
                    pred_df.rename(columns={'LAT_ID': 'off_id', 'predict': 'off_pred'}),
                    on='off_id')
                player_pred = player_pred.merge(
                    pred_df.rename(columns={'LAT_ID': 'def_id', 'predict': 'def_pred'}),
                    on='def_id')

                # z-score within the run so runs are comparable before averaging
                player_pred['off_norm'] = ((player_pred.off_pred - player_pred.off_pred.mean())
                                           / player_pred.off_pred.std())
                player_pred['def_norm'] = ((player_pred.def_pred - player_pred.def_pred.mean())
                                           / player_pred.def_pred.std())
                player_pred['cv_mse'] = cv_mse

                all_preds.append(player_pred)

    all_preds = pd.concat(all_preds, ignore_index=True)

    # Combine runs per player; runs with a lower CV error get a larger weight.
    preds_grp = []
    for person_id, g in all_preds.groupby('person_id'):
        run_weights = 1 / g.cv_mse
        off_norm = np.average(g.off_norm, weights=run_weights)
        def_norm = np.average(g.def_norm, weights=run_weights)
        time = g.time.iloc[0]  # identical in every run, it comes from player_df
        preds_grp.append([person_id, time, off_norm, def_norm])
    preds_grp = pd.DataFrame(preds_grp, columns=['person_id', 'time', 'off_norm', 'def_norm'])

    all_players = _get_player_info(season)

    res = all_players.merge(preds_grp, on='person_id')

    return res


def _parse_cv_mse(cv_output):
    """Return the float that follows 'Average mse_loss:' in captured xlearn cv output."""
    marker = 'Average mse_loss:'
    if marker not in cv_output:
        raise ValueError(f'Could not find {marker!r} in the xlearn cv output; '
                         f'last output was:\n{cv_output[-500:]}')
    return float(cv_output.split(marker)[1].split('\n')[0].strip())


def _cv_fit_predict(train_file, predict_file, work_dir, train_param):
    """Cross-validate, fit and predict one FM configuration; return (cv_mse, predictions)."""
    # cross-validation, only used to obtain the averaged MSE printed by xlearn
    fm = xlearn.create_fm()
    fm.setTrain(train_file)
    out = io.StringIO()
    with pipes(stdout=out, stderr=STDOUT):
        fm.cv(train_param)
    cv_mse = _parse_cv_mse(out.getvalue())

    # fit on the full sample
    model_file = os.path.join(work_dir, 'nba_model.out')
    fm = xlearn.create_fm()
    fm.setTrain(train_file)
    fm.setTXTModel(os.path.join(work_dir, 'nba_model.txt'))
    # output is captured only to keep the notebook quiet
    with pipes(stdout=io.StringIO(), stderr=STDOUT):
        fm.fit(train_param, model_file)

    # run prediction
    pred_file = os.path.join(work_dir, 'nba_mod_predict.txt')
    fm = xlearn.create_fm()
    fm.setTest(predict_file)
    with pipes(stdout=io.StringIO(), stderr=STDOUT):
        fm.predict(model_file, pred_file)

    # one prediction per line
    with open(pred_file) as f:
        pred_values = [float(line) for line in f]

    return cv_mse, pred_values
            
def _create_sparse_data(season, base_folder):
    """Build the sparse lineup design matrix and scoring-rate target for a season.

    All CSVs under ``{base_folder}/{season}`` are concatenated, stints are
    aggregated per unique ten-player lineup, and each lineup yields two rows:
    home offense vs. visitor defense (target: home points per unit time) and
    visitor offense vs. home defense (target: visitor points per unit time).

    Parameters
    ----------
    season : str
        Season folder name.
    base_folder : str
        Folder containing the per-season lineup folders.

    Returns
    -------
    mat : scipy.sparse.csr_matrix
        Shape ``(n_rows, 2 * n_players)``; a 1 in each of the five offensive
        and five defensive latent-id columns active in the row.
    target : pandas.Series
        Points per unit time for each row, indexed 0..n_rows-1.
    player_df : pandas.DataFrame
        Output of ``_get_player_df`` (person_id, time, off_id, def_id).

    Raises
    ------
    FileNotFoundError
        If no CSV file is found for the season.
    """
    lineup_files = glob(f'{base_folder}/{season}/*.csv')
    if not lineup_files:
        raise FileNotFoundError(f'No lineup CSV files found in {base_folder}/{season}')

    df = pd.concat([pd.read_csv(f) for f in lineup_files], ignore_index=True)
    df['time'] = df.end - df.start

    # group lineups: total points and time for every distinct ten-player combination
    slots = ['h1', 'h2', 'h3', 'h4', 'h5', 'v1', 'v2', 'v3', 'v4', 'v5']
    data = df.groupby(slots)[['home_points', 'visit_points', 'time']].sum().reset_index()

    # get player ids
    player_df = _get_player_df(data)

    # add sparse id to lineups: every slot gets an offensive and a defensive latent id
    for pos in ('off', 'def'):
        for slot in slots:
            data = _add_sparse_id(data, player_df, slot, pos)

    data['home_per_time'] = data.home_points / data.time
    data['visit_per_time'] = data.visit_points / data.time

    feature_cols = ['off1', 'off2', 'off3', 'off4', 'off5',
                    'def1', 'def2', 'def3', 'def4', 'def5']
    out_cols = feature_cols + ['points_per_time', 'is_home']

    # home team attacking, visitors defending
    data_home = data[['h1_off', 'h2_off', 'h3_off', 'h4_off', 'h5_off',
                      'v1_def', 'v2_def', 'v3_def', 'v4_def', 'v5_def', 'home_per_time']].copy()
    data_home['is_home'] = 1
    data_home.columns = out_cols

    # visitors attacking, home team defending
    data_visit = data[['v1_off', 'v2_off', 'v3_off', 'v4_off', 'v5_off',
                       'h1_def', 'h2_def', 'h3_def', 'h4_def', 'h5_def', 'visit_per_time']].copy()
    data_visit['is_home'] = 0
    data_visit.columns = out_cols

    data = pd.concat([data_home, data_visit], ignore_index=True)

    # A zero-duration lineup gives 0/0 = NaN or points/0 = inf. NaN rows were
    # always dropped; inf must go too, since it is not a usable regression
    # target and pandas rejects inf sampling weights downstream.
    data['points_per_time'] = data['points_per_time'].replace([np.inf, -np.inf], np.nan)
    data = data.dropna().reset_index(drop=True)

    # create sparse matrix
    # NOTE: is_home is carried on the frame but is not a column of the matrix.
    target = data.points_per_time

    # stack() gives one entry per (row, slot); level 0 of its index is the row number
    cols = data[feature_cols].stack()
    rows = cols.index.get_level_values(0)
    z = [1] * len(rows)

    mat = sp.csr_matrix((z, (rows, cols)), shape=(len(data), len(player_df) * 2))

    return mat, target, player_df

def _get_player_df(data):

    player_df = pd.DataFrame()
    player_df['person_id'] = data[['h1', 'h2', 'h3', 'h4', 'h5', 
                                   'v1', 'v2', 'v3', 'v4', 'v5']].unstack().unique()

    player_time = pd.DataFrame()
    player_time['person_id'] =  data[['h1', 'h2', 'h3', 'h4', 'h5', 
                                      'v1', 'v2', 'v3', 'v4', 'v5']].stack().values
    player_time['time'] = np.repeat(data.time.values, 10)
    player_time = player_time.groupby('person_id').time.sum().reset_index()

    player_df = player_df.merge(player_time, on = 'person_id')

    player_df['off_id'] = np.arange(len(player_df))
    player_df['def_id'] = np.arange(len(player_df)) + len(player_df)
    
    return player_df

def _add_sparse_id(data, player_df, ind, pos):
    data = data.merge(player_df[[f'{pos}_id', 'person_id']].rename(columns={'person_id' : ind, 
                                                                            f'{pos}_id' : f'{ind}_{pos}'}), 
                      on=ind, how='left')
    return data

def _get_player_info(season, base_folder = 'data/nba-api/rotation'):
    """Collect the distinct (team, player) rows from a season's rotation CSVs.

    Parameters
    ----------
    season : str
        Season folder name.
    base_folder : str
        Folder containing one sub-folder of rotation CSVs per season. The
        default is the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``team_name``, ``person_id``, ``player_first``,
        ``player_last`` and ``player_name`` (first + ' ' + last). A player
        appears once for every distinct team name found in the files.

    Raises
    ------
    FileNotFoundError
        If no rotation CSV is found for the season.
    """
    rot_files = glob(f'{base_folder}/{season}/*.csv')
    if not rot_files:
        raise FileNotFoundError(f'No rotation CSV files found in {base_folder}/{season}')

    id_cols = ['TEAM_NAME', 'PERSON_ID', 'PLAYER_FIRST', 'PLAYER_LAST']

    # de-duplicate per file first so the concatenated frame stays small
    per_file = [pd.read_csv(rot_file)[id_cols].drop_duplicates()
                for rot_file in tqdm(rot_files)]

    all_players = (pd.concat(per_file, ignore_index=True)
                     .drop_duplicates()
                     .reset_index(drop=True))
    all_players.columns = [c.lower() for c in all_players.columns]
    all_players['player_name'] = all_players.player_first + ' ' + all_players.player_last

    return all_players


import plotly.graph_objects as go

def _plot_trace_args(players):
    """Build the trace-update dict (x, y, hover data, marker) for a subset of players."""
    return {'x': [players['off_norm']],
            'y': [players['def_norm']],
            'customdata': [(players['time'] / 60).round()],
            'hovertext': [players['player_name']],
            # NOTE(review): 'text' inside marker is kept from the original code;
            # confirm it is a recognised scatter.marker attribute.
            'marker': [{'size': players['time'] / 5000,
                        'text': players['player_name']}]}


def create_plot(res, title = None):
    """Create an interactive offense-vs-defense scatter with a team filter.

    Parameters
    ----------
    res : pandas.DataFrame
        Needs the columns ``off_norm``, ``def_norm``, ``time``,
        ``player_name`` and ``team_name``.
    title : str or None
        Figure title. When omitted, the notebook-level ``season`` variable is
        used if it exists (this was previously an implicit global
        dependency), otherwise an empty title.

    Returns
    -------
    plotly.graph_objects.Figure
        One marker per row of ``res``. Marker size is ``time / 5000`` and the
        hover shows ``time / 60`` rounded, labelled "Minutes" (so ``time`` is
        presumably in seconds, to be confirmed against the lineup data). A
        drop-down restricts the trace to one team or shows all rows.
    """
    if title is None:
        # keeps existing create_plot(res) calls working
        title = globals().get('season', '')

    marker_time_scale = 5000   # time units per marker-size unit
    time_per_minute = 60       # divisor used for the "Minutes" hover value

    fig = go.Figure(go.Scatter(x = res['off_norm'],
                               y = res['def_norm'],
                               mode = 'markers',
                               marker = {'size' : res['time'] / marker_time_scale},
                               customdata = (res['time'] / time_per_minute).round(),
                               hovertemplate = ('<b>%{hovertext}</b><br>' +
                                                'Off : %{x:.2f}<br>' +
                                                'Def : %{y:.2f}<br>' +
                                                'Minutes : %{customdata}' +
                                                '<extra></extra>'),
                               hovertext = res['player_name']))

    fig.update_layout(title={'text': title,
                             'y':0.98, 'x':0.5,
                             'xanchor': 'center', 'yanchor': 'top'},
                      autosize = True,
                      xaxis = dict(title = 'Offensive PPM'),
                      yaxis = dict(title = 'Defensive PPM'))

    # dashed reference lines at the mean of each axis over all rows of res
    fig.add_vline(x=res['off_norm'].mean(), line_width=2, line_dash="dash", line_color="red")
    fig.add_hline(y=res['def_norm'].mean(), line_width=2, line_dash="dash", line_color="red")

    # one drop-down entry for everything, then one per team
    buttons = [dict(method='update', label='All', args=[_plot_trace_args(res)])]
    for team in res.team_name.unique():
        team_rows = res.loc[res.team_name == team]
        buttons.append(dict(method='update', label=team, args=[_plot_trace_args(team_rows)]))

    fig.update_layout(updatemenus=[dict(buttons=buttons, direction='down', x=0.1, y=1.1, showactive=True)])

    return fig

In [10]:
# Season to analyse. The string is used as the sub-folder name when the data
# files are globbed, and create_plot reads this global for the figure title.
season = '1997-98'
# Run sampling, FM training and prediction for the season; returns one row per
# (team, player) with the aggregated off_norm / def_norm ratings.
res = process_season(season)

1997-98


100%|█████████████████████████████████████████████| 2/2 [00:02<00:00,  1.33s/it]
100%|██████████████████████████████████████| 1189/1189 [00:02<00:00, 402.73it/s]


In [11]:
# Build the interactive scatter; the figure is only assigned here, not displayed.
fig = create_plot(res)