# Train Contributions

In [None]:
import os
import io
from glob import glob

import pandas as pd
import numpy as np
from tqdm import tqdm
from wurlitzer import pipes, STDOUT
import scipy.sparse as sp
from sklearn.preprocessing import power_transform
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
import xlearn

In [None]:
# Make sure the output directory for the per-season prediction pickles exists.
# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs('data/contribution_predictions', exist_ok=True)

## Parameters

In [None]:
# Input pickles matched by the glob pattern, processed in sorted path order.
season_files = sorted(glob('data/lineup_scores/*.pkl'))

# Quantiles of the per-player total-time distribution; the training cell uses
# each one as a minimum-time cutoff when filtering lineup rows.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5]

# Wider grids, kept for reference; the active values are the single-element
# lists below.
#lrs = [0.01, 0.02, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.5]
#lmbs = [0.00002, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
lrs = [0.05]      # learning rates ('lr' in the xlearn parameter dict)
lmbs = [0.00002]  # regularisation strengths ('lambda' in the parameter dict)

# Number of models trained for every (lr, lambda) pair; together with the
# quantile sweep this determines the ensemble size per season.
n_repeats = 5

# One entry per model to train: each (lr, lambda) pair repeated n_repeats times.
params = [(lr, lmb) for lr in lrs for lmb in lmbs for _ in range(n_repeats)]

## Train ensemble

In [None]:
# Train an ensemble of factorization machines (xlearn) for every season file
# and store each member's per-player offensive/defensive contribution.
#
# For each season:
#   1. sum the time every player spent in any lineup slot,
#   2. keep only lineup rows whose least-used player exceeds 3200 total time,
#   3. for each quantile, additionally require the least-used player to exceed
#      that quantile of the per-player time distribution,
#   4. one-hot encode the ten lineup slots (separate offensive and defensive
#      feature ids per player) and fit one model per entry of `params`,
#   5. read each feature's contribution by predicting on an identity matrix.
# The concatenated results are pickled under data/contribution_predictions/.

off_cols = ['off1', 'off2', 'off3', 'off4', 'off5']
def_cols = ['def1', 'def2', 'def3', 'def4', 'def5']
lineup_cols = off_cols + def_cols
time_cols = [col + '_time' for col in lineup_cols]
id_cols = [col + '_id' for col in lineup_cols]

# Scratch files exchanged with xlearn. They are fixed paths that get
# overwritten on every iteration, so two concurrent runs would clash.
predict_file = '/tmp/nba_predict.txt'
train_file = '/tmp/nba_train.txt'
valid_file = '/tmp/nba_valid.txt'
full_train_file = '/tmp/nba_full_train.txt'
model_txt_file = '/tmp/model.txt'
model_file = '/tmp/model.out'
pred_file = '/tmp/predict.txt'

for season_file in tqdm(season_files):

    filename = 'data/contribution_predictions/' + os.path.basename(season_file)

    # Seasons that already have an output file are not recomputed.
    if os.path.exists(filename):
        continue

    res = []
    count = 0  # running index of the ensemble member within this season

    # Everything up to the 3200 filter is independent of the quantile, so it
    # is computed once per season instead of once per quantile.
    base_df = pd.read_pickle(season_file)

    # Total time per player: every row contributes its `seconds` to each of
    # the ten players listed in it.
    player_time = pd.DataFrame()
    player_time['PERSON_ID'] = base_df[lineup_cols].stack().values
    player_time['TIME'] = base_df.seconds.repeat(len(lineup_cols)).values
    player_time = player_time.groupby('PERSON_ID').TIME.sum().reset_index()

    # Series.map keeps base_df's own index, so the lookup stays correct even
    # if the pickled frame does not carry a default RangeIndex.
    time_by_player = player_time.set_index('PERSON_ID').TIME
    for col, time_col in zip(lineup_cols, time_cols):
        base_df[time_col] = base_df[col].map(time_by_player)

    good_rows = base_df[time_cols].min(axis=1) > 3200
    base_df = base_df[good_rows].copy()
    base_df.reset_index(inplace=True, drop=True)
    base_min_time = base_df[time_cols].min(axis=1)

    for quantile in quantiles:

        # Quantile-specific cutoff on the least-used player of each row.
        good_rows = base_min_time > player_time.TIME.quantile(quantile)
        df = base_df[good_rows].copy()
        df.reset_index(inplace=True, drop=True)

        # Feature ids: player i gets column i as an offensive feature and
        # column i + n_players as a defensive feature.
        # NOTE(review): ids are derived from the offensive slots only; a
        # player who appears solely in a defensive slot gets a NaN id —
        # confirm this cannot happen in the input data.
        player_df = pd.DataFrame()
        player_df['PERSON_ID'] = df[off_cols].unstack().unique()
        player_df['OFF_ID'] = np.arange(len(player_df))
        player_df['DEF_ID'] = np.arange(len(player_df)) + len(player_df)
        n_features = len(player_df) * 2

        ids_by_player = player_df.set_index('PERSON_ID')
        for col in off_cols:
            df[col + '_id'] = df[col].map(ids_by_player.OFF_ID)
        for col in def_cols:
            df[col + '_id'] = df[col].map(ids_by_player.DEF_ID)

        # Regression target: power-transformed points per second.
        target = power_transform((df.points / df.seconds).values.reshape(-1, 1)).ravel()

        # Sparse design matrix with a 1 for each feature id present in a row.
        cols = df[id_cols].stack()
        rows = cols.index.get_level_values(0)
        z = [1] * len(rows)

        mat = sp.csr_matrix((z, (rows, cols)), shape=(len(df), n_features))

        # Identity matrix: each row activates exactly one feature, so the
        # prediction for row j is the model output for feature j alone.
        # The labels are placeholders required by the file format.
        dump_svmlight_file(sp.eye(n_features), [1] * n_features, predict_file)

        # NOTE: no random_state is passed, so the split differs between runs.
        X_train, X_val, y_train, y_val = train_test_split(mat, target, test_size=0.1)

        dump_svmlight_file(X_train, y_train, train_file)
        dump_svmlight_file(X_val, y_val, valid_file)
        dump_svmlight_file(mat, target, full_train_file)

        for (lr, lmb) in params:

            train_param = {'task': 'reg', 'init': 0.1, 'k': 1, 'lr': lr, 'lambda': lmb}

            # Cross-validate on the training split to obtain a loss estimate.
            fm = xlearn.create_fm()
            fm.setTrain(train_file)
            fm.setValidate(valid_file)

            out = io.StringIO()
            with pipes(stdout=out, stderr=STDOUT):
                fm.cv(train_param)
            cv_log = out.getvalue()

            # The loss is only available in the captured output; fail with the
            # full log if the expected marker is missing.
            _, marker, tail = cv_log.partition('Average mse_loss:')
            if not marker:
                raise RuntimeError(
                    'xlearn cv output does not contain "Average mse_loss:":\n' + cv_log)
            cv_mse = float(tail.split('\n')[0].strip())

            # Refit on all rows; this is the model used for the contributions.
            fm = xlearn.create_fm()
            fm.setTrain(full_train_file)
            fm.setTXTModel(model_txt_file)

            # Output is captured only to keep it out of the notebook.
            with pipes(stdout=io.StringIO(), stderr=STDOUT):
                fm.fit(train_param, model_file)

            # Predict on the identity matrix: one value per feature id.
            fm.setTest(predict_file)
            with pipes(stdout=io.StringIO(), stderr=STDOUT):
                fm.predict(model_file, pred_file)

            with open(pred_file) as f:
                pred_values = [float(line) for line in f]
            # Position in the prediction file == feature id.
            contribution = pd.Series(pred_values, dtype=float)

            player_df_temp = player_df.copy()
            player_df_temp['off_contribution'] = player_df_temp.OFF_ID.map(contribution)
            player_df_temp['def_contribution'] = player_df_temp.DEF_ID.map(contribution)

            # z-score each contribution across the players of this model.
            player_df_temp['off_contribution_norm'] = (
                (player_df_temp.off_contribution - player_df_temp.off_contribution.mean())
                / player_df_temp.off_contribution.std())
            player_df_temp['def_contribution_norm'] = (
                (player_df_temp.def_contribution - player_df_temp.def_contribution.mean())
                / player_df_temp.def_contribution.std())

            # Attach each player's total time (column TIME).
            player_df_temp = player_df_temp.merge(player_time, on='PERSON_ID', how='left')

            player_df_temp['cv_mse'] = cv_mse
            player_df_temp['quantile'] = quantile
            player_df_temp['counter'] = count
            count += 1

            res.append(player_df_temp)

    res = pd.concat(res, ignore_index=True)
    # Season label = input file name up to its first dot.
    res['season'] = os.path.basename(season_file).split('.')[0]

    res.to_pickle(filename)