UNDER DEVELOPMENT
- Field-aware factorization machine (FFM) fitted on lineup data from all seasons

# Prepare Model

In [None]:
import os
from glob import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import power_transform
from sklearn.model_selection import train_test_split
from multiprocessing import Pool

In [None]:
# Directory that receives every FFM artefact written by this notebook.
# makedirs(exist_ok=True) also creates a missing parent 'data' directory and
# is safe to re-run, unlike an exists()/mkdir() pair.
os.makedirs('data/ffm', exist_ok=True)

In [None]:
# One pickle per season; sorted by path so seasons are processed in a stable order.
season_files = glob('data/lineup_scores/*.pkl')
season_files.sort()

In [None]:
def load_season(season_file, lineup_quantile, min_player_seconds=3200):
    """Load one season of lineup scores and keep only well-sampled lineups.

    Steps:
      1. Sum ``seconds`` and ``points`` over identical offence/defence
         ten-player combinations.
      2. Drop combinations whose total ``seconds`` is not above the
         ``lineup_quantile`` quantile of all combinations.
      3. Total the remaining seconds per player (over all ten slots) and drop
         combinations containing any player at or below ``min_player_seconds``.

    Parameters
    ----------
    season_file : str
        Path to a pickled DataFrame with columns ``off1..off5``,
        ``def1..def5``, ``seconds``, ``points`` and ``season``.
    lineup_quantile : float
        Quantile (0-1) of lineup ``seconds`` used as the lower cut-off.
    min_player_seconds : float, default 3200
        Minimum total seconds a player needs for a lineup to be kept.

    Returns
    -------
    pandas.DataFrame
        The surviving lineups with a fresh 0..n-1 index and ten extra
        ``<slot>_time`` columns holding each slot player's total seconds.
    """
    lineup_cols = ['off1', 'off2', 'off3', 'off4', 'off5',
                   'def1', 'def2', 'def3', 'def4', 'def5']

    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(season_file)

    # group the same exact lineups, to get an overall season result
    df = (df.groupby(lineup_cols)
            .agg({'seconds': 'sum', 'points': 'sum', 'season': 'first'})
            .reset_index())

    # limit to longer lineup times, reduces the number of 0 scores
    # The index is reset so every later step works on a clean 0..n-1 index.
    df = df[df.seconds > df.seconds.quantile(lineup_quantile)].reset_index(drop=True)

    # Total seconds per player: every lineup's seconds count once for each of
    # its ten slots (row-major flattening keeps players and seconds aligned).
    player_time = pd.Series(
        np.repeat(df.seconds.to_numpy(), len(lineup_cols)),
        index=df[lineup_cols].to_numpy().ravel(),
    ).groupby(level=0).sum()

    # Series.map keeps df's own index, so each value stays on its own row
    # (assigning a column taken from a merged frame aligns on index labels
    # and can put times on the wrong rows).
    time_cols = []
    for slot in lineup_cols:
        time_col = slot + '_time'
        df[time_col] = df[slot].map(player_time)
        time_cols.append(time_col)

    # prelim analyses: to avoid bad data, limit to lineups whose least-used
    # player has more than min_player_seconds of playing time
    good_rows = df[time_cols].min(axis=1) > min_player_seconds

    return df[good_rows].reset_index(drop=True)

In [None]:
# Quantile of lineup seconds handed to load_season as its lower cut-off.
lineup_quantile = 0.5
season_frames = []

for season_file in tqdm(season_files, position=0, leave=True):

    # Each season is loaded in its own single-worker process
    # (presumably so the memory used while loading is released afterwards; confirm).
    with Pool(1) as pool:
        loaded = pool.starmap(load_season, [(season_file, lineup_quantile)])

    season_frames.append(loaded[0])

# Stack all seasons into one frame with a fresh 0..n-1 index.
all_df = pd.concat(season_frames, ignore_index=True)

In [None]:
# Feature names by position: 0 = 'off', 1 = 'def', then one entry per season
# in order of first appearance in all_df.
features = ['off', 'def']
features.extend(all_df['season'].unique().tolist())

In [None]:
# Dense integer id (0..n-1) for every player that appears in any lineup slot.
# Offensive slots come first, so players seen on offence are numbered in order
# of first appearance there; the defensive slots are included so that a player
# who only ever appears on defence still gets an id instead of NaN in the
# def*_id lookups.
player_df = pd.DataFrame()
player_df['PERSON_ID'] = all_df[['off1', 'off2', 'off3', 'off4', 'off5',
                                 'def1', 'def2', 'def3', 'def4', 'def5']].unstack().unique()
player_df['ID'] = np.arange(len(player_df))

In [None]:
# Translate every lineup slot's PERSON_ID into the dense id from player_df.
# Series.map keeps all_df's own index, so each id stays on its own row whatever
# the index looks like, and no full merged copy of all_df is built per slot.
# A player missing from player_df comes out as NaN.
player_id_lookup = player_df.set_index('PERSON_ID')['ID']

for slot in ['off1', 'off2', 'off3', 'off4', 'off5',
             'def1', 'def2', 'def3', 'def4', 'def5']:
    all_df[slot + '_id'] = all_df[slot].map(player_id_lookup)

In [None]:
# Regression target: points per second of each lineup, passed through
# power_transform (which takes and returns a 2-D column, hence reshape/ravel).
scoring_rate = (all_df['points'] / all_df['seconds']).to_numpy()
all_df['target'] = power_transform(scoring_rate.reshape(-1, 1)).ravel()

In [None]:
# Persist the prepared frame (player ids and target included) to disk.
all_df.to_pickle('data/ffm/nba_all_df.pkl')

In [None]:
# Fixed seed so the train/test text files written below are identical on every run.
train_df, test_df = train_test_split(all_df, random_state=42)

In [None]:
def save_txt(filename, df, feature_names=None):
    """Write ``df`` to ``filename`` as one space-separated text line per row.

    Each line starts with the row's ``target`` and is followed by two
    ``a:b:1`` tokens per lineup slot:

    * ``<player id>:0:1`` for offence slots / ``<player id>:1:1`` for defence
      slots, and
    * ``<player id>:<season index>:1``, where the season index is the position
      of the row's ``season`` in ``feature_names``.

    NOTE(review): xlearn documents the libffm token as ``field:feature:value``;
    here the player id sits in the first position. Confirm this ordering is
    intended.

    Parameters
    ----------
    filename : str
        Path of the text file to (over)write.
    df : pandas.DataFrame
        Needs ``target``, ``season``, ``off1_id..off5_id`` and
        ``def1_id..def5_id`` columns.
    feature_names : sequence, optional
        Ordered feature names used to index the season. Defaults to the
        notebook-level ``features`` list.

    Raises
    ------
    KeyError
        If a row's season is not present in ``feature_names``.
    """
    if feature_names is None:
        feature_names = features

    # First position of every feature name, built once instead of scanning
    # the list with np.isin for every row.
    feature_index = {}
    for position, name in enumerate(feature_names):
        feature_index.setdefault(name, position)

    off_cols = ['off1_id', 'off2_id', 'off3_id', 'off4_id', 'off5_id']
    def_cols = ['def1_id', 'def2_id', 'def3_id', 'def4_id', 'def5_id']

    def _format_id(value):
        # Ids stored as floats (e.g. after NaN-upcasting) must not be written
        # as "12.0"; whole-number floats are emitted as plain integers.
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            return str(int(value))
        return str(value)

    # Plain Python values per row; avoids iterrows(), which is slow and
    # upcasts integer ids to float when every column is numeric.
    rows = zip(df['target'].tolist(),
               df['season'].tolist(),
               df[off_cols].to_numpy().tolist(),
               df[def_cols].to_numpy().tolist())

    with open(filename, 'w') as f_out:

        for target, season, off_ids, def_ids in tqdm(rows, total=len(df)):

            season_feature = feature_index[season]

            tokens = [str(target)]

            for oid in map(_format_id, off_ids):
                tokens.append(oid + ':0:1')
                tokens.append(oid + ':' + str(season_feature) + ':1')

            for did in map(_format_id, def_ids):
                tokens.append(did + ':1:1')
                tokens.append(did + ':' + str(season_feature) + ':1')

            f_out.write(' '.join(tokens) + '\n')
In [None]:
# Three text files: the train and test splits, plus the complete data set.
for out_path, frame in (
    ('data/ffm/nba_ffm_train.txt', train_df),
    ('data/ffm/nba_ffm_test.txt', test_df),
    ('data/ffm/nba_ffm_full_train.txt', all_df),
):
    save_txt(out_path, frame)

In [None]:
# Build the prediction frame: one row per distinct (player, id, season) seen in
# an offensive slot, duplicated once with poss=0 and once with poss=1.
off_person_cols = ['off1', 'off2', 'off3', 'off4', 'off5']
off_latent_cols = ['off1_id', 'off2_id', 'off3_id', 'off4_id', 'off5_id']

# Row-major flattening: the five slots of a lineup are consecutive, which is
# why each season value is repeated five times.
player_seasons = pd.DataFrame({
    'PERSON_ID': all_df[off_person_cols].to_numpy().ravel(),
    'LAT_ID': all_df[off_latent_cols].to_numpy().ravel(),
    'season': np.repeat(all_df['season'].to_numpy(), 5),
}).drop_duplicates()

pred_df = pd.concat(
    [player_seasons.assign(poss=0), player_seasons.assign(poss=1)],
    ignore_index=True,
)

pred_df.to_pickle('data/ffm/nba_pred_df.pkl')

In [None]:
pred_file = 'data/ffm/nba_ffm_pred.txt'

# First position of every feature name, built once instead of scanning
# `features` with np.isin for every row.
feature_index = {}
for position, name in enumerate(features):
    feature_index.setdefault(name, position)

# Plain Python values per row; avoids iterrows(), which is slow and upcasts
# integer ids to float ("12.0") when every column is numeric.
pred_rows = zip(pred_df['LAT_ID'].tolist(),
                pred_df['poss'].tolist(),
                pred_df['season'].tolist())

with open(pred_file, 'w') as f_out:

    for lat_id, poss, season in tqdm(pred_rows, total=len(pred_df)):

        season_feature = feature_index[season]

        # Every line carries the constant label '1' (presumably a placeholder,
        # since this file is only used for prediction; confirm), then
        # <id>:<poss>:1 and <id>:<season index>:1.
        f_out.write(f'1 {lat_id}:{poss}:1 {lat_id}:{season_feature}:1\n')

# Train Models

In [None]:
import pandas as pd
import xlearn
from wurlitzer import pipes, STDOUT
from tqdm import tqdm
import io
import numpy as np
from multiprocessing import Pool
import joblib
import os

In [None]:
# Full search grid, kept for reference:
#   lrs  = [0.01, 0.02, 0.05, 0.075, 0.1, 0.15, 0.2]
#   lmbs = [0.00002, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
#   ks   = [1, 2, 3, 4]

# Reduced grid currently in use.
lrs = [0.02, 0.05]          # learning rates
lmbs = [0.0002, 0.0005]     # lambda values
ks = [2, 3]                 # k values

# Every (lr, lmb, k) combination; lr varies slowest, k fastest.
params = []
for lr in lrs:
    for lmb in lmbs:
        for k in ks:
            params.append((lr, lmb, k))

In [None]:
def train(lr, lmb, k):
    """Cross-validate one FFM configuration, refit it on all data and predict.

    Parameters
    ----------
    lr : float
        Passed to xlearn as ``'lr'``.
    lmb : float
        Passed to xlearn as ``'lambda'``.
    k : int
        Passed to xlearn as ``'k'``.

    Returns
    -------
    cv_mse : float
        The number xlearn prints after ``Average mse_loss:`` during ``cv``.
    pred_values : numpy.ndarray
        Predictions for ``data/ffm/nba_ffm_pred.txt``, standardised to zero
        mean and unit standard deviation (all-NaN if every prediction is
        identical, because the standard deviation is then 0).

    Raises
    ------
    RuntimeError
        If the captured cross-validation output contains no
        ``Average mse_loss:`` entry.

    Side effects: overwrites ``data/ffm/model.txt``, ``data/ffm/model.out``
    and ``data/ffm/predict.txt``.
    """
    train_param = {'task': 'reg', 'init': 0.1, 'k': k, 'lr': lr, 'lambda': lmb}

    # --- cross-validation -------------------------------------------------
    fm = xlearn.create_ffm()
    fm.setTrain('data/ffm/nba_ffm_train.txt')
    fm.setValidate('data/ffm/nba_ffm_test.txt')

    # The CV loss is read from xlearn's console output, so capture that
    # output with wurlitzer and parse it.
    out = io.StringIO()
    with pipes(stdout=out, stderr=STDOUT):
        fm.cv(train_param)
    cv_log = out.getvalue()

    marker = 'Average mse_loss:'
    if marker not in cv_log:
        # Fail with the tail of the log instead of an opaque IndexError.
        raise RuntimeError(
            f"xlearn cv output has no '{marker}' entry for "
            f"lr={lr}, lambda={lmb}, k={k}; output ended with:\n{cv_log[-500:]}"
        )
    cv_mse = float(cv_log.split(marker)[1].split('\n')[0].strip())

    # --- final fit on the complete data set -------------------------------
    fm = xlearn.create_ffm()
    fm.setTrain('data/ffm/nba_ffm_full_train.txt')
    fm.setTXTModel('data/ffm/model.txt')

    # Output is captured only to keep it out of the notebook.
    with pipes(stdout=io.StringIO(), stderr=STDOUT):
        fm.fit(train_param, 'data/ffm/model.out')

    # --- prediction -------------------------------------------------------
    fm = xlearn.create_ffm()
    fm.setTest('data/ffm/nba_ffm_pred.txt')
    pred_file = 'data/ffm/predict.txt'
    with pipes(stdout=io.StringIO(), stderr=STDOUT):
        fm.predict('data/ffm/model.out', pred_file)

    # One prediction per line; blank lines (e.g. a trailing newline) are skipped.
    with open(pred_file) as f:
        pred_values = np.asarray([float(line) for line in f if line.strip()])

    # z-score so predictions from different configurations share one scale
    pred_values = (pred_values - pred_values.mean()) / pred_values.std()

    return cv_mse, pred_values

In [None]:
cv_mse_list = []
pred_values_list = []

for (lr, lmb, k) in tqdm(params, position=0, leave=True):

    # One cache file per configuration, named after its hyperparameters.
    # Delete these files after regenerating the FFM text files, otherwise
    # stale results are reused.
    filename = f'data/ffm/{lr}_{lmb}_{k}.pkl'

    if os.path.exists(filename):
        # Reuse the stored result so both lists keep one entry per entry of
        # `params`, in the same order.
        # NOTE: joblib.load unpickles; only load cache files you created.
        cv_mse, pred_values = joblib.load(filename)
    else:
        # Train in a throwaway single-worker process.
        with Pool(1) as pool:
            cv_mse, pred_values = pool.starmap(train, [(lr, lmb, k)])[0]
        joblib.dump((cv_mse, pred_values), filename)

    cv_mse_list.append(cv_mse)
    pred_values_list.append(pred_values)

In [None]:
# One copy of the prediction frame per trained configuration, each carrying
# that configuration's predictions and CV loss, stacked into a single frame.
all_pred_df = pd.concat(
    [
        pd.read_pickle('data/ffm/nba_pred_df.pkl').assign(pred=pred_values, cv_mse=cv_mse)
        for cv_mse, pred_values in zip(cv_mse_list, pred_values_list)
    ],
    ignore_index=True,
)
all_pred_df.to_pickle('data/ffm/all_pred_df.pkl')