In [2]:
# Load Data
from datetime import timedelta

import pandas as pd
import numpy as np
import dateutil

# When True, every loaded or derived frame is rendered through show_df.
VERBOSE = False
# Selects the branch below that sets position groups and input file paths
# ('NHL', 'NBA', 'NFL' or 'MLB'; anything else raises NotImplementedError).
SPORT = 'MLB'
# When True, show_df renders frames with qgrid instead of the plain display.
USE_QGRID = True
    
def show_df(df):
    """Render ``df`` in the notebook.

    Uses an interactive qgrid widget when ``USE_QGRID`` is true, otherwise the
    regular notebook ``display``.
    """
    if not USE_QGRID:
        display(df)
        return

    # Imported lazily so qgrid is only required when it is actually used.
    import qgrid
    grid_widget = qgrid.show_grid(df)
    display(grid_widget)

# transformation functions to apply to dfs
def _identity_xform(df):
    """Default transform: hand the frame back untouched."""
    return df

# Every source starts with the no-op transform; a sport-specific branch below
# may rebind any of these names to a real clean-up function.
xform_rotowire = xform_rotogrinder = xform_fantasy = _identity_xform

if SPORT == 'NHL':
    pos_groups = [None, ['G'], ['LW', 'RW', 'C', 'D']]
    rotowire_file = "/home/delano/Google Drive/fantasy/nhl/external-projections/rotowire_all.csv"
    rotogrinder_file = "/home/delano/Google Drive/fantasy/nhl/external-projections/grinder_all.csv"
    fantasy_file = "/home/delano/scratch/nhl-prediction-dump.csv"
elif SPORT == 'NBA':
    pos_groups = [None]
    rotowire_file = "/home/delano/Google Drive/fantasy/nba/external-projections/rotowire_all.csv"
    rotogrinder_file = "/home/delano/Google Drive/fantasy/nba/external-projections/grinder_all.csv"
    fantasy_file = "/home/delano/scratch/nba-prediction-dump.csv"    
elif SPORT == 'NFL':
    pos_groups = [None, ['QB'], ['RB'], ['WR', 'TE'], ['DEF']]
    rotowire_file = "/home/delano/Google Drive/fantasy/nfl/external-projections/rotowire_all.csv"
    rotogrinder_file = "/home/delano/Google Drive/fantasy/nfl/external-projections/grinder_all.csv"
    fantasy_file = "/home/delano/scratch/nfl-prediction-dump.csv" 
    # transform team predictions to position DEF, and convert all dates to sunday
    def to_sunday(date_str):
        """Snap a date string to a Sunday and return it as 'YYYY-MM-DD'.

        A Monday moves back one day to the preceding Sunday. Every other day
        moves forward to the next Sunday (a Sunday maps to itself). Note this
        is not always the *nearest* Sunday: e.g. a Tuesday moves five days
        forward.

        Args:
            date_str: any date string accepted by ``dateutil.parser.parse``.

        Returns:
            The resulting Sunday formatted as ``%Y-%m-%d``.
        """
        # Import the submodule explicitly: a bare ``import dateutil`` only
        # exposes ``dateutil.parser`` when some other package happened to
        # import it first.
        from dateutil import parser as date_parser

        parsed = date_parser.parse(date_str)
        weekday = parsed.weekday()  # Monday == 0 ... Sunday == 6
        day_shift = -1 if weekday == 0 else 6 - weekday
        return (parsed + timedelta(days=day_shift)).strftime("%Y-%m-%d")

    def xform_fantasy(df):
        """Normalise the NFL prediction dump so it lines up with external data.

        * Rows without a ``player_id`` are treated as team rows and get
          position ``'DEF'``.
        * For every service/stat pair the ``*_score_def`` / ``*_score_off``
          columns are collapsed into a single ``*_score`` column (defensive
          score for team rows, offensive score otherwise) and then dropped.
        * ``date`` is snapped to a Sunday via ``to_sunday``.

        Args:
            df: frame with ``player_id``, ``pos``, ``date`` and the
                ``{pred,calc}:{dk,fd,y}_score_{def,off}`` columns.

        Returns:
            A new transformed frame; the frame passed in is left unmodified.
        """
        # isna() also copes with a non-numeric player_id column, where
        # np.isnan would raise TypeError.
        is_team_row = df['player_id'].isna()

        # assign() returns a new frame, so the caller's frame is never mutated.
        df = df.assign(pos=np.where(is_team_row, 'DEF', df['pos']))

        for service in ['dk', 'fd', 'y']:
            for stat_type in ['pred', 'calc']:
                score_col = '{}:{}_score'.format(stat_type, service)
                def_col = score_col + '_def'
                off_col = score_col + '_off'
                df[score_col] = np.where(is_team_row, df[def_col], df[off_col])
                df = df.drop(columns=[def_col, off_col])

        # Column-wise map instead of a row-wise apply: faster, and it still
        # yields a Series (not a DataFrame) when the frame has no rows.
        df['date'] = df['date'].map(to_sunday)
        return df
    
    def xform_rotogrinder(df):
        """Relabel rotogrinder team-defense rows in place and return ``df``.

        Rows whose ``position`` is ``'DST'`` get an empty ``player`` name and
        the position ``'DEF'``; all other rows are left as they are.
        """
        is_defense = df['position'] == 'DST'
        df['player'] = np.where(is_defense, '', df['player'])
        df['position'] = np.where(is_defense, 'DEF', df['position'])
        return df

    def xform_rotowire(df):
        """Relabel rotowire team-defense rows in place and return ``df``.

        The player name is taken to be the second column. Rows whose ``POS``
        is ``'D'`` get an empty player name and the position ``'DEF'``.
        """
        name_col = df.columns[1]
        is_defense = df['POS'] == 'D'
        df[name_col] = np.where(is_defense, '', df[name_col])
        df['POS'] = np.where(is_defense, 'DEF', df['POS'])
        return df
    
    
elif SPORT == 'MLB':
    pos_groups = [None, ['P'], ['C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF']]
    rotowire_file = None
    rotogrinder_file = "/home/delano/Google Drive/fantasy/mlb/external-projections/grinder_all.csv"
    fantasy_file = "/home/delano/scratch/mlb-prediction-dump.csv"    
else:
    raise NotImplementedError()

def _load_projections(csv_path, xform, label):
    """Read ``csv_path``, run it through ``xform`` and return the frame.

    When VERBOSE is set, ``label`` and the transformed frame are displayed.
    """
    loaded_df = xform(pd.read_csv(csv_path))
    if VERBOSE:
        display(label)
        show_df(loaded_df)
    return loaded_df

# External projection sources are optional: a missing path yields None.
rotowire_df = (_load_projections(rotowire_file, xform_rotowire, "rotowire")
               if rotowire_file is not None else None)
rotogrinder_df = (_load_projections(rotogrinder_file, xform_rotogrinder, "rotogrinder")
                  if rotogrinder_file is not None else None)
# The fantasy prediction dump is always loaded.
fantasy_df = _load_projections(fantasy_file, xform_fantasy, "fantasy")

FileNotFoundError: [Errno 2] File /home/delano/scratch/mlb-prediction-dump.csv does not exist: '/home/delano/scratch/mlb-prediction-dump.csv'

In [None]:
# Build a (date, team, player) x service table of rotowire projected points.
if rotowire_df is None:
    # No rotowire projections for this sport (rotowire_file is None). Use an
    # empty frame with the same index layout so the later left join simply
    # contributes no columns instead of this cell crashing on None.
    rotowire_pts_df = pd.DataFrame(
        index=pd.MultiIndex.from_arrays([pd.to_datetime([]), [], []],
                                        names=['date', 'team', 'player']))
else:
    # The first column is split on '.', giving the service (2nd part) and the
    # date (3rd part) -- presumably a source file name, as with rotogrinder's
    # `file` column; confirm against the CSV.
    # Selected positionally via iloc: integer lookups on a label-indexed row
    # (row[0]) are deprecated in recent pandas.
    rotowire_source = rotowire_df.iloc[:, 0]
    rotowire_pts_df = (
        rotowire_df
        .rename(columns={rotowire_df.columns[1]: 'player', 'TEAM': 'team'})
        .assign(
            service=rotowire_source.map(lambda name: 'rotowire:' + name.split('.')[1]),
            date=pd.to_datetime(rotowire_source.map(lambda name: name.split('.')[2])),
        )
        .pivot_table(index=['date', 'team', 'player'], values='FPTS', columns='service')
    )
if VERBOSE:
    show_df(rotowire_pts_df)

In [None]:
# Build a (date, team, player) x service table of rotogrinder projected points.
if rotogrinder_df is None:
    # The load cell sets rotogrinder_df to None when no file is configured.
    # Use an empty frame with the same index layout so the later left join
    # contributes no columns.
    rotogrinder_pts_df = pd.DataFrame(
        index=pd.MultiIndex.from_arrays([pd.to_datetime([]), [], []],
                                        names=['date', 'team', 'player']))
else:
    # The `file` column is split on '.', giving the service (2nd part) and the
    # date (3rd part).
    rotogrinder_source = rotogrinder_df['file']
    # assign() builds a new frame, so the raw rotogrinder_df is not modified
    # and re-running this cell always starts from the same input.
    rotogrinder_pts_df = (
        rotogrinder_df
        .assign(
            service=rotogrinder_source.map(lambda name: 'rotogrinder:' + name.split('.')[1]),
            date=pd.to_datetime(rotogrinder_source.map(lambda name: name.split('.')[2])),
        )
        .pivot_table(index=['date', 'team', 'player'], values='pts', columns='service')
    )
if VERBOSE:
    show_df(rotogrinder_pts_df)

In [None]:
# Rows without a player_id get an empty player name (the NFL transforms use the
# same '' name for team-defense rows in the external projections).
# isna() also works when player_id is not a float column.
is_team_row = fantasy_df['player_id'].isna()
player_names = np.where(is_team_row, '',
                        fantasy_df['first_name'] + ' ' + fantasy_df['last_name'])

# assign() returns a new frame, so the raw fantasy_df is left untouched.
fantasy_pts_df = fantasy_df.assign(player=player_names)

# Keep the join keys plus every '<stat>:<service>...' column except goal columns.
fantasy_cols = ['date', 'team', 'pos', 'player'] + \
    [col for col in fantasy_pts_df.columns if ':' in col and 'goal' not in col]

fantasy_pts_df = (
    fantasy_pts_df[fantasy_cols]
    # strip everything from the first '_' on, e.g. 'pred:dk_score' -> 'pred:dk'
    .rename(columns=lambda col: col.split('_')[0] if '_' in col else col)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

if VERBOSE:
    show_df(fantasy_pts_df)

In [None]:
# merge data
# Left-join each external projection table onto the fantasy rows, matching the
# fantasy columns against each table's (date, team, player) index.
join_keys = ['date', 'team', 'player']
merged_df = fantasy_pts_df
for projections_df in (rotowire_pts_df, rotogrinder_pts_df):
    merged_df = merged_df.join(projections_df, on=join_keys, how='left')

if VERBOSE:
    show_df(merged_df)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Parallel accumulators: entry i of every list describes one scored
# (service, predictor, position group) combination.
services = []
predictors = []
mae = []
r2 = []
positions = []

def analyze_performance(df, true_col, pred_col, pos=None):
    """Score one prediction column against the actual column and record it.

    Appends one entry to each of the module-level lists ``services``,
    ``predictors``, ``mae``, ``r2`` and ``positions``.

    Args:
        df: rows to score; must contain ``true_col`` and ``pred_col``.
        true_col: name of the column holding the actual score.
        pred_col: prediction column named ``'<predictor>:<service>'``; the
            predictor and service labels are taken from this name rather than
            from whatever loop variables happen to exist at module level.
        pos: the position group the rows were filtered to, or None for all
            positions.
    """
    predictor, _, service = pred_col.partition(':')
    # Compute both metrics before touching the lists, so a failure in either
    # one cannot leave the accumulators with different lengths.
    abs_error = mean_absolute_error(df[true_col], df[pred_col])
    r2_value = r2_score(df[true_col], df[pred_col])
    services.append(service)
    predictors.append(predictor)
    mae.append(abs_error)
    r2.append(r2_value)
    positions.append(pos)


for service in ['y', 'dk', 'fd']:
    true_col = 'calc:' + service
    for predictor in ['pred', 'rotogrinder', 'rotowire']:
        pred_col = predictor + ':' + service
        if pred_col not in merged_df.columns:
            display("skipping {}, missing prediction column found".format(pred_col))
            continue
        # only rows that actually have a prediction from this predictor
        df = merged_df[merged_df[pred_col].notna()]
        if len(df) == 0:
            display("skipping {}, no prediction data found".format(pred_col))
            continue
        for pos in pos_groups:
            # a None group means "all positions"
            pos_df = df if pos is None else df[df.pos.isin(pos)]
            if len(pos_df) == 0:
                display("skipping {} {}, no data found".format(pred_col, pos))
                continue

            analyze_performance(pos_df, true_col, pred_col, pos=pos)

scores_df = pd.DataFrame({
    'service': services,
    'predictor': predictors,
    'pos': [','.join(pos) if pos is not None else 'all'
            for pos in positions],
    'mae': mae,
    'r2': r2
})

scores_df

In [None]:
import matplotlib.pyplot as plt

PLOT_SIDE_LENGTH = 5

# One row of axes per entry in scores_df: predicted-vs-actual on the left,
# residuals on the right.
# squeeze=False keeps `axes` two-dimensional even when there is a single row
# (otherwise axes[i, 0] raises IndexError); at least one row is always
# requested because matplotlib rejects a zero-row grid.
n_plot_rows = max(len(scores_df), 1)
fig, axes = plt.subplots(n_plot_rows, 2,
                         figsize=(PLOT_SIDE_LENGTH * 2, PLOT_SIDE_LENGTH * n_plot_rows),
                         squeeze=False)

axis_i = 0

for service in ['y', 'dk', 'fd']:
    true_col = 'calc:' + service
    for predictor in ['pred', 'rotogrinder', 'rotowire']:
        pred_col = predictor + ':' + service
        if pred_col not in merged_df.columns:
            continue

        df = merged_df[merged_df[pred_col].notna()].copy()
        df['residual'] = df[true_col] - df[pred_col]
        # Axis range shared by every position group of this predictor/service,
        # padded by a few points on each side.
        min_pts = min(df[true_col].min(), df[pred_col].min()) - 3
        max_pts = max(df[true_col].max(), df[pred_col].max()) + 4

        for pos in pos_groups:
            pos_df = df[df.pos.isin(pos)] if pos is not None else df

            if len(pos_df) == 0:
                continue

            # look up the metrics recorded for this combination
            score_df_pos = ','.join(pos) if pos is not None else 'all'
            result = scores_df[(scores_df.service == service) &
                               (scores_df.predictor == predictor) &
                               (scores_df.pos == score_df_pos)]
            # .iloc[0] instead of float(Series), which is deprecated in pandas
            r2_value = float(result['r2'].iloc[0])
            mae_value = float(result['mae'].iloc[0])

            ax_pred, ax_resid = axes[axis_i, 0], axes[axis_i, 1]

            # plot pred vs true
            ax_pred.set_xlim(min_pts, max_pts)
            ax_pred.set_ylim(min_pts, max_pts)
            ax_pred.text(min_pts + 1, max_pts - 5,
                         'r2={:.3}, mae={:.3}'.format(r2_value, mae_value))

            ax_pred.set_title('{}:{}:{} (n={})'.format(predictor, service, pos or 'all', len(pos_df)))
            ax_pred.set_xlabel(true_col)
            ax_pred.set_ylabel(pred_col)
            ax_pred.scatter(pos_df[true_col], pos_df[pred_col])
            # y = x reference line: a perfect prediction falls on it
            ax_pred.plot([min_pts, max_pts], [min_pts, max_pts], '-k')

            # plot of residuals
            ax_resid.set_xlim(min_pts, max_pts)
            ax_resid.set_ylim((max_pts - min_pts) / -2, (max_pts - min_pts) / 2)

            ax_resid.set_title('{}:{}:{} residual (n={})'.format(predictor, service, pos or 'all', len(pos_df)))
            ax_resid.set_xlabel(true_col)
            ax_resid.set_ylabel(pred_col + ' residual')
            ax_resid.scatter(pos_df[true_col], pos_df['residual'])
            # zero-residual reference line
            ax_resid.plot([min_pts, max_pts], [0, 0], '-k')

            axis_i += 1