In [38]:
import sys
# sys.path.append('')
from datetime import date
import pytest
from pandas import date_range
import pandas as pd
from horse.betsim.wrap.jcapper import JCapper
from horse.betsim.math import compute_probs_from_odds
import numpy as np

## Load JCapper data

In [2]:
# Load JCapper results for every day from 2017-07-03 through 2017-07-09.
# NOTE(review): the original comment called this the "Breeders Cup Results file";
# the dates loaded here are a week in July 2017 - confirm which was intended.
jcp = JCapper(verbose=True)
jcp.load(date_range(date(2017, 7, 3), date(2017, 7, 9)))
# Adds derived columns to jcp.df (presumably the prob_* / diff_* columns used
# further down - verify against JCapper.add_computed_columns).
jcp.add_computed_columns()
print('load_jcapper: loaded %d rows' % (len(jcp.df)))

jcapper.load(2017-07-03)
jcapper.load(2017-07-04)
jcapper.load(2017-07-05)
jcapper.load(2017-07-06)
jcapper.load(2017-07-07)
jcapper.load(2017-07-08)
jcapper.load(2017-07-09)
JCapper.load() filtering scratched horses reduced from 9447 to 8500


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df_payout = concat([df_payout, wps])
  self.df['log_ratio_effectivestarters_morningline'] = -1.0 * log(self.df.num_effective_starters_morning_line / self.df.num_starters)


load_jcapper: loaded 8500 rows


In [19]:
# Take copies of the loader's frames; later cells add columns to df_result.
df_payout = jcp.df_payout.copy()  # exotic bet payouts in an easier-to-use form
df_result = jcp.df.copy()  # race results dataframe (one row per runner)

In [16]:
def compute_simple_payout(df_res, attr_model, ascending=False, bet_amount=1.0):
    """
    Flag a flat Win bet on the top-ranked runner of every race and compute its payout.

    :param df_res: Dataframe from dataset for multiple races; it is not modified, a copy is returned
    :param attr_model: (string) column holding a prob or score that can be ranked within a race
    :param ascending: False if a higher value is better, i.e. probs
    :param bet_amount: stake placed on the selected runner, defaults to 1.0
    :return: copy of df_res with the columns is_win, rank_<attr_model>, bet_amount,
             is_wager, is_paid and payout added
    """
    out = df_res.copy()
    rank_col = 'rank_' + attr_model

    # 1 for the runner that officially finished first, 0 otherwise
    out['is_win'] = (out['official_finish_position'] == 1).astype(int)

    # rank of each runner inside its own race
    out[rank_col] = out.groupby('race_id')[attr_model].rank(ascending=ascending)

    # Only a sole top-ranked runner gets the stake: with the default average
    # tie-breaking, runners tied for first are ranked 1.5 or more and get no bet.
    out['bet_amount'] = (out[rank_col] < 1.5).astype(int) * bet_amount
    out['is_wager'] = (out['bet_amount'] > 0).astype(int)
    out['is_paid'] = out['is_wager'] * out['is_win']

    # missing win payouts count as 0; bet_amount is 0 on every runner we did not back
    out['payout'] = out['is_win'] * out['bet_amount'] * out['payout_win'].fillna(0.0)

    return out

In [5]:
def compute_advantage(df):
    """
    Compute (and print) a rough edge measure for the wagers flagged in ``df``.

    advantage = pct_win - pct_loss / mean_odds, where pct_win is the share of
    races with exactly one paid wager, pct_loss = 1 - pct_win and mean_odds is
    the mean ``final_tote_odds`` of the paid wagers.

    :param df: Dataframe with the columns race_id, is_paid and final_tote_odds
    :return: the advantage; NaN when no wager was paid (mean_odds is undefined)
             or when df holds no races
    """
    paid_per_race = df.groupby('race_id')['is_paid'].sum()

    # Share of races with exactly one paid wager. Computed directly so that a
    # sample without any paid race gives 0.0 instead of raising KeyError, which
    # value_counts(normalize=True)[1] did.
    pct_win = (paid_per_race == 1).mean()

    pct_loss = 1.0 - pct_win
    mean_odds = df.loc[df['is_paid'] > 0, 'final_tote_odds'].mean()
    advantage = pct_win - pct_loss / mean_odds
    print(advantage)
    return advantage

## Implied probability derived from final tote odds.

As the benchmark, we bet on the favorite (the runner with the highest implied probability) in every race.

    def compute_probs_from_odds(odds):
        # raw implied probability of each runner: 1 / (odds + 1)
        dirty_probs = 1.0 / (odds + 1)
        # rescale so that the probabilities passed in together sum to 1
        clean_probs = dirty_probs / dirty_probs.sum()
        return clean_probs

In [21]:
# Show the implied probability next to the final tote odds it is derived from.
df_result.loc[:,['prob_final_tote_odds',"final_tote_odds"]].head()

Unnamed: 0,prob_final_tote_odds,final_tote_odds
0,0.343878,1.4
1,0.317426,1.6
2,0.086874,8.5
3,0.036844,21.4
4,0.12697,5.5


In [17]:
# Benchmark: rank runners by the implied probability from the final tote odds
# (higher is better, hence ascending=False) and stake 1.0 on the top-ranked one.
attr_bench = 'prob_final_tote_odds'
df_result_payout = compute_simple_payout(df_result, attr_model=attr_bench, ascending=False, bet_amount=1.0)

## Advantage
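A rough measure of wins versus losses: `advantage = pct_win - pct_loss / mean_odds`, where `mean_odds` is the mean final tote odds of the winning bets. A negative value means the strategy loses value on average (depreciation); a positive value means it gains value (appreciation).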

In [22]:
# Advantage of the benchmark bets; compute_advantage also prints the value.
advantage_final_odds = compute_advantage(df_result_payout)

-0.17148271984334124


## Probability derived from any factor via integral

In [40]:
import scipy.spatial.distance as ssd
from scipy.stats import norm
import math
import itertools
import numpy as np
import pandas as pd

from math import ceil

In [41]:
class ScoreToProbViaIntegral(object):
    """Turn per-runner scores into win probabilities by numerical integration.

    Scores are standardised (zero mean, unit standard deviation) and each
    runner's outcome is modelled as a normal distribution with standard
    deviation 1 centred on its standardised score. A runner's win probability
    is the discretised integral of its pdf multiplied by the cdfs of all the
    other runners, renormalised so the probabilities sum to 1.
    """

    def __init__(self, func, scoreLabel):
        """
        :param func: callable applied to the argument of __call__ to obtain the scores
        :param scoreLabel: name of the score; only used by __str__
        """
        self.func = func
        self.scoreLabel = scoreLabel

    def __call__(self, df, addIndex=False):
        """
        Compute win probabilities for the runners described by ``df``.

        :param df: input handed to ``func``; the result must be convertible to a
                   numeric pandas Series with one entry per runner
        :param addIndex: if True, return the probabilities sorted from highest to
                         lowest with a (rank label, runner) MultiIndex, where the
                         rank labels are "A", "B", "C", ...
        :return: Series of probabilities indexed like the scores (or ranked, see
                 addIndex); None if the scores cannot be converted to numbers.
                 A NaN score makes every probability NaN, because each runner's
                 integral multiplies in the cdf of every other runner.
        """
        scores = self.func(df)

        try:
            scores = pd.Series(scores).astype(float)
        except Exception:
            # best effort, as before: report and give up instead of raising
            print("no scores")
            return None

        if scores.empty:
            return scores

        if scores.notna().all() and scores.nunique() == 1:
            # A single runner or identical scores: the standard deviation is
            # NaN or 0, so standardising would yield NaN. By symmetry every
            # runner is equally likely to win.
            probs = pd.Series(1.0 / len(scores), index=scores.index)
        else:
            zscores = (scores - scores.mean()) / scores.std()
            pdf, cdf = self.probDists(zscores)
            # one row per runner, one column per grid point
            pdfFrame = pd.DataFrame(pdf).T
            cdfFrame = pd.DataFrame(cdf).T
            probs = pd.Series(
                [self.marginrunner(cdfFrame, pdfFrame, runner) for runner in zscores.index],
                index=zscores.index,
            )
            probs = probs / probs.sum()

        if addIndex:
            # Series.order() no longer exists in pandas; sort_values replaces it.
            ranked = probs.sort_values(ascending=False)
            labels = self._rankLabels(len(ranked))
            ranked.index = pd.MultiIndex.from_tuples(list(zip(labels, ranked.index.values)))
            return ranked
        return probs

    @staticmethod
    def _rankLabels(count):
        """Return ``count`` labels "A".."Z", "AA", "AB", ... (spreadsheet style)."""
        labels = []
        for position in range(count):
            label = ""
            remainder = position
            while remainder >= 0:
                label = chr(ord("A") + remainder % 26) + label
                remainder = remainder // 26 - 1
            labels.append(label)
        return labels

    def marginrunner(self, cdf, pdf, runner):
        """
        Unnormalised win weight of ``runner``.

        Sums, over the grid points, the runner's pdf multiplied by the cdf of
        every other runner. The sum is not scaled by the grid step; __call__
        renormalises the weights, so only their relative size matters.

        :param cdf: per-runner cdf values, indexable with ``.loc[runner]``
        :param pdf: per-runner pdf values, indexable with ``.loc[runner]``
        :param runner: label of the runner whose weight is computed
        :return: the summed product (NaN if any value involved is NaN)
        """
        product = pdf.loc[runner]
        for rival in cdf.drop(runner).index:
            product = product * cdf.loc[rival]
        # plain numpy sum so that NaN propagates like the builtin sum() did
        return np.asarray(product, dtype=float).sum()

    def probDists(self, scores, incr=.25, width=8.0):
        """
        Evaluate a normal pdf and cdf (standard deviation 1) centred on each score.

        :param scores: Series (or mapping) of runner label -> score
        :param incr: spacing of the evaluation grid
        :param width: the grid covers [-width, width) in steps of incr
        :return: tuple (pdfs, cdfs); each is a dict mapping the runner label to a
                 Series of values indexed by the grid points
        """
        grid = np.arange(-width, width, incr)
        pdfslice = {}
        probintegral = {}
        for runner, center in scores.items():
            pdfslice[runner] = pd.Series(norm.pdf(grid, loc=center, scale=1), index=grid)
            probintegral[runner] = pd.Series(norm.cdf(grid, loc=center, scale=1), index=grid)
        return (pdfslice, probintegral)

    def __str__(self):
        return "ScoreToProbViaIntegral({!r})".format(self.scoreLabel)

In [42]:
def Score(series):
    """Identity score extractor: return ``series`` unchanged."""
    return series

## Take HDWPSRRating as an example

In [29]:
# Read the factor table from a CSV file given by a path relative to the working directory.
dfX_hist = pd.read_csv('df_factors_PILOT.csv')

In [33]:
# Derive the win payout column that compute_simple_payout reads: final odds + 1
# (presumably the return on a unit stake including the stake itself - confirm).
dfX_hist['payout_win']=dfX_hist['final_tote_odds']+1

In [43]:
#if you want to analyze other factors, you just need to change 'ScoreLabel'
ScoreLabel = 'HDWPSRRating'
A = ScoreToProbViaIntegral(Score, ScoreLabel)
dfX_hist['prob_'+ScoreLabel] = dfX_hist.groupby('race_id')[ScoreLabel].transform(lambda x:A(x))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [44]:
# Inspect the first rows, including the new prob_HDWPSRRating and payout_win columns.
dfX_hist.head()

Unnamed: 0,race_id,track_id,date,race_number,distance,approx_dist,surface,race_type,sex_restriction,age_restriction,...,x8is_longshot,x8is_win_longshot,x8is_exacta_longshot,x8is_trifecta_lonsghot,x8is_superfecta_longshot,x8_outperform,pct_of_purse_earnings,runner_program_number_pp,prob_HDWPSRRating,payout_win
0,APX_20170703_1,APX,2017-07-03,1,1540.0,False,A,M,N,B,...,0,0.0,0.0,0.0,0.0,-2.0,0.015,7,0.158522,6.5
1,APX_20170703_1,APX,2017-07-03,1,1540.0,False,A,M,N,B,...,1,0.0,0.0,1.0,1.0,1.0,0.1,4,0.034693,9.5
2,APX_20170703_1,APX,2017-07-03,1,1540.0,False,A,M,N,B,...,1,0.0,0.0,0.0,0.0,1.0,0.0,5,0.008367,28.0
3,APX_20170703_1,APX,2017-07-03,1,1540.0,False,A,M,N,B,...,0,0.0,0.0,0.0,0.0,0.0,0.6,2,0.25466,2.4
4,APX_20170703_1,APX,2017-07-03,1,1540.0,False,A,M,N,B,...,0,0.0,0.0,0.0,0.0,0.0,0.25,6,0.447077,2.6


In [46]:
# Same payout/advantage computation as for the tote-odds benchmark, now ranking
# runners by the factor-derived probability.
# Note: attr_bench and df_result_payout are reassigned here, replacing the
# benchmark values set in the earlier cell.
attr_bench = 'prob_HDWPSRRating'
df_result_payout = compute_simple_payout(dfX_hist, attr_model=attr_bench, ascending=False, bet_amount=1.0)
advantage_HDWPSRRating = compute_advantage(df_result_payout)

-0.1377475210411216


underperformance_weighted = (rank_prob_final_tote_odds - official_finish_position) * prob_final_tote_odds

We want to find the relation between underperformance_weighted and other factors.
In this notebook, we use only two inputs to predict the underperformance: the difference between the morning_line prob and prob_final_tote_odds, and the number of starters. Other factors can be substituted later.

In the notebook, we also create another factor, sum(abs(difference between morning_line prob and prob_final_tote_odds)/(number of starters)), as a measure of how much the probabilities differ across the whole race. The analysis therefore uses three factors: the difference of probs for the favorite runner in each race, the difference of probs for the race as a whole, and the number of starters in the race.

In [48]:
# 'diff_logprob_final_tote_morning_line' is treated like the log-return of a single runner.
# To get a race-level total, each runner's value is divided by the number of starters
# and its absolute value is stored in 'diff_abs_logprob_final_tote_morning_line';
# the sum of these within a race is stored in 'diff_sum_logprob_final_tote_morning_line'
# (the same value is repeated on every runner of the race).
df_result['diff_abs_logprob_final_tote_morning_line'] = abs(df_result['diff_logprob_final_tote_morning_line']/df_result['num_starters'])
df_result['diff_sum_logprob_final_tote_morning_line'] = df_result.groupby('race_id')['diff_abs_logprob_final_tote_morning_line'].transform(lambda x:sum(x))
df_result.head()

Unnamed: 0,chart_file_sym,date,race_number,breed_code,distance,is_about_distance,surface_code,is_off_turf,course_type_code,race_type_code,...,cost_exacta_from_win_show,cost_trifecta_from_place_wc,cost_superfecta_from_show_a1,cost_synth_place_tri,log_ratio_effectivestarters_morningline,max_prob_morning_line_odds,max_prob_final_tote_odds,underperformance_weighted,diff_abs_logprob_final_tote_morning_line,diff_sum_logprob_final_tote_morning_line
0,AP,2017-07-03,1,TB,7.0,0,P,0,M,MCL,...,6,60,360,60,0.164764,0.364803,0.343878,0.0,0.057891,0.356776
1,AP,2017-07-03,1,TB,7.0,0,P,0,M,MCL,...,6,60,360,60,0.164764,0.364803,0.343878,0.0,0.019873,0.356776
2,AP,2017-07-03,1,TB,7.0,0,P,0,M,MCL,...,6,60,360,60,0.164764,0.364803,0.343878,0.086874,0.078462,0.356776
3,AP,2017-07-03,1,TB,7.0,0,P,0,M,MCL,...,6,60,360,60,0.164764,0.364803,0.343878,0.073688,0.044076,0.356776
4,AP,2017-07-03,1,TB,7.0,0,P,0,M,MCL,...,6,60,360,60,0.164764,0.364803,0.343878,-0.253941,0.019873,0.356776
