In [None]:
# this notebook is to get probability weighted return (prob_HDWPSRRating)

In [99]:
import scipy.spatial.distance as ssd
from scipy.stats import norm
import math
import itertools
import numpy as np
import pandas as pd

from math import ceil

In [100]:

def generate_combinations_factors(balls, num_balls):
    '''Generate combinations from N codes and k selections '''
    return [list(p) for p in itertools.permutations(balls, num_balls)]


def generate_system_space(balls, numballs2buckets):
    '''Generates all possible systems meeting criteria'''
    balls_in = {}
    for bucket, nballs in numballs2buckets.items():
        balls_in[bucket] = generate_combinations_factors(balls, nballs)
    return balls_in


def compute_probs_from_odds(odds):
    #TODO: handle NaNs and zeros
    dirty_probs = 1.0 / (odds + 1)
    clean_probs = dirty_probs / dirty_probs.sum()
    return clean_probs


def dmetric_L1_weighted(a_vector,b_vector, weight, funcdist):
    return ssd.minkowski(a_vector, b_vector, 1)


def log_safe(x,b):
    if x is None or x <= 0:
        # print("Log of {}".format(x))
        return 0
    else:
        return math.log(x,b)


def kl(p, q):
    """
    Specifically, the Kullback–Leibler divergence from Q to P, denoted DKL(P‖Q), is
    a measure of the information gained when one revises one's beliefs from the
    prior probability distribution Q to the posterior probability distribution P. In
    other words, it is the amount of information lost when Q is used to approximate P.
    Parameters
    ----------
    p, q : array-like, dtype=float, shape=n
    Discrete probability distributions.
    """
    p = np.asarray(p, dtype=np.float)
    q = np.asarray(q, dtype=np.float)

    return np.sum(np.where(p != 0, p * np.log(p / q), 0))


def week_of_month(dt):
    """
    Returns int of the week of the month for the specified date. Will always be 1-5
    """

    first_day = dt.replace(day=1)

    dom = dt.day
    adjusted_dom = dom + first_day.weekday()

    return int(ceil(adjusted_dom/7.0))


def get_freq_wom(target_date):
    # Here we determine what dates to use for simulation

    target_datetime = pd.to_datetime(target_date)  # use pandas datetime for datetime functions
    target_weekday_name = pd.to_datetime(target_datetime).weekday_name  # 7:Sunday, 6:Saturday
    target_weekday_prefix = target_weekday_name[0:3].upper()
    target_wom = week_of_month(target_date)

    # 'WOM-1SUN is first sunday of month
    freq_wom = 'WOM-' + str(target_wom) + target_weekday_prefix

    return freq_wom


def mean_best_N_of_K(row, n, k):
    # e.g.
    # df[['HDWSpeedRating_0', 'HDWSpeedRating_1', 'HDWSpeedRating_2']].apply(lambda row: mean_best_N_of_K(row, n=2, k=3), axis=1)
    return row[0:k].nlargest(n).mean()


class ScoreToProbViaIntegral(object):
    def __init__(self, func, scoreLabel):
        self.func = func
        self.scoreLabel = scoreLabel

    def __call__(self, df, addIndex=False):
        scores = self.func(df)

        try:
            scores = pd.Series(scores)
            clean_scores = scores[scores > 0]
            clean_median = np.median(clean_scores)
            mean_score = scores[scores > 0].mean()
        except:
            print("no scores")
            return None

        try:
            scores = (scores - scores.mean()) / scores.std()
        except:
            print("could not compute normalized score")
            return None

        pdf, cdf = self.probDists(scores)
        pdfSeries = pd.Series(pdf).transpose()
        cdfSeries = pd.Series(cdf).transpose()
        probw = {}

        for winner in pdfSeries.index:
            probw[winner] = self.marginrunner(cdfSeries, pdfSeries, winner)
        probs = pd.Series(probw)
        probs = probs / probs.sum()

        if addIndex:
            probs_order = probs.order(ascending=False)
            idxABC = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"]
            idxRunners = probs_order.index.values
            idxZipABC = pd.MultiIndex.from_tuples(zip(idxABC, idxRunners))
            probs_order.index = idxZipABC
        return(probs)

    def marginrunner(self, cdf, pdf, runner):
        '''Computes the win probs from for each horse from cdf and pdf'''
        '''std : standard deviation of score'''
        '''incr: discretization for solving integral'''
        cdfdrop = cdf.drop(runner)
        pdfmult = pdf.ix[runner,]
        # print(("Starting {}:\n{}".format(runner, sum(pdfmult)))
        for w in cdfdrop.index:
            pdfmult = pdfmult * cdfdrop.ix[w,]
            # print(("After {}:\n{}".format(runner, sum(pdfmult)))
        sumtest = sum(pdfmult)
        # print(("{} {}".format(runner, sumtest))
        return sumtest

    def probDists(self, scores, incr=.25, width=8.0):
        '''computes probabilities by assuming normal distribution of outcomes relative to score'''
        range = np.arange(-width, width, incr)
        probintegral = {}
        pdfslice = {}
        for s in scores.index:
            cdfdict = {}
            pdfdict = {}
            dist = norm(scores[s], 1)
            for r in range:
                cdfdict[r] = dist.cdf(r)
                pdfdict[r] = dist.pdf(r)
            cdfseries = pd.Series(cdfdict)
            probintegral[s] = cdfseries
            pdfseries = pd.Series(pdfdict)
            pdfslice[s] = pdfseries
        return (pdfslice, probintegral)

    def __str__(self):
        return "ScoreToProbViaIntegral({!r})".format(self.scoreLabel)

In [101]:
def Score(df):
    return df

In [102]:
#import dataset and add one column 'payout_win'='final_tote_odds'+1
dfX_hist = pd.read_csv('I:/YaoTony/saleem tasks/df_factors_PILOT.csv')
dfX_hist['payout_win']=dfX_hist['final_tote_odds']+1

In [103]:
#if you want to analyze other factors, you just need to change 'ScoreLabel'
ScoreLabel = 'HDWPSRRating'
A = ScoreToProbViaIntegral(Score, ScoreLabel)
dfX_hist['prob_'+ScoreLabel] = dfX_hist.groupby('race_id')[ScoreLabel].transform(lambda x:A(x))


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)


In [104]:
dfX_hist.iloc[223:228,:]
#we can find races with only one runners has none prob_scorelabel. Also in some other cases we have none prob_scorelabel.
#We want to delete these rows in following analysis.

Unnamed: 0.1,Unnamed: 0,index,race_id,track_id,date,race_number,distance,approx_dist,surface,race_type,...,x8is_longshot,x8is_win_longshot,x8is_exacta_longshot,x8is_trifecta_lonsghot,x8is_superfecta_longshot,x8_outperform,pct_of_purse_earnings,runner_program_number_pp,payout_win,prob_HDWPSRRating
223,223,230,CBY_20170703_4,CBY,2017-07-03,4,1870.0,False,T,N,...,1,0.0,0.0,0.0,0.0,0.0,0.015,4,17.6,0.011317
224,224,231,CBY_20170703_4,CBY,2017-07-03,4,1870.0,False,T,N,...,0,0.0,0.0,0.0,0.0,0.0,0.6,3,2.8,0.111844
225,225,240,CBY_20170703_5,CBY,2017-07-03,5,1100.0,False,D,S,...,1,0.0,0.0,0.0,1.0,3.0,0.035,7,18.1,
226,226,242,CBY_20170703_6,CBY,2017-07-03,6,1760.0,False,D,C,...,1,0.0,0.0,0.0,0.0,0.0,0.0,8,11.2,0.030751
227,227,243,CBY_20170703_6,CBY,2017-07-03,6,1760.0,False,D,C,...,0,0.0,0.0,0.0,0.0,-3.0,0.035,4,3.3,0.311585


In [105]:
#delete rows with na in prob or x8is_win
dfX_hist_noNA = dfX_hist[np.isfinite(dfX_hist['prob_'+ScoreLabel])]
dfX_hist_noNA = dfX_hist_noNA[np.isfinite(dfX_hist_noNA['x8is_win'])]
#a = dfX_hist_noNA['x8is_win'].isna()
#b= [i for i, x in enumerate(a) if x]

In [106]:
#normalize probability after deletinng rows
def normprob(prob):
    probnew = prob/sum(prob)
    return probnew
dfX_hist_noNA['prob_'+ScoreLabel+'_norm'] = dfX_hist_noNA.groupby('race_id')['prob_'+ScoreLabel].transform(lambda x:normprob(x))


In [107]:
#expected prob_scorelabel to win of winners
sum(dfX_hist_noNA['prob_'+ScoreLabel+'_norm']*dfX_hist_noNA['x8is_win'])/len(dfX_hist_noNA['race_id'].unique())


0.27546613016362187

In [108]:
#fill 0 in missing values in payout_win
a = dfX_hist_noNA['payout_win'].isna()
b= [i for i, x in enumerate(a) if x]
dfX_hist_noNA['payout_win'].iloc[b]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [109]:
sum(dfX_hist_noNA['prob_'+ScoreLabel+'_norm']*dfX_hist_noNA['x8is_win']*dfX_hist_noNA['payout_win'])

825.2212637050291

In [110]:
#using load_benchmark functions to get advantage
def compute_simple_payout(df, attr_model, ascending=False, bet_amount=1.0):
    """
    Add columns for quick calculation of Win bets % payout
    :param df: Dataframe from dataset for multiple races
    :param attr_model: (string)an attribute / prob or score in the dataframe that can be ranked
    :param ascending: False if best is higher score i.e. probs
    :param bet_amount: default to 1.0
    :return: Dataframe with columns added

    """
    df['is_win'] = df['official_finish_position'].map(lambda x:int(x==1))
    df['rank_' + attr_model] = df.groupby('race_id')[attr_model].transform(lambda x:x.rank(ascending=False))
    df['bet_amount'] = df['rank_' + attr_model].map(lambda x:int(x<1.5)*bet_amount)
    df['is_wager'] = df['bet_amount'].map(lambda x:int(x>0))
    df['is_paid'] = df['is_wager'] * df['is_win']
    df['payout'] = df['is_win']*df['bet_amount']*df['payout_win'].fillna(0.0)

    return df

def compute_advantage(df):

    pct_win = df.groupby('race_id')['is_paid'].sum().value_counts(normalize=True)[1]

    pct_loss = 1.0 - pct_win
    mean_odds = df[df.is_paid>0]['final_tote_odds'].mean()
    advantage = pct_win - pct_loss / mean_odds
    print(advantage)
    return advantage

In [111]:
#calculate advantage as previously did
factor = 'prob_'+ScoreLabel
dfX_hist_noNA['rank_' + factor] = dfX_hist_noNA.groupby("race_id")[factor].rank("min", ascending=False)
df = dfX_hist_noNA
attr_bench_final = factor
df_1 = compute_simple_payout(df, attr_model=attr_bench_final, ascending=False, bet_amount=1.0)
advantage = compute_advantage(df_1)

-0.0960620973913


In [112]:
#using adjusted load_benchmark functions to get advantage
def compute_simple_payout_new(df, attr_model, ascending=False, bet_amount_label=False,bet_amount=1.0):
    """
    Add columns for quick calculation of Win bets % payout
    :param df: Dataframe from dataset for multiple races
    :param attr_model: (string)an attribute / prob or score in the dataframe that can be ranked
    :param ascending: False if best is higher score i.e. probs
    :param bet_amount: default to 1.0
    :return: Dataframe with columns added

    """
    df['is_win'] = df['official_finish_position'].map(lambda x:int(x==1))
    df['rank_' + attr_model] = df.groupby('race_id')[attr_model].transform(lambda x:x.rank(ascending=False))
    if bet_amount_label == False:
        df['bet_amount'] = df['rank_' + attr_model].map(lambda x:int(x<1.5)*bet_amount)
        
    else:
        df['bet_amount'] = df['rank_' + attr_model].map(lambda x:int(x<1.5)*bet_amount)*df['prob_'+bet_amount_label+'_norm']
    df['is_wager'] = df['bet_amount'].map(lambda x:int(x>0))
    df['is_paid'] = df['is_wager'] * df['is_win']
    df['payout'] = df['is_win']*df['bet_amount']*df['payout_win'].fillna(0.0)
    

    return df

def compute_Return(df):
    Return = (sum(df[df.is_paid>0]['payout'])-sum(df['bet_amount']))/sum(df['bet_amount'])
    print(Return)
    return Return

In [114]:
#calculate advantage new
factor = 'prob_'+ScoreLabel
dfX_hist_noNA['rank_' + factor] = dfX_hist_noNA.groupby("race_id")[factor].rank("min", ascending=False)
df = dfX_hist_noNA
attr_bench_final = factor
#bet $1 on every favourate
df_old = compute_simple_payout_new(df, attr_model=attr_bench_final, ascending=False, bet_amount_label = False, bet_amount=1.0)
return_old = compute_Return(df_old)
#bet amount equal to probability to win on every favorate
df_new = compute_simple_payout_new(df, attr_model=attr_bench_final, ascending=False, bet_amount_label = ScoreLabel, bet_amount=1.0)
return_new = compute_Return(df_new)

-0.09422028353326012
-0.07923271233849702
