In [None]:
import pandas as pd
import numpy as np
import re
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Column dtypes enforced while reading the race CSV.
# NOTE(review): plain `int` columns cannot hold missing values - confirm the
# file has no blanks in these columns.
types_dict = {
    'race_id': int,
    'race': int,
    'date': str,
    'place': int,
    'horse_id': str,
    'horse_no': str,
    'horse': str,
    'jockey': str,
    'trainer': str,
    'actual_weight': float,
    'declared_horse_weight': float,
    'draw': int,
    'lbw': float,
    'running_position': str,
    'win_odds': float,
    'class': str,
    'going': str,
    'track': str,
    'prize': int,
    'location': str,
    'distance_m': int,
    'finish_time': str,
    'finish_time_s': float,
}

# Columns handed to read_csv for datetime parsing.
# NOTE(review): both names are also declared as `str` in types_dict - confirm
# the resulting dtypes with raw.info().
parse_dates = ['date', 'finish_time']

raw = pd.read_csv(
    '../data/race_2016_to_2020.csv',
    dtype=types_dict,
    parse_dates=parse_dates,
)

In [None]:
# Display the freshly loaded frame for a quick visual check.
raw

In [None]:
# List the column labels of the loaded frame.
raw.columns

In [None]:
# Print the per-column dtype / non-null summary of the loaded frame.
raw.info()

In [None]:
def tran_running_position_to_list(r_pos):
    """Parse a running-position string into a list of integers.

    Parameters
    ----------
    r_pos : str
        Text holding the positions, such as ``"[1 2 3]"``. Every run of
        digits is extracted, so brackets and any separator are ignored.
        Non-string values (e.g. NaN for a missing cell, or None) are treated
        as "no positions recorded".

    Returns
    -------
    list of int
        The positions in the order they appear; empty when none are found.
    """
    # A missing CSV cell arrives as float NaN even when the column dtype is
    # str, so guard before applying string/regex operations.
    if not isinstance(r_pos, str):
        return []
    # Raw string avoids the invalid-escape warning that '\d' triggers.
    return [int(token) for token in re.findall(r'\d+', r_pos)]

def get_season(race_date):
    """Return the season label 'YYYY/YYYY' that contains `race_date`.

    A season label starts in September: months 1-8 belong to the season that
    began in the previous calendar year, months 9-12 to the one beginning in
    the same year. `race_date` only needs `.year` and `.month` attributes.
    """
    start_year = race_date.year - 1 if race_date.month < 9 else race_date.year
    return str(start_year) + '/' + str(start_year + 1)

In [None]:
# Further data cleaning: derive per-horse "previous race" features.
df = raw.copy()

# Parse the running-position text into lists of ints before it is shifted.
df['running_position'] = df['running_position'].apply(tran_running_position_to_list)

# The "previous race" features below rely on diff()/shift() within each horse,
# which is only meaningful in chronological order. Compute them on a stable
# date-sorted view; assigning back to `df` aligns on the index, so the row
# order of `df` itself is left unchanged.
chronological = df.sort_values('date', kind='mergesort')
by_horse = chronological.groupby('horse_id')

# rest_day = days between this race and the horse's previous race.
# A horse's first record has no previous race (NaN) and is filled with 45 days.
df['rest_day'] = by_horse['date'].diff().dt.days.fillna(45)

# Running position of the horse's previous race; the current race's running
# position leaks the current result, so it is dropped below.
# The first record of each horse has no previous race and gets [0].
df['last_running_position'] = (
    by_horse['running_position']
    .shift()
    .apply(lambda x: x if isinstance(x, list) else [0])
)

# Finishing place of the horse's previous race (0. when there is none).
df['last_place'] = by_horse['place'].shift().fillna(0.)

# Season label for each record.
df['season'] = df['date'].apply(get_season)

# Drop unwanted columns: running_position (leakage) and finish_time
# (replaced by finish_time_s).
df = df.drop(columns=['running_position', 'finish_time'])
df.columns

In [None]:
# Keep only the columns used downstream, in a fixed order, and show the result.
df = df.loc[:, [
    'season', 'race_id', 'race', 'date',
    'place', 'last_place',
    'horse_id', 'horse_no', 'horse',
    'jockey', 'trainer',
    'actual_weight', 'declared_horse_weight', 'draw',
    'lbw', 'last_running_position', 'win_odds',
    'class', 'going', 'track',
    'prize', 'location', 'distance_m',
    'finish_time_s', 'rest_day',
]]
df

In [None]:
# Build df_jockey: one row per (jockey, season) holding the jockey's
# 1st-4th placing percentages from the previous season they appear in.

def cal_pct_1st(x):
    """Percentage of entries in `x` equal to 1."""
    return (x==1).sum()/len(x)*100
def cal_pct_2nd(x):
    """Percentage of entries in `x` equal to 2."""
    return (x==2).sum()/len(x)*100
def cal_pct_3rd(x):
    """Percentage of entries in `x` equal to 3."""
    return (x==3).sum()/len(x)*100
def cal_pct_4th(x):
    """Percentage of entries in `x` equal to 4."""
    return (x==4).sum()/len(x)*100

# Percentage of 1st/2nd/3rd/4th placings per (season, jockey), 2 decimals.
jockey_places = df.groupby(['season', 'jockey'])['place']
jockey_rates = pd.DataFrame({
    'pct_1st': jockey_places.agg(cal_pct_1st),
    'pct_2nd': jockey_places.agg(cal_pct_2nd),
    'pct_3rd': jockey_places.agg(cal_pct_3rd),
    'pct_4th': jockey_places.agg(cal_pct_4th),
}).round(decimals=2).reset_index()

# Order by jockey, then season, so that shift() within a jockey looks back
# exactly one recorded season ('YYYY/YYYY' labels sort chronologically).
jockey_rates = jockey_rates.sort_values(['jockey', 'season']).reset_index(drop=True)

# Previous-season rates. Each column is shifted and filled independently;
# a jockey's first recorded season has no history and is filled with 0.
pct_cols = ['pct_1st', 'pct_2nd', 'pct_3rd', 'pct_4th']
jockey_last_season = (
    jockey_rates.groupby('jockey')[pct_cols]
    .shift()
    .fillna(0.)
    .add_suffix('_last')
)

# Keep only the keys and the previous-season pct columns.
df_jockey = pd.concat([jockey_rates[['jockey', 'season']], jockey_last_season], axis=1)

df_jockey

In [None]:
# Build df_trainer: one row per (trainer, season) holding the trainer's
# 1st-4th placing percentages from the previous season they appear in.
# cal_pct_1st ... cal_pct_4th are the helpers defined in the jockey cell;
# they are reused here instead of being defined a second time.

# Percentage of 1st/2nd/3rd/4th placings per (season, trainer), 2 decimals.
trainer_places = df.groupby(['season', 'trainer'])['place']
trainer_rates = pd.DataFrame({
    'pct_1st': trainer_places.agg(cal_pct_1st),
    'pct_2nd': trainer_places.agg(cal_pct_2nd),
    'pct_3rd': trainer_places.agg(cal_pct_3rd),
    'pct_4th': trainer_places.agg(cal_pct_4th),
}).round(decimals=2).reset_index()

# Order by trainer, then season, so that shift() within a trainer looks back
# exactly one recorded season ('YYYY/YYYY' labels sort chronologically).
trainer_rates = trainer_rates.sort_values(['trainer', 'season']).reset_index(drop=True)

# Previous-season rates. Each column is shifted and filled independently;
# a trainer's first recorded season has no history and is filled with 0.
pct_cols = ['pct_1st', 'pct_2nd', 'pct_3rd', 'pct_4th']
trainer_last_season = (
    trainer_rates.groupby('trainer')[pct_cols]
    .shift()
    .fillna(0.)
    .add_suffix('_last')
)

# Keep only the keys and the previous-season pct columns.
df_trainer = pd.concat([trainer_rates[['trainer', 'season']], trainer_last_season], axis=1)

df_trainer