# Data Dictionary:

race_id: str
>Identifier for each race. The first race of the season will be 1. The last race will be 819. 

race: str
>The race number of each racing day.

date: time
>Date of the race.

place: str
>Finishing position of the horse in the race.

horse_id: str
>Identifier for each horse

horse_no: str
>Number of the horse for that race

horse: str
>Name of the horse

jockey: str
>Name of the jockey

trainer: str
>Name of the trainer

actual_weight: float
>Weight that is added onto the horse.

declared_horse_weight: float
>Weight of the horse

draw: str
>The stall where the horse started from.

lbw: float
>Distance between the horse and the winner when the winner crosses the finish line.

running_position: array
>Position of the horse as it passes each checkpoint.

win_odds: float
>Ratio paid on a bet on this horse if the horse finishes in 1st place.

class: str
>The level/grade of the horse. If there is 'Group' in class, it is an international standard race.

going: str
>Condition of race track

track: str
>Field type of the race: turf or mud (all-weather track).

prize: int
>Prize for winner

location: str
>Location of the racecourse

distance_m: int
>Distance for this race

finish_time: time
>Time to finish the race

finish_time_s: float
>Time to finish the race in second

## Please use the following dictionary for read_csv
```
types_dict = {'race_id': int, 'race': int, 'date':str, 'place':int, 'horse_id':str, 'horse_no':str,
    'horse':str, 'jockey':str, 'trainer':str, 'actual_weight':float, 'declared_horse_weight':float,
    'draw':int, 'lbw':float, 'running_position': str,'win_odds':float, 'class':str, 'going':str,
    'track':str, 'prize':int, 'location':str, 'distance_m':int, 'finish_time':str, 'finish_time_s':float}
parse_dates = ['date', 'finish_time']
raw=pd.read_csv('../data/race_2016_to_2020.csv',dtype=types_dict,parse_dates=parse_dates)
```

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw race records from CSV and preview the first rows.
raw = pd.read_csv('../data/raw/all_season.csv')
raw.head()

Unnamed: 0,place,horse_no,horse,jockey,trainer,actual_weight,declared_horse_weight,draw,lbw,running_position,finish_time,win_odds,race,class,going,turf,prize,location,date
0,1,10.0,SEASONS KING(T032),N Callan,D J Hall,125,1048,10,-,12 ...,1:09.35,9.4,1(1),Class 5 - 1200M - (40-15),GOOD,"TURF - ""B"" Course","HK$ 630,000",Sha Tin,2016/09/03
1,2,2.0,HAPPY SOUND(V107),J Moreira,A Lee,129,1120,3,1-1/4,1 ...,1:09.56,1.8,1(1),Class 5 - 1200M - (40-15),GOOD,"TURF - ""B"" Course","HK$ 630,000",Sha Tin,2016/09/03
2,3,14.0,NATURAL FRIENDSHIP(S359),B Prebble,W Y So,119,1021,2,1-1/2,4 ...,1:09.59,16.0,1(1),Class 5 - 1200M - (40-15),GOOD,"TURF - ""B"" Course","HK$ 630,000",Sha Tin,2016/09/03
3,4,13.0,LUCKY PLACE(T004),O Doleuze,C W Chang,120,1054,14,3,14 ...,1:09.84,76.0,1(1),Class 5 - 1200M - (40-15),GOOD,"TURF - ""B"" Course","HK$ 630,000",Sha Tin,2016/09/03
4,5,9.0,NO LAUGHING MATTER(V032),N Rawiller,C S Shum,125,1086,7,3-1/2,9 ...,1:09.89,70.0,1(1),Class 5 - 1200M - (40-15),GOOD,"TURF - ""B"" Course","HK$ 630,000",Sha Tin,2016/09/03


In [3]:
# Dimensions of the raw table as (rows, columns).
raw.shape

(41590, 19)

In [4]:
# Column dtypes and non-null counts of the raw table.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41590 entries, 0 to 41589
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   place                  41590 non-null  object 
 1   horse_no               41126 non-null  float64
 2   horse                  41590 non-null  object 
 3   jockey                 41590 non-null  object 
 4   trainer                41590 non-null  object 
 5   actual_weight          41590 non-null  int64  
 6   declared_horse_weight  41590 non-null  object 
 7   draw                   41590 non-null  object 
 8   lbw                    41590 non-null  object 
 9   running_position       41590 non-null  object 
 10  finish_time            41590 non-null  object 
 11  win_odds               41590 non-null  object 
 12  race                   41590 non-null  object 
 13  class                  41590 non-null  object 
 14  going                  41590 non-null  object 
 15  tu

In [5]:
# Clean the 'place' column:
#  - exact duplicate rows are removed
#  - rows whose 'place' is one of the listed non-numeric status codes are removed
#  - remaining values are reduced to their first run of digits (e.g. values carrying a "DH" marker)
#  - the index is rebuilt afterwards
df_place = raw.drop_duplicates()
df_place = df_place[~df_place['place'].isin([
    'WV', 'WV-A',
    'WX', 'WX-A', 'WXNR',
    'PU', 'UR', 'DNF', 'TNP',
    'FE', 'DISQ',
])]
df_place = df_place.assign(
    place=df_place['place'].str.extract(r'(\d+)', expand=False)
).reset_index(drop=True)

# horse_no is loaded as float; store it as an integer-formatted string instead
df_place['horse_no'] = df_place['horse_no'].astype(int).astype(str)

In [6]:
# Split the combined "NAME(ID)" horse field into horse name and horse id,
# and cast both weight columns to float.
# All right-hand sides read from df_place, so 'horse_id' is extracted from the
# original combined text before 'horse' is overwritten with the bare name.
df_horse = df_place.assign(
    horse_id=df_place['horse'].str.extract(r'\((.+)\)', expand=False),
    horse=df_place['horse'].str.extract(r'^(.+)\(', expand=False),
    actual_weight=df_place['actual_weight'].astype(float),
    declared_horse_weight=df_place['declared_horse_weight'].astype(float),
)

In [7]:
# Prepare 'lbw' for conversion to float: work on a copy and
# rewrite the '-' placeholder as the string '0'.
df_lbw = df_horse.copy()
df_lbw['lbw'] = df_lbw['lbw'].mask(df_lbw['lbw'] == '-', '0')

def lbw_to_float(lbw_string):
    """Convert a textual 'lbw' margin into a number.

    Known abbreviations map to fixed values:
    'N' -> 0.33, 'SH' -> 0.1, 'HD' -> 0.25, 'NOSE' -> 0.02, 'ML' -> 150.

    Any other string is interpreted from the integers it contains:
      * one integer    -> that integer             ('3'     -> 3)
      * two integers   -> first / second           ('1/2'   -> 0.5)
      * three integers -> first + second / third   ('1-1/4' -> 1.25)

    Returns:
        The numeric margin, or the sentinel 999999 when the string contains
        no integers or more than three of them.
    """
    named_margins = {'N': 0.33, 'SH': 0.1, 'HD': 0.25, 'NOSE': 0.02, 'ML': 150}
    if lbw_string in named_margins:
        return named_margins[lbw_string]

    # raw string: '\d' in a normal string literal is an invalid escape sequence
    nums = [int(i) for i in re.findall(r'\d+', lbw_string)]
    if len(nums) == 1:
        return nums[0]
    if len(nums) == 2:
        return nums[0] / nums[1]
    if len(nums) == 3:
        return nums[0] + nums[1] / nums[2]
    # unparseable input is flagged with a sentinel rather than raising
    return 999999

# Convert every 'lbw' string to its numeric value, and split
# 'running_position' into a list of the digit groups it contains.
df_lbw = df_lbw.assign(
    lbw=df_lbw['lbw'].map(lbw_to_float),
    running_position=df_lbw['running_position'].str.findall(r'(\d+)'),
)

In [8]:
# Convert finish_time into seconds (float).
# Work on a copy so that df_lbw itself is left unchanged.
df_time=df_lbw.copy()
def get_sec(finish_time):
    """Convert a 'minutes:seconds' string such as '1:09.35' into total seconds (float)."""
    parts = list(map(float, finish_time.split(':')))
    minutes, seconds = parts[0], parts[1]
    return minutes * 60 + seconds
# finish_time_s: finish time in seconds, computed from the original text;
# finish_time:   parsed with the '%M:%S.%f' format into a datetime;
# win_odds:      cast to float.
df_time = df_time.assign(
    finish_time_s=df_time['finish_time'].map(get_sec),
    finish_time=pd.to_datetime(df_time['finish_time'], format='%M:%S.%f'),
    win_odds=df_time['win_odds'].astype(float),
)

In [9]:
# Split the combined 'race' text (e.g. "1(1)") in two:
# the digits before '(' stay in 'race', the digits after '(' go to 'race_id'.
df_class = df_time.assign(
    race_id=df_time['race'].str.extract(r'\((\d+)', expand=False),
    race=df_time['race'].str.extract(r'(\d+)\(', expand=False),
)

# helpers that pull the class label and the distance out of the original class text
def get_level(race_class):
    """Return the text before the first '-' in ``race_class``, without surrounding whitespace."""
    head, _, _ = race_class.partition('-')
    return head.strip()

def get_distance(race_class):
    """Return the race distance as an int.

    Takes the second '-'-separated field of ``race_class``, strips whitespace,
    drops its final character (the unit letter in values like '1200M') and
    converts what is left to int.
    """
    distance_field = race_class.split('-', 2)[1]
    return int(distance_field.strip()[:-1])

# distance_m must be derived first: it reads the original 'class' text,
# which the second statement replaces with the class label only.
df_class['distance_m'] = df_class['class'].map(get_distance)
df_class['class'] = df_class['class'].map(get_level)

In [10]:
# Reduce 'prize' to its digits (e.g. "HK$ 630,000" -> 630000) and
# parse 'date' into a datetime column.
df_prize = df_class.copy()
# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats the
# pattern as a literal string by default, so nothing would be stripped and the
# astype(int) below would fail on the remaining currency text.
df_prize['prize'] = df_prize['prize'].str.replace(r'[^0-9]', "", regex=True).astype(int)
df_prize['date'] = pd.to_datetime(df_prize['date'])

In [11]:
# Rename 'turf' to 'track' and put the columns into their final order.
df_eda = df_prize.rename(columns={'turf': 'track'})[[
    'race_id', 'race', 'date', 'place', 'horse_id', 'horse_no', 'horse', 'jockey', 'trainer',
    'actual_weight', 'declared_horse_weight', 'draw', 'lbw', 'running_position', 'win_odds',
    'class', 'going', 'track', 'prize', 'location', 'distance_m', 'finish_time', 'finish_time_s',
]]

In [12]:
# Display the cleaned table.
df_eda

Unnamed: 0,race_id,race,date,place,horse_id,horse_no,horse,jockey,trainer,actual_weight,...,running_position,win_odds,class,going,track,prize,location,distance_m,finish_time,finish_time_s
0,1,1,2016-09-03,1,T032,10,SEASONS KING,N Callan,D J Hall,125.0,...,"[12, 7, 1]",9.4,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.350,69.35
1,1,1,2016-09-03,2,V107,2,HAPPY SOUND,J Moreira,A Lee,129.0,...,"[1, 1, 2]",1.8,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.560,69.56
2,1,1,2016-09-03,3,S359,14,NATURAL FRIENDSHIP,B Prebble,W Y So,119.0,...,"[4, 4, 3]",16.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.590,69.59
3,1,1,2016-09-03,4,T004,13,LUCKY PLACE,O Doleuze,C W Chang,120.0,...,"[14, 13, 4]",76.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.840,69.84
4,1,1,2016-09-03,5,V032,9,NO LAUGHING MATTER,N Rawiller,C S Shum,125.0,...,"[9, 9, 5]",70.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.890,69.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40669,92,8,2020-10-07,8,B343,6,GENTLE BREEZE,K H Chan,P F Yiu,113.0,...,"[10, 8, 8]",109.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.410,57.41
40670,92,8,2020-10-07,9,C314,10,ALLIED AGILITY,A Badel,P F Yiu,118.0,...,"[4, 4, 9]",7.4,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.420,57.42
40671,92,8,2020-10-07,10,A293,12,SAUL'S SPECIAL,V Borges,C W Chang,118.0,...,"[12, 11, 10]",28.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.620,57.62
40672,92,8,2020-10-07,11,C124,5,BEAUTY AMIGO,M L Yeung,D A Hayes,121.0,...,"[9, 10, 11]",122.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.870,57.87


In [13]:
# Write the cleaned table to CSV without the index column.
df_eda.to_csv('../data/race_eda.csv',index=False)

In [14]:
# Column dtypes enforced when the cleaned CSV is read back.
types_dict = {
    'race_id': int,
    'race': int,
    'date': str,
    'place': int,
    'horse_id': str,
    'horse_no': str,
    'horse': str,
    'jockey': str,
    'trainer': str,
    'actual_weight': float,
    'declared_horse_weight': float,
    'draw': int,
    'lbw': float,
    'running_position': str,  # the list is stored as text in the CSV
    'win_odds': float,
    'class': str,
    'going': str,
    'track': str,
    'prize': int,
    'location': str,
    'distance_m': int,
    'finish_time': str,
    'finish_time_s': float,
}
# Columns that read_csv should parse into datetimes.
parse_dates = ['date', 'finish_time']

# NOTE: from here on `raw` refers to the cleaned data in race_eda.csv,
# no longer to the original all_season.csv contents.
raw = pd.read_csv('../data/race_eda.csv', dtype=types_dict, parse_dates=parse_dates)

In [15]:
# Column dtypes and non-null counts of the reloaded, cleaned table.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40674 entries, 0 to 40673
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   race_id                40674 non-null  int64         
 1   race                   40674 non-null  int64         
 2   date                   40674 non-null  datetime64[ns]
 3   place                  40674 non-null  int64         
 4   horse_id               40674 non-null  object        
 5   horse_no               40674 non-null  object        
 6   horse                  40674 non-null  object        
 7   jockey                 40674 non-null  object        
 8   trainer                40674 non-null  object        
 9   actual_weight          40674 non-null  float64       
 10  declared_horse_weight  40674 non-null  float64       
 11  draw                   40674 non-null  int64         
 12  lbw                    40674 non-null  float64       
 13  r

In [16]:
# Display the reloaded table; running_position is now the text form of a list.
raw

Unnamed: 0,race_id,race,date,place,horse_id,horse_no,horse,jockey,trainer,actual_weight,...,running_position,win_odds,class,going,track,prize,location,distance_m,finish_time,finish_time_s
0,1,1,2016-09-03,1,T032,10,SEASONS KING,N Callan,D J Hall,125.0,...,"['12', '7', '1']",9.4,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.350,69.35
1,1,1,2016-09-03,2,V107,2,HAPPY SOUND,J Moreira,A Lee,129.0,...,"['1', '1', '2']",1.8,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.560,69.56
2,1,1,2016-09-03,3,S359,14,NATURAL FRIENDSHIP,B Prebble,W Y So,119.0,...,"['4', '4', '3']",16.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.590,69.59
3,1,1,2016-09-03,4,T004,13,LUCKY PLACE,O Doleuze,C W Chang,120.0,...,"['14', '13', '4']",76.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.840,69.84
4,1,1,2016-09-03,5,V032,9,NO LAUGHING MATTER,N Rawiller,C S Shum,125.0,...,"['9', '9', '5']",70.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,1900-01-01 00:01:09.890,69.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40669,92,8,2020-10-07,8,B343,6,GENTLE BREEZE,K H Chan,P F Yiu,113.0,...,"['10', '8', '8']",109.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.410,57.41
40670,92,8,2020-10-07,9,C314,10,ALLIED AGILITY,A Badel,P F Yiu,118.0,...,"['4', '4', '9']",7.4,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.420,57.42
40671,92,8,2020-10-07,10,A293,12,SAUL'S SPECIAL,V Borges,C W Chang,118.0,...,"['12', '11', '10']",28.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.620,57.62
40672,92,8,2020-10-07,11,C124,5,BEAUTY AMIGO,M L Yeung,D A Hayes,121.0,...,"['9', '10', '11']",122.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,1900-01-01 00:00:57.870,57.87


In [17]:
def tran_running_position_to_list(r_pos):
    """Parse the text form of a running-position list into a list of ints.

    Example: "['12', '7', '1']" -> [12, 7, 1]. A string without digits gives [].

    Args:
        r_pos: text containing the positions as digit groups.

    Returns:
        list[int]: the digit groups of ``r_pos`` in order of appearance.
    """
    # Raw string avoids the invalid '\d' escape of a plain literal. Stripping the
    # surrounding brackets first is unnecessary because only digit runs are matched.
    return [int(s) for s in re.findall(r'\d+', r_pos)]

def get_season(race_date):
    """Return the season label 'YYYY/YYYY' for a race date.

    A season starts in September: months 9-12 belong to the season starting
    that year, months 1-8 to the season that started the year before.
    """
    start_year = race_date.year if race_date.month >= 9 else race_date.year - 1
    return str(start_year) + '/' + str(start_year + 1)

In [18]:
# Further cleaning / feature engineering on a copy of the reloaded data.
# The per-horse "previous race" features below use the row order within each
# horse_id group, so they assume the rows are already in chronological order.
df = raw.copy()

# season label of every record
df['season'] = df['date'].apply(get_season)

# rest_day: days between this race and the horse's previous row;
# a horse's first row has no previous race and is filled with 45 (summer holidays)
df['rest_day'] = df.groupby('horse_id')['date'].diff().dt.days.fillna(45)

# running_position describes the current race and would leak its result, so only
# the previous race's positions are carried forward; [0] marks "no previous race"
df['running_position'] = df['running_position'].apply(tran_running_position_to_list)
df['last_running_position'] = (
    df.groupby(['horse_id'])['running_position']
      .transform(lambda positions: positions.shift())
      .apply(lambda value: value if isinstance(value, list) else [0])
)

# place in the horse's previous race (0.0 when there is none)
df['last_place'] = df.groupby(['horse_id'])['place'].shift().fillna(0.)

# declared horse weight at the previous race (0.0 when there is none)
df['last_declared_horse_weight'] = (
    df.groupby(['horse_id'])['declared_horse_weight'].shift().fillna(0.)
)

df.columns

Index(['race_id', 'race', 'date', 'place', 'horse_id', 'horse_no', 'horse',
       'jockey', 'trainer', 'actual_weight', 'declared_horse_weight', 'draw',
       'lbw', 'running_position', 'win_odds', 'class', 'going', 'track',
       'prize', 'location', 'distance_m', 'finish_time', 'finish_time_s',
       'season', 'rest_day', 'last_running_position', 'last_place',
       'last_declared_horse_weight'],
      dtype='object')

In [19]:
# Columns kept for modelling; running_position and last_declared_horse_weight are left out.
# NOTE(review): 'lbw' and 'finish_time_s' look like outcomes of the current race as well;
# confirm they are not used as model inputs.
df_ml = df.loc[:, [
    'season', 'race_id', 'race', 'date', 'place', 'last_place',
    'horse_id', 'horse_no', 'horse', 'jockey', 'trainer',
    'actual_weight', 'declared_horse_weight', 'draw', 'lbw', 'last_running_position',
    'win_odds', 'class', 'going', 'track', 'prize', 'location',
    'distance_m', 'finish_time_s', 'rest_day',
]]
df_ml

Unnamed: 0,season,race_id,race,date,place,last_place,horse_id,horse_no,horse,jockey,...,last_running_position,win_odds,class,going,track,prize,location,distance_m,finish_time_s,rest_day
0,2016/2017,1,1,2016-09-03,1,0.0,T032,10,SEASONS KING,N Callan,...,[0],9.4,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,69.35,45.0
1,2016/2017,1,1,2016-09-03,2,0.0,V107,2,HAPPY SOUND,J Moreira,...,[0],1.8,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,69.56,45.0
2,2016/2017,1,1,2016-09-03,3,0.0,S359,14,NATURAL FRIENDSHIP,B Prebble,...,[0],16.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,69.59,45.0
3,2016/2017,1,1,2016-09-03,4,0.0,T004,13,LUCKY PLACE,O Doleuze,...,[0],76.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,69.84,45.0
4,2016/2017,1,1,2016-09-03,5,0.0,V032,9,NO LAUGHING MATTER,N Rawiller,...,[0],70.0,Class 5,GOOD,"TURF - ""B"" Course",630000,Sha Tin,1200,69.89,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40669,2020/2021,92,8,2020-10-07,8,12.0,B343,6,GENTLE BREEZE,K H Chan,...,"[8, 11, 12]",109.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,57.41,24.0
40670,2020/2021,92,8,2020-10-07,9,2.0,C314,10,ALLIED AGILITY,A Badel,...,"[1, 1, 2]",7.4,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,57.42,31.0
40671,2020/2021,92,8,2020-10-07,10,2.0,A293,12,SAUL'S SPECIAL,V Borges,...,"[5, 5, 2]",28.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,57.62,21.0
40672,2020/2021,92,8,2020-10-07,11,12.0,C124,5,BEAUTY AMIGO,M L Yeung,...,"[7, 8, 12]",122.0,Class 2,GOOD,"TURF - ""A"" Course",2200000,Happy Valley,1000,57.87,14.0


In [20]:
# Build df_jockey: one row per (jockey, season) holding the percentage of
# 1st/2nd/3rd/4th places the jockey achieved in their previous recorded season.

def cal_pct_1st(x):
    """Percentage (0-100) of the entries in ``x`` that equal 1."""
    return (x==1).sum()/len(x)*100
def cal_pct_2nd(x):
    """Percentage (0-100) of the entries in ``x`` that equal 2."""
    return (x==2).sum()/len(x)*100
def cal_pct_3rd(x):
    """Percentage (0-100) of the entries in ``x`` that equal 3."""
    return (x==3).sum()/len(x)*100
def cal_pct_4th(x):
    """Percentage (0-100) of the entries in ``x`` that equal 4."""
    return (x==4).sum()/len(x)*100

pct_cols = ['pct_1st', 'pct_2nd', 'pct_3rd', 'pct_4th']

# All four place rates per (season, jockey) in a single groupby pass,
# rounded to 2 decimals and ordered by jockey, then season.
jockey_rates = (
    df.groupby(['season', 'jockey'])['place']
      .agg(pct_1st=cal_pct_1st, pct_2nd=cal_pct_2nd, pct_3rd=cal_pct_3rd, pct_4th=cal_pct_4th)
      .round(decimals=2)
      .reset_index()
      .sort_values(['jockey', 'season'])
      .reset_index(drop=True)
)

# Shift the rates down by one row within each jockey so every season carries the
# figures of the jockey's previous row. NOTE: that is the previous season present
# in the data for this jockey, which is not the immediately preceding season if
# the jockey has no rides in it.
# The first row of each jockey has nothing to shift in and is filled with 0, so a
# 0 can mean either "no earlier season" or a genuine 0% rate.
last_season_rates = jockey_rates.groupby('jockey')[pct_cols].shift().fillna(0.)
last_season_rates.columns = [col + '_last_j' for col in pct_cols]

# Assembling a new frame (instead of assigning columns onto a column slice)
# avoids pandas' SettingWithCopy warning.
df_jockey = pd.concat([jockey_rates[['jockey', 'season']], last_season_rates], axis=1)

df_jockey

Unnamed: 0,jockey,season,pct_1st_last_j,pct_2nd_last_j,pct_3rd_last_j,pct_4th_last_j
0,A Atzeni,2017/2018,0.00,0.00,0.00,0.00
1,A Badel,2016/2017,0.00,0.00,0.00,0.00
2,A Badel,2017/2018,5.98,6.84,6.84,10.26
3,A Badel,2018/2019,6.83,7.80,9.27,7.80
4,A Badel,2019/2020,6.17,11.11,5.56,8.64
...,...,...,...,...,...,...
245,Z Purton,2016/2017,0.00,0.00,0.00,0.00
246,Z Purton,2017/2018,17.72,15.23,9.93,7.95
247,Z Purton,2018/2019,21.45,16.88,10.09,9.46
248,Z Purton,2019/2020,24.92,16.37,9.91,9.31


In [21]:
# Build df_trainer: one row per (trainer, season) holding the percentage of
# 1st/2nd/3rd/4th places the trainer achieved in their previous recorded season.
# Uses the cal_pct_1st .. cal_pct_4th helpers defined in the jockey cell.
pct_cols = ['pct_1st', 'pct_2nd', 'pct_3rd', 'pct_4th']

# All four place rates per (season, trainer) in a single groupby pass,
# rounded to 2 decimals and ordered by trainer, then season.
trainer_rates = (
    df.groupby(['season', 'trainer'])['place']
      .agg(pct_1st=cal_pct_1st, pct_2nd=cal_pct_2nd, pct_3rd=cal_pct_3rd, pct_4th=cal_pct_4th)
      .round(decimals=2)
      .reset_index()
      .sort_values(['trainer', 'season'])
      .reset_index(drop=True)
)

# Shift the rates down by one row within each trainer so every season carries the
# figures of the trainer's previous row. NOTE: that is the previous season present
# in the data for this trainer, which is not the immediately preceding season if
# the trainer has no runners in it.
# The first row of each trainer has nothing to shift in and is filled with 0, so a
# 0 can mean either "no earlier season" or a genuine 0% rate.
last_season_rates = trainer_rates.groupby('trainer')[pct_cols].shift().fillna(0.)
last_season_rates.columns = [col + '_last_t' for col in pct_cols]

# Assembling a new frame (instead of assigning columns onto a column slice)
# avoids pandas' SettingWithCopy warning.
df_trainer = pd.concat([trainer_rates[['trainer', 'season']], last_season_rates], axis=1)

df_trainer

Unnamed: 0,trainer,season,pct_1st_last_t,pct_2nd_last_t,pct_3rd_last_t,pct_4th_last_t
0,A Bull,2016/2017,0.0,0.0,0.0,0.0
1,A Couetil,2017/2018,0.0,0.0,0.0,0.0
2,A Fabre,2016/2017,0.0,0.0,0.0,0.0
3,A Fabre,2017/2018,0.0,0.0,0.0,0.0
4,A Fabre,2018/2019,0.0,100.0,0.0,0.0
...,...,...,...,...,...,...
217,Y Tomomichi,2018/2019,0.0,0.0,0.0,0.0
218,Y Tomomichi,2019/2020,0.0,100.0,0.0,0.0
219,Y Yahagi,2018/2019,0.0,0.0,0.0,0.0
220,de Royer Dupre,2016/2017,0.0,0.0,0.0,0.0


In [22]:
# Export the race, jockey and trainer dataframes to CSV files.
# No index=False is passed, so each file also contains the row index as its
# first (unnamed) column.
df_ml.to_csv('../data/race.csv')
df_jockey.to_csv('../data/jockey.csv')
df_trainer.to_csv('../data/trainer.csv')

In [23]:
# Attach the previous-season jockey and trainer rates to every race record.
# Left joins keep every row of df_ml.
df_r_j = df_ml.merge(df_jockey, how='left', on=['season', 'jockey'])
df_all = df_r_j.merge(df_trainer, how='left', on=['season', 'trainer'])
df_all

Unnamed: 0,season,race_id,race,date,place,last_place,horse_id,horse_no,horse,jockey,...,finish_time_s,rest_day,pct_1st_last_j,pct_2nd_last_j,pct_3rd_last_j,pct_4th_last_j,pct_1st_last_t,pct_2nd_last_t,pct_3rd_last_t,pct_4th_last_t
0,2016/2017,1,1,2016-09-03,1,0.0,T032,10,SEASONS KING,N Callan,...,69.35,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,2016/2017,1,1,2016-09-03,2,0.0,V107,2,HAPPY SOUND,J Moreira,...,69.56,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,2016/2017,1,1,2016-09-03,3,0.0,S359,14,NATURAL FRIENDSHIP,B Prebble,...,69.59,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,2016/2017,1,1,2016-09-03,4,0.0,T004,13,LUCKY PLACE,O Doleuze,...,69.84,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,2016/2017,1,1,2016-09-03,5,0.0,V032,9,NO LAUGHING MATTER,N Rawiller,...,69.89,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40669,2020/2021,92,8,2020-10-07,8,12.0,B343,6,GENTLE BREEZE,K H Chan,...,57.41,24.0,7.10,6.45,9.03,8.06,14.02,9.00,9.21,9.21
40670,2020/2021,92,8,2020-10-07,9,2.0,C314,10,ALLIED AGILITY,A Badel,...,57.42,31.0,10.28,9.09,8.30,9.09,14.02,9.00,9.21,9.21
40671,2020/2021,92,8,2020-10-07,10,2.0,A293,12,SAUL'S SPECIAL,V Borges,...,57.62,21.0,6.64,7.47,7.05,6.64,4.85,9.44,5.61,3.83
40672,2020/2021,92,8,2020-10-07,11,12.0,C124,5,BEAUTY AMIGO,M L Yeung,...,57.87,14.0,1.92,3.60,3.84,5.76,0.00,0.00,0.00,0.00


In [24]:
# List the columns of the combined table.
df_all.columns

Index(['season', 'race_id', 'race', 'date', 'place', 'last_place', 'horse_id',
       'horse_no', 'horse', 'jockey', 'trainer', 'actual_weight',
       'declared_horse_weight', 'draw', 'lbw', 'last_running_position',
       'win_odds', 'class', 'going', 'track', 'prize', 'location',
       'distance_m', 'finish_time_s', 'rest_day', 'pct_1st_last_j',
       'pct_2nd_last_j', 'pct_3rd_last_j', 'pct_4th_last_j', 'pct_1st_last_t',
       'pct_2nd_last_t', 'pct_3rd_last_t', 'pct_4th_last_t'],
      dtype='object')

In [25]:
# Show the first five records transposed, one column per record.
df_all.head().T

Unnamed: 0,0,1,2,3,4
season,2016/2017,2016/2017,2016/2017,2016/2017,2016/2017
race_id,1,1,1,1,1
race,1,1,1,1,1
date,2016-09-03 00:00:00,2016-09-03 00:00:00,2016-09-03 00:00:00,2016-09-03 00:00:00,2016-09-03 00:00:00
place,1,2,3,4,5
last_place,0,0,0,0,0
horse_id,T032,V107,S359,T004,V032
horse_no,10,2,14,13,9
horse,SEASONS KING,HAPPY SOUND,NATURAL FRIENDSHIP,LUCKY PLACE,NO LAUGHING MATTER
jockey,N Callan,J Moreira,B Prebble,O Doleuze,N Rawiller


In [26]:
# Time-based split: races before 2020-09-01 form the training set,
# races on or after that date form the test set.
before_cutoff = df_all['date'] < '2020-09-01'
from_cutoff = df_all['date'] >= '2020-09-01'
df_train = df_all.loc[before_cutoff]
df_test = df_all.loc[from_cutoff]

In [27]:
# Write the training set to CSV (row index included) and display it.
df_train.to_csv('../data/train/train.csv')
df_train

Unnamed: 0,season,race_id,race,date,place,last_place,horse_id,horse_no,horse,jockey,...,finish_time_s,rest_day,pct_1st_last_j,pct_2nd_last_j,pct_3rd_last_j,pct_4th_last_j,pct_1st_last_t,pct_2nd_last_t,pct_3rd_last_t,pct_4th_last_t
0,2016/2017,1,1,2016-09-03,1,0.0,T032,10,SEASONS KING,N Callan,...,69.35,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,2016/2017,1,1,2016-09-03,2,0.0,V107,2,HAPPY SOUND,J Moreira,...,69.56,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,2016/2017,1,1,2016-09-03,3,0.0,S359,14,NATURAL FRIENDSHIP,B Prebble,...,69.59,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,2016/2017,1,1,2016-09-03,4,0.0,T004,13,LUCKY PLACE,O Doleuze,...,69.84,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,2016/2017,1,1,2016-09-03,5,0.0,V032,9,NO LAUGHING MATTER,N Rawiller,...,69.89,45.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39580,2019/2020,828,9,2020-07-15,8,4.0,A177,6,ENCORE BOY,T H So,...,69.10,21.0,2.33,2.33,2.72,6.23,6.94,8.37,8.61,6.46
39581,2019/2020,828,9,2020-07-15,9,6.0,B285,12,RACING FIGHTER,M F Poon,...,69.24,21.0,6.79,4.59,9.18,9.98,7.55,4.86,7.37,8.63
39582,2019/2020,828,9,2020-07-15,10,1.0,A379,9,SUNNY BOY,V Borges,...,69.36,10.0,0.00,9.68,0.00,0.00,13.77,7.36,11.32,8.30
39583,2019/2020,828,9,2020-07-15,11,8.0,A293,8,SAUL'S SPECIAL,K H Chan,...,69.93,10.0,7.96,10.62,8.85,7.96,4.61,7.59,5.42,8.13


In [28]:
# Write the test set to CSV (row index included).
df_test.to_csv('../data/test/test.csv')