# Data Cleaning and Processing

In this notebook, we extract the useful variables from the data so that they can be used to build the prediction model in the next notebook.

In [1]:
import pickle
import requests
import pandas as pd
import numpy as np
import shelve
import re

First, we load the HKJC data from the previous notebook.

In [2]:
# Load the frame stored under the 'df_all' key of the 'HKJCData' shelf.
# flag='r' opens the shelf read-only: with the default flag ('c') a wrong path
# silently creates a new, empty shelf and the lookup then fails with KeyError.
# NOTE: shelve unpickles its values, so only open shelf files you trust.
with shelve.open('HKJCData', flag='r') as db:
    df_all = db['df_all']

In [21]:
# Number of rows loaded from the shelf.
df_all.shape[0]

7927

### Convert Date to datetime format

In [3]:
# Parse the compact YYYYMMDD values in 'Date' into pandas datetimes.
race_dates = pd.to_datetime(df_all['Date'], format='%Y%m%d')
df_all['Date'] = race_dates

### Convert finish time from text to seconds

In [4]:
def _finish_time_to_seconds(text):
    """Convert a dot-separated finish time string to seconds.

    The first three dot-separated integer fields are treated as minutes,
    seconds and hundredths of a second, e.g. "1.11.86" -> 71.86.
    """
    minutes, seconds, hundredths = list(map(int, text.split(".")))[:3]
    return minutes * 60 + seconds + hundredths / 100


# Keep only the rows whose Place is purely numeric text. The .copy() makes the
# filtered result an independent frame, so the column assignment below does not
# write into a view of the unfiltered data (SettingWithCopyWarning).
df_all = df_all[df_all.Place.apply(lambda x: x.isnumeric())].copy()

# Replace the textual finish time with its value in seconds (float).
df_all["FinishTime"] = [_finish_time_to_seconds(item) for item in df_all["FinishTime"]]

In [19]:
# Preview the converted finish times (seconds).
df_all['FinishTime'].head()

0     71.86
1     70.04
2    133.85
3     95.07
4    108.15
Name: FinishTime, dtype: float64

### Jockey statistics (percentage of each jockey's rides finishing 1st, 2nd and 3rd)

In [5]:
# Jockey Place statistics
table=df_all.groupby(["Jockey","Place"]).agg({"Place":"count"})
table2=table.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
Jockey=table2.unstack().fillna(0).reset_index()
Jockey.columns = Jockey.columns.get_level_values(0)
Jockey=Jockey.iloc[:,0:4]
#We only the the win and place percentage
Jockey.columns=['Jockey','J_1st','J_2nd','J_3rd']

#DF 2016 merge with Jockey basic stat
df_all = df_all.merge(Jockey, how = 'inner', on = ['Jockey'])


In [14]:
# Preview the first five rows after merging the jockey statistics.
df_all.head(n=5)

Unnamed: 0,Place,HorseNumber,HorseName,HorseCode,Jockey,Trainer,ActualWeight,DeclarWeight,RunningPosition,LBW,...,RaceIndex,J_1st,J_2nd,J_3rd,T_1st_x,T_2nd_x,T_3rd_x,T_1st_y,T_2nd_y,T_3rd_y
0,1,10,SEASONS KING,(T032),N Callan,D J Hall,125,1048,10,-,...,20160907_1,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
1,3,3,HAPPY AND HEALTHY,(T056),N Callan,D J Hall,132,1111,3,2,...,20160907_5,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
2,10,9,LUCKY BALL,(T130),N Callan,D J Hall,129,1045,6,7,...,20160907_2,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
3,6,2,SUPER TURBO,(V310),N Callan,D J Hall,130,1023,2,2-1/4,...,20160907_8,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
4,2,6,LUCKY BALL,(T130),N Callan,D J Hall,128,1050,3,N,...,20160907_1,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243


### Trainer statistics (percentage of each trainer's runners finishing 1st, 2nd and 3rd)

In [12]:
# Jockey Place statistics
table=df_all.groupby(["Trainer","Place"]).agg({"Place":"count"})
table2=table.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
Trainer=table2.unstack().fillna(0).reset_index()
Trainer.columns = Trainer.columns.get_level_values(0)
Trainer=Trainer.iloc[:,0:4]
#We only the the win and place percentage
Trainer.columns=['Trainer','T_1st','T_2nd','T_3rd']

#DF 2016 merge with Jockey basic stat
df_all = df_all.merge(Trainer, how = 'inner', on = ['Trainer'])

In [13]:
# Preview the first five rows after merging the trainer statistics.
df_all.head(n=5)

Unnamed: 0,Place,HorseNumber,HorseName,HorseCode,Jockey,Trainer,ActualWeight,DeclarWeight,RunningPosition,LBW,...,RaceIndex,J_1st,J_2nd,J_3rd,T_1st_x,T_2nd_x,T_3rd_x,T_1st_y,T_2nd_y,T_3rd_y
0,1,10,SEASONS KING,(T032),N Callan,D J Hall,125,1048,10,-,...,20160907_1,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
1,3,3,HAPPY AND HEALTHY,(T056),N Callan,D J Hall,132,1111,3,2,...,20160907_5,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
2,10,9,LUCKY BALL,(T130),N Callan,D J Hall,129,1045,6,7,...,20160907_2,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
3,6,2,SUPER TURBO,(V310),N Callan,D J Hall,130,1023,2,2-1/4,...,20160907_8,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243
4,2,6,LUCKY BALL,(T130),N Callan,D J Hall,128,1050,3,N,...,20160907_1,7.87037,7.87037,6.712963,9.797297,5.405405,5.743243,9.797297,5.405405,5.743243


### DaySince (days since the horse's previous race)

Split the dataset by `HorseCode` and, for each horse, calculate the number of days since its previous race.

In [15]:
# Build one frame per horse, ordered oldest race first, and compute the gap in
# days between consecutive races of that horse.
df_all_split = {}
for horse_code, horse_races in df_all.groupby('HorseCode'):
    # sort_values returns a new frame, so the result must be kept; ascending
    # order makes diff() yield "this race minus the previous race" (>= 0).
    horse_races = (
        horse_races
        .sort_values(by='Date', ascending=True, kind='mergesort')
        .reset_index(drop=True)
    )
    # .dt.days gives the whole-day count as a number; it replaces
    # astype('timedelta64[D]'), which pandas 2.x rejects.
    horse_races['DaySince'] = horse_races['Date'].diff().dt.days
    df_all_split[horse_code] = horse_races

df_all = pd.concat(list(df_all_split.values()), ignore_index=True)

# The first recorded race of each horse has no previous race; fill it with 0.
# Assign the result back instead of using inplace=True on an attribute-accessed
# column, which is chained assignment and may not update df_all.
df_all['DaySince'] = df_all['DaySince'].fillna(0)