In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from geopy.distance import geodesic
import utm
import math
import seaborn as sns

%matplotlib inline

#### Bringing in all the datasets

In [2]:
races = pd.read_csv('../data/nyra_race_table.csv')
races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   track_id         2000 non-null   object
 1   race_date        2000 non-null   object
 2   race_number      2000 non-null   int64 
 3   distance_id      2000 non-null   int64 
 4   course_type      2000 non-null   object
 5   track_condition  2000 non-null   object
 6   run_up_distance  2000 non-null   int64 
 7   race_type        2000 non-null   object
 8   purse            2000 non-null   int64 
 9   post_time        2000 non-null   int64 
dtypes: int64(5), object(5)
memory usage: 156.4+ KB


In [3]:
starts = pd.read_csv('../data/nyra_start_table.csv')
starts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14915 entries, 0 to 14914
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   track_id        14915 non-null  object
 1   race_date       14915 non-null  object
 2   race_number     14915 non-null  int64 
 3   program_number  14915 non-null  object
 4   weight_carried  14915 non-null  int64 
 5   jockey          14915 non-null  object
 6   odds            14915 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 815.8+ KB


In [4]:
h_id = pd.read_csv('../data/horse_ids.csv')
h_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14916 entries, 0 to 14915
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       14916 non-null  int64 
 1   track_id         14916 non-null  object
 2   race_date        14916 non-null  object
 3   race             14916 non-null  int64 
 4   program_number   14916 non-null  object
 5   horse_id         14916 non-null  int64 
 6   finishing_place  14916 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 815.8+ KB


In [5]:
h_names = pd.read_csv('../data/horse_names.csv')
h_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4638 entries, 0 to 4637
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4638 non-null   int64 
 1   horse_id    4638 non-null   int64 
 2   horse_name  4638 non-null   object
dtypes: int64(2), object(1)
memory usage: 108.8+ KB


In [6]:
tracking = pd.read_csv('../data/nyra_tracking_table.csv')
tracking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5228430 entries, 0 to 5228429
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   track_id        object 
 1   race_date       object 
 2   race_number     int64  
 3   program_number  object 
 4   trakus_index    int64  
 5   latitude        float64
 6   longitude       float64
dtypes: float64(2), int64(2), object(3)
memory usage: 279.2+ MB


##### Merging all data sets

In [7]:
# merging Horse ID and Horse Name data sets
h_id = h_id[['track_id', 'race_date', 'race', 'program_number', 'horse_id', 'finishing_place']]
h_names = h_names[['horse_id', 'horse_name']]
horses = pd.merge(h_id, h_names, how='left', on='horse_id')
horses = horses.rename(columns = {'race' : 'race_number'})

# Adding in Starts data
starts['race_date'] = pd.to_datetime(starts['race_date'])
horses['race_date'] = pd.to_datetime(horses['race_date'])
# strip spaces in the starts program_number column
starts['program_number'] = starts['program_number'].str.strip()
# merge
st_h = pd.merge(starts, horses, how='outer', on=['track_id', 'race_date', 'race_number', 'program_number'])
st_h = st_h.dropna(subset='odds')

# Adding in race details
races['race_date'] = pd.to_datetime(races['race_date'])
ra_st_h = pd.merge(st_h, races, how='left', on=['track_id', 'race_date', 'race_number'])

# Adding in tracking data
tracking['program_number'] = tracking['program_number'].str.strip()
tracking['race_date'] = pd.to_datetime(tracking['race_date'])
full = pd.merge(ra_st_h, tracking, how='left', on=['track_id', 'race_date', 'race_number', 'program_number'])

#Because races that include jumps are so drastically different than flat races, and such a comparatively small proportion of races
#we decided to drop those from our data set for the majority of our analysis
full = full.loc[full['course_type'] != 'M']

full = full.sort_values(['track_id', 'race_date', 'race_number', 'program_number', 'trakus_index']).reset_index(drop=True)

full.head()

Unnamed: 0,track_id,race_date,race_number,program_number,weight_carried,jockey,odds,horse_id,finishing_place,horse_name,distance_id,course_type,track_condition,run_up_distance,race_type,purse,post_time,trakus_index,latitude,longitude
0,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,650,D,MY,36,AOC,80000,1220,1,40.669401,-73.829205
1,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,650,D,MY,36,AOC,80000,1220,2,40.669405,-73.829203
2,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,650,D,MY,36,AOC,80000,1220,3,40.669411,-73.8292
3,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,650,D,MY,36,AOC,80000,1220,4,40.669421,-73.829196
4,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,650,D,MY,36,AOC,80000,1220,5,40.669433,-73.82919


In [8]:
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5162881 entries, 0 to 5162880
Data columns (total 20 columns):
 #   Column           Dtype         
---  ------           -----         
 0   track_id         object        
 1   race_date        datetime64[ns]
 2   race_number      int64         
 3   program_number   object        
 4   weight_carried   float64       
 5   jockey           object        
 6   odds             float64       
 7   horse_id         int64         
 8   finishing_place  int64         
 9   horse_name       object        
 10  distance_id      int64         
 11  course_type      object        
 12  track_condition  object        
 13  run_up_distance  int64         
 14  race_type        object        
 15  purse            int64         
 16  post_time        int64         
 17  trakus_index     int64         
 18  latitude         float64       
 19  longitude        float64       
dtypes: datetime64[ns](1), float64(4), int64(8), object(7)
memory usage: 

In [10]:
full['latitude'].describe()

count    5.162881e+06
mean     4.118758e+01
std      9.656976e-01
min      4.066671e+01
25%      4.067341e+01
50%      4.071387e+01
75%      4.071732e+01
max      4.307399e+01
Name: latitude, dtype: float64

In [None]:
geodesic([full['latitude'][1], full['longitude'][1]], [full['latitude'][0], full['longitude'][0]]).feet

In [11]:
full['p_lat'] = full.groupby(['track_id', 'race_date', 'race_number', 
                              'program_number'])['latitude'].shift(1)
full['p_lon'] = full.groupby(['track_id', 'race_date', 'race_number', 
                              'program_number'])['longitude'].shift(1)
full

Unnamed: 0,track_id,race_date,race_number,program_number,weight_carried,jockey,odds,horse_id,finishing_place,horse_name,...,track_condition,run_up_distance,race_type,purse,post_time,trakus_index,latitude,longitude,p_lat,p_lon
0,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,MY,36,AOC,80000,1220,1,40.669401,-73.829205,,
1,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,MY,36,AOC,80000,1220,2,40.669405,-73.829203,40.669401,-73.829205
2,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,MY,36,AOC,80000,1220,3,40.669411,-73.829200,40.669405,-73.829203
3,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,MY,36,AOC,80000,1220,4,40.669421,-73.829196,40.669411,-73.829200
4,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,MY,36,AOC,80000,1220,5,40.669433,-73.829190,40.669421,-73.829196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162876,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,SY,32,MCL,40000,614,329,43.071944,-73.771727,43.071955,-73.771696
5162877,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,SY,32,MCL,40000,614,330,43.071933,-73.771760,43.071944,-73.771727
5162878,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,SY,32,MCL,40000,614,331,43.071922,-73.771792,43.071933,-73.771760
5162879,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,SY,32,MCL,40000,614,332,43.071911,-73.771825,43.071922,-73.771792


In [69]:
pd.isna(full['p_lat'][1]) == False

True

In [70]:
full['seg_dist_yd'] = [geodesic([full['latitude'][i], full['longitude'][i]], 
                                              [full['p_lat'][i], full['p_lon'][i]]).feet/3 
                       if pd.isna(full['p_lat'][i]) == False else float('nan') for i in range(len(full))] 

In [71]:
full

Unnamed: 0,track_id,race_date,race_number,program_number,weight_carried,jockey,odds,horse_id,finishing_place,horse_name,...,run_up_distance,race_type,purse,post_time,trakus_index,latitude,longitude,p_lat,p_lon,seg_dist_yd
0,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,36,AOC,80000,1220,1,40.669401,-73.829205,,,
1,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,36,AOC,80000,1220,2,40.669405,-73.829203,40.669401,-73.829205,0.578236
2,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,36,AOC,80000,1220,3,40.669411,-73.829200,40.669405,-73.829203,0.802984
3,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,36,AOC,80000,1220,4,40.669421,-73.829196,40.669411,-73.829200,1.213156
4,AQU,2019-01-01,1,1,123.0,Dylan Davis,130.0,1,2,Sounds Delicious,...,36,AOC,80000,1220,5,40.669433,-73.829190,40.669421,-73.829196,1.507487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162876,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,32,MCL,40000,614,329,43.071944,-73.771727,43.071955,-73.771696,3.061578
5162877,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,32,MCL,40000,614,330,43.071933,-73.771760,43.071944,-73.771727,3.167151
5162878,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,32,MCL,40000,614,331,43.071922,-73.771792,43.071933,-73.771760,3.203113
5162879,SAR,2019-09-02,11,8,119.0,Luis Saez,110.0,2295,6,Mine the Coin,...,32,MCL,40000,614,332,43.071911,-73.771825,43.071922,-73.771792,3.247697


In [72]:
full['seg_speed_yd/s'] = full['seg_dist_yd']*4

full['seg_mph'] = 2.0454545454545 * full['seg_speed_yd/s']

full['cum_dist_yd'] = full.groupby(['track_id', 'race_date', 'race_number', 
                              'program_number'])['seg_dist_yd'].cumsum()

full['avg_speed_yd/s'] = full['cum_dist_yd'] / (full['trakus_index']/4)

full['avg_mph'] = 2.0454545454545 * full['avg_speed_yd/s']

full['seg_accel_yd/s2'] = 4 * full['seg_speed_yd/s'].diff()

full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5162881 entries, 0 to 5162880
Data columns (total 29 columns):
 #   Column           Dtype         
---  ------           -----         
 0   track_id         object        
 1   race_date        datetime64[ns]
 2   race_number      int64         
 3   program_number   object        
 4   weight_carried   float64       
 5   jockey           object        
 6   odds             float64       
 7   horse_id         int64         
 8   finishing_place  int64         
 9   horse_name       object        
 10  distance_id      int64         
 11  course_type      object        
 12  track_condition  object        
 13  run_up_distance  int64         
 14  race_type        object        
 15  purse            int64         
 16  post_time        int64         
 17  trakus_index     int64         
 18  latitude         float64       
 19  longitude        float64       
 20  p_lat            float64       
 21  p_lon            float64       

In [153]:
full = full[['track_id', 'race_date', 'race_number', 'program_number', 'horse_id', 'weight_carried', 
                'jockey', 'odds', 'finishing_place', 'horse_name', 'course_type', 'track_condition',
                'race_type', 'purse', 'post_time', 'latitude', 'longitude', 'p_lat', 'p_lon', 'distance_id', 
                'run_up_distance', 'trakus_index', 'seg_dist_yd', 'seg_speed_yd/s', 'seg_mph', 'cum_dist_yd', 
                'avg_speed_yd/s', 'avg_mph', 'seg_accel_yd/s2']]

In [154]:
import pickle

with open('../data/full_race_data.pickle', 'wb') as file:
    pickle.dump(full, file)

In [156]:
full.head(50)

Unnamed: 0,track_id,race_date,race_number,program_number,horse_id,weight_carried,jockey,odds,finishing_place,horse_name,...,distance_id,run_up_distance,trakus_index,seg_dist_yd,seg_speed_yd/s,seg_mph,cum_dist_yd,avg_speed_yd/s,avg_mph,seg_accel_yd/s2
0,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,1,,,,,,,
1,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,2,0.578236321,2.312945286,4.731024448,0.578236321,1.156472643,2.365512224,
2,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,3,0.802984235,3.211936942,6.569871017,1.381220557,1.841627409,3.766965155,3.595966625
3,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,4,1.213156191,4.852624763,9.925823379,2.594376748,2.594376748,5.306679711,6.562751285
4,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,5,1.507487478,6.029949911,12.333988454,4.101864225,3.28149138,6.71214146,4.709300592
5,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,6,1.964884405,7.859537622,16.076326953,6.066748631,4.044499087,8.272839042,7.318350842
6,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,7,2.27343321,9.093732839,18.600817171,8.340181841,4.765818195,9.748264489,4.93678087
7,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,8,2.7678799,11.071519601,22.646290092,11.108061741,5.55403087,11.360517689,7.911147046
8,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,9,3.125547898,12.502191592,25.57266462,14.233609639,6.326048728,12.939645126,5.722687966
9,AQU,2019-01-01,1,1,1,123.0,Dylan Davis,130.0,2,Sounds Delicious,...,650,36,10,3.15316816,12.612672642,25.798648585,17.386777799,6.95471112,14.225545472,0.441924198
