In [1]:
# need to get RaceResult 15-16-17
# need to get Workout 0-1

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
injury_raw = pd.read_csv('../data/injury.csv')
vets_list_raw = pd.read_csv('../data/vets_list.csv')
people_raw = pd.read_csv('../data/people.csv')
race_result0_raw = pd.read_parquet('../data/race_result0.parquet')
race_result0_raw = race_result0_raw.iloc[0:10000]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
def clean_injury_data(df: pd.DataFrame) -> pd.DataFrame:
    injury = []
    for _, row in df.iterrows():
        try:
            horse_id = row['Horses'].split('HisaHorseId":"')[1].split('"')[0]
        except AttributeError:
            # NULL value
            horse_id = None
        except IndexError:
            # string split error
            horse_id = None

        try:
            responsible_person_id = row['Horses'].split('ResponsiblePersonHisaId":"')[1].split('"')[0]
        except AttributeError:
            # NULL value
            responsible_person_id = None
        except IndexError:
            # string split error
            responsible_person_id = None

        injury.append({
            'date': row['DateOfInjury'],
            'injury_id': row['HisaInjuryId'],
            'horse_id': horse_id,
            'responsible_person_id': responsible_person_id,
            'vets_list_id': row['RelatedVetsListId'],
            'location_id': row['LocationId'],
            'injury_type': row['Type'],
            'circumstances': row['Circumstance'],
            'race_number': row['RaceNumber'],
        })

    return pd.DataFrame(injury)


In [5]:
def clean_vets_list_data(df: pd.DataFrame) -> pd.DataFrame:
    
    vets_list = df[['VetsListId', 'HisaHorseId', 'CurrentResponsiblePersonId', 'LocationId', 'DatePlacedOnList', 'DateToComeOffList', 'DaysOnList', 'Reason', 'OtherStateReason']]
    vets_list.columns = ['vets_list_id', 'horse_id', 'responsible_person_id', 'location_id', 'date_on_list', 'date_off_list', 'days_on_list', 'reason', 'other_reason']

    return vets_list


In [6]:
def clean_people_data(df: pd.DataFrame) -> pd.DataFrame:
    
    df['trainer'] = np.where(
        df['Roles'].str.contains('Trainer'),
        1,
        0
    )

    df = df[['HisaPersonId', 'DisplayName', 'trainer']]
    df = df.rename(columns={
        'HisaPersonId': 'person_id',
        'DisplayName': 'name',
        'trainer': 'is_trainer'
    })

    return df

In [60]:
def clean_race_results_data(df: pd.DataFrame) -> pd.DataFrame:
    columns = {
        'PostPosition': 'post_position',
        'Race_RaceNumber': 'race_number',
        'Horse_Foaled': 'horse_foaled_date',
        'OfficialPosition': 'finish',
        'Race_Date': 'race_date',
        'Horse_HisaId': 'horse_id',
        'Horse_ReferenceNumber': 'horse_reference_number',
        'Horse_Name': 'horse_name', 
        'Race_Condition': 'race_condition',
        'Race_Purse' : 'purse',
        'Race_LocationId' : 'location_id',
        'Race_LocationCode': 'location_code',
        'Race_ClaimingPrice': 'claiming_price',
        'Race_LocationName': 'location_name',
        'Race_Type': 'race_type',
        'Race_DistanceFurlong': 'race_distance',
        'Race_Course': 'surface',
        'Trainer_HisaId': 'trainer_id',
        'Trainer_LastName' : 'trainer_last_name',
        'Trainer_FirstName': 'trainer_first_name',
        'Trainer_ReferenceNumber': 'trainer_reference_number',
        'Scratched': 'scratched_bool',
        'Earnings': 'earnings',
        'Odds': 'odds',
        'Dnf': 'dnf_bool',
        'ProgramNumber': 'program_number'
    }

    df = df[[c for c in columns.keys()]].rename(columns=columns)
    df['scratched'] = np.where(
        df['scratched_bool'],
        1,
        0
    )

    df['dnf'] = np.where(
        df['dnf_bool'],
        1,
        0
    )

    df = df.drop(['scratched_bool', 'dnf_bool'], axis=1)
    df = df.sort_values(by=['horse_reference_number', 'race_date'])

    df = df[df['horse_reference_number'] != 0]

    return df


In [7]:
injury = clean_injury_data(injury_raw)
injury.head(2)

Unnamed: 0,date,injury_id,horse_id,responsible_person_id,vets_list_id,location_id,injury_type,circumstances,race_number
0,2024-04-18,I000005415,H000047885,P000001110,,L000000054,Horse,Racing,6.0
1,2023-09-16,I000003504,H000016895,P000023932,V000069978,L000000002,Horse,Other,


In [8]:
vets_list = clean_vets_list_data(vets_list_raw)
vets_list.head(2)

Unnamed: 0,vets_list_id,horse_id,responsible_person_id,location_id,date_on_list,date_off_list,days_on_list,reason,other_reason
0,V000040228,H000035486,P000002232,L000000047,2023-03-09,2023-03-09,0.0,Other,Recency
1,V000058307,H000007442,P000016423,L000000081,2023-07-15,2023-07-29,14.0,IntraArticularInjection,


In [9]:
people = clean_people_data(people_raw)
people.head(2)

Unnamed: 0,person_id,name,is_trainer
0,P000016780,Brenda Donaire,0
1,P000013204,James Brigmon,0


In [61]:
race_results = clean_race_results_data(race_result0_raw)
race_results.head(2)

Unnamed: 0,post_position,race_number,horse_foaled_date,finish,race_date,horse_id,horse_reference_number,horse_name,race_condition,purse,...,surface,trainer_id,trainer_last_name,trainer_first_name,trainer_reference_number,earnings,odds,program_number,scratched,dnf
184378,4,8,2018-03-29,5,2020-03-06,,19446,Tellers Cartel Dash,Fast,6000,...,Dirt,,Enlow,Ray,44221,169.0,20.8,4,0,0
156160,2,1,2002-04-19,2,2016-04-23,,6555653,And the Eagle Flys,Firm,30000,...,Timber,P000015573,Meister,William,4066,5400.0,0.0,2,0,0


In [73]:
race_results['age'] = (race_results['race_date'] - race_results['horse_foaled_date']).dt.days
df = race_results.copy()


df['previous_race_date'] = df.groupby('horse_reference_number')['race_date'].shift(1)
df['previous_race_dnf'] = df.groupby('horse_reference_number')['dnf'].shift(1)
df['previous_race_scratch'] = df.groupby('horse_reference_number')['scratched'].shift(1)
df['previous_race_distance'] = df.groupby('horse_reference_number')['scratched'].shift(1)
df['days_since_last_race'] = (df['race_date'] - df['previous_race_date']).dt.days

df['rest_after_dnf'] = np.where(
    df['previous_race_dnf'] == 1,
    df['days_since_last_race'],
    None
)

df['rest_after_scratch'] = np.where(
    df['previous_race_scratch'] == 1,
    df['days_since_last_race'],
    None
)




In [76]:
df.columns

Index(['post_position', 'race_number', 'horse_foaled_date', 'finish',
       'race_date', 'horse_id', 'horse_reference_number', 'horse_name',
       'race_condition', 'purse', 'location_id', 'location_code',
       'claiming_price', 'location_name', 'race_type', 'race_distance',
       'surface', 'trainer_id', 'trainer_last_name', 'trainer_first_name',
       'trainer_reference_number', 'earnings', 'odds', 'program_number',
       'scratched', 'dnf', 'age', 'previous_race_date', 'previous_race_dnf',
       'previous_race_scratch', 'days_since_last_race', 'rest_after_dnf',
       'rest_after_scratch'],
      dtype='object')

In [74]:
df[df['horse_reference_number'] == 10072913]

Unnamed: 0,post_position,race_number,horse_foaled_date,finish,race_date,horse_id,horse_reference_number,horse_name,race_condition,purse,...,program_number,scratched,dnf,age,previous_race_date,previous_race_dnf,previous_race_scratch,days_since_last_race,rest_after_dnf,rest_after_scratch
87852,10,8,2016-03-23,7,2018-07-06,H000018614,10072913,El Ahijado,Firm,42000,...,9,0,0,835,,,,,,
136746,4,8,2016-03-23,9,2018-08-24,H000018614,10072913,El Ahijado,Firm,42000,...,4,0,0,884,2018-07-06,0.0,0.0,49.0,,
118957,7,7,2016-03-23,11,2019-10-11,H000018614,10072913,El Ahijado,Firm,73000,...,7,0,0,1297,2018-08-24,0.0,0.0,413.0,,
55056,7,6,2016-03-23,10,2020-07-08,H000018614,10072913,El Ahijado,Good,77000,...,8,0,0,1568,2019-10-11,0.0,0.0,271.0,,
187207,2,7,2016-03-23,11,2020-12-11,H000018614,10072913,El Ahijado,Fast,35000,...,2,0,0,1724,2020-07-08,0.0,0.0,156.0,,
94117,3,7,2016-03-23,5,2021-04-11,H000018614,10072913,El Ahijado,Good,81000,...,3,0,0,1845,2020-12-11,0.0,0.0,121.0,,
99972,4,7,2016-03-23,7,2021-07-31,H000018614,10072913,El Ahijado,Firm,52000,...,4,0,0,1956,2021-04-11,0.0,0.0,111.0,,
130571,3,5,2016-03-23,7,2021-12-26,H000018614,10072913,El Ahijado,Fast,63000,...,4,0,0,2104,2021-07-31,0.0,0.0,148.0,,
188566,2,8,2016-03-23,5,2022-05-22,H000018614,10072913,El Ahijado,Fast,127000,...,2,0,0,2251,2021-12-26,0.0,0.0,147.0,,
90586,99,7,2016-03-23,0,2023-05-13,H000018614,10072913,El Ahijado,Firm,127000,...,SCR,1,0,2607,2022-05-22,0.0,0.0,356.0,,


In [75]:
trainers = df.groupby(['trainer_reference_number', 'trainer_last_name', 'trainer_first_name']).agg({
    'race_number': 'count',
    'horse_reference_number': 'nunique',
    'scratched': 'sum',
    'dnf': 'sum',
    'age': 'min',
    'days_since_last_race': ['min', 'median'],
    'rest_after_dnf': 'median',
    'rest_after_scratch': 'median'
}).reset_index()

trainers.columns = ['trainer_reference_number', 'trainer_last_name', 'trainer_first_name', 
'n_entries', 'unique_horses', 'scratched', 'dnf', 'min_age', 'days_since_last_race', 'days_since_last_race', 
'rest_after_dnf_median', 'rest_after_scratch_median']

trainers['scratches_per_entry'] = trainers['scratched'] / trainers['n_entries']
trainers['dnf_per_entry'] = trainers['dnf'] / trainers['n_entries']
trainers.head(2)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Unnamed: 0,trainer_reference_number,trainer_last_name,trainer_first_name,n_entries,unique_horses,scratched,dnf,min_age,days_since_last_race,days_since_last_race.1,rest_after_dnf_median,rest_after_scratch_median,scratches_per_entry,dnf_per_entry
0,5,Abrams,Ronald,53,33,11,1,1465,15.0,258.0,,100.0,0.207547,0.018868
1,7,Ackerman,D.,35,27,8,0,871,48.0,215.0,,642.5,0.228571,0.0


In race results need to do some processing then can aggregate on trainer
- median min age of all trainer's horses
- races last N days
- age at N distance
- distance jump 
- switching surfaces
- longer layoffs for distance / surface / condition?
- abbornally long layoff
- lost by N+ lengths 
- medications
- travel ?


for an expected model,
- tenperature
- sex
- class
- surface
- condtions
- age
- surface fast/slow




