In [4]:
import pandas as pd
import numpy as np

In [10]:
# from utils.load_data import load_data
# Cutoff on `length_behind_at_finish` above which a finisher is flagged as badly beaten.
# Per the original note this is a 99th percentile — presumably of the observed
# `length_behind_at_finish` values; TODO confirm and record how it was derived.
BADLY_BEATEN_THRESHOLD = 2900 # 99th percentile

In [16]:
import pandas as pd
from utils.constants import YEARS_TO_MODEL, DATA_BUCKET


def load_data(local: bool, load_2020: bool = True) -> pd.DataFrame:
    """Load race-level data from a local sample file or from S3.

    Args:
        local: When True, read the first 10,000 rows of the local 2023 CSV;
            ``load_2020`` is ignored in that case. When False, read one
            ``races_<year>.csv`` per entry of ``YEARS_TO_MODEL`` from
            ``DATA_BUCKET``.
        load_2020: When False, the 2020 file is skipped. Defaults to True so
            that callers passing only ``local`` work and no year is dropped
            unless that is requested explicitly.

    Returns:
        A single DataFrame with a fresh integer index. An empty DataFrame is
        returned when no yearly file is selected.
    """
    if local:
        # NOTE(review): machine-specific absolute path; consider moving it to a
        # configurable data directory.
        return pd.read_csv('/users/jameshull/documents/github/hisa-data/races_2023.csv', nrows=10000)

    # Collect the yearly frames and concatenate once; concatenating inside the
    # loop re-copies everything loaded so far on every iteration.
    yearly_frames = []
    for yr in YEARS_TO_MODEL:
        if yr == 2020 and not load_2020:
            continue
        data_location = 's3://{}/{}'.format(DATA_BUCKET, f'races_{yr}.csv')
        yearly_frames.append(pd.read_csv(data_location))

    if not yearly_frames:
        return pd.DataFrame()

    return pd.concat(yearly_frames, ignore_index=True)



In [17]:
# Load the local sample. `load_2020` has no effect when local=True, but the
# notebook-level load_data signature requires it, so pass it explicitly.
df = load_data(local=True, load_2020=False)

In [41]:
def get_dnf(df: pd.DataFrame) -> pd.Series:
    """Flag rows whose ``length_behind_at_finish`` exceeds 9000.

    Values above 9000 are treated as the did-not-finish marker (presumably a
    sentinel such as 9999 — confirm against the data dictionary). Missing
    values compare False and are therefore flagged 0.

    Args:
        df: Frame containing a ``length_behind_at_finish`` column.

    Returns:
        Integer 0/1 Series named ``dnf`` that shares ``df``'s index, so it can
        be assigned back to ``df`` even when the index is not a default range.
    """
    did_not_finish = np.where(df['length_behind_at_finish'] > 9000, 1, 0)

    # Carry the caller's index; a fresh RangeIndex would misalign on assignment.
    return pd.Series(did_not_finish, index=df.index, name='dnf')



def get_scratches(df: pd.DataFrame) -> "tuple[pd.Series, pd.Series]":
    """Build the scratch and vet-scratch indicator columns.

    Args:
        df: Frame containing ``scratch_indicator`` and ``scratch_reason``.

    Returns:
        A ``(scratched, vet_scratched)`` pair of integer 0/1 Series, both
        sharing ``df``'s index. ``scratched`` is 1 where ``scratch_indicator``
        is ``'Y'``; ``vet_scratched`` additionally requires ``scratch_reason``
        to be one of the codes listed below.
    """
    # Reason codes counted as veterinary scratches.
    # NOTE(review): code meanings are not visible here — confirm against the data dictionary.
    vet_reason_codes = ['I', 'J', 'N', 'U', 'V', 'Z']

    is_scratched = df['scratch_indicator'] == 'Y'
    is_vet_scratch = is_scratched & df['scratch_reason'].isin(vet_reason_codes)

    scratched = pd.Series(np.where(is_scratched, 1, 0), index=df.index, name='scratched')
    vet_scratched = pd.Series(np.where(is_vet_scratch, 1, 0), index=df.index, name='vet_scratched')

    return scratched, vet_scratched
    

def get_medication(df: pd.DataFrame) -> "tuple[pd.Series, pd.Series]":
    """Build the lasix and bute indicator columns from ``medication``.

    Args:
        df: Frame containing a string ``medication`` column.

    Returns:
        A ``(lasix, bute)`` pair of integer 0/1 Series sharing ``df``'s index.
        ``lasix`` is 1 where the medication string contains ``'L'`` and
        ``bute`` is 1 where it contains ``'B'``. Rows with missing medication
        are 0 in both.
    """
    medication = df['medication']

    # na=False: without it a missing value yields NaN, which np.where treats as
    # truthy and would flag the horse as medicated. regex=False: literal match.
    has_lasix = medication.str.contains('L', regex=False, na=False)
    has_bute = medication.str.contains('B', regex=False, na=False)

    lasix = pd.Series(np.where(has_lasix, 1, 0), index=df.index, name='lasix')
    bute = pd.Series(np.where(has_bute, 1, 0), index=df.index, name='bute')

    return lasix, bute
    

def get_badly_beaten(df: pd.DataFrame, threshold: "float | None" = None) -> pd.Series:
    """Flag finishers whose ``length_behind_at_finish`` exceeds a threshold.

    Args:
        df: Frame containing ``length_behind_at_finish`` and a 0/1 ``dnf``
            column (``dnf`` must already have been derived).
        threshold: Cutoff on ``length_behind_at_finish``. Defaults to the
            module-level ``BADLY_BEATEN_THRESHOLD`` when omitted.

    Returns:
        Integer 0/1 Series named ``badly_beaten`` sharing ``df``'s index. Rows
        with ``dnf == 1`` are never flagged.
    """
    if threshold is None:
        threshold = BADLY_BEATEN_THRESHOLD

    finished = df['dnf'] == 0
    beaten_far = df['length_behind_at_finish'] > threshold

    # Carry the caller's index; a fresh RangeIndex would misalign on assignment.
    return pd.Series(np.where(beaten_far & finished, 1, 0), index=df.index, name='badly_beaten')


def get_breakdown(df: pd.DataFrame) -> pd.Series:
    """Flag non-finishers whose ``long_comment`` mentions being vanned.

    Args:
        df: Frame containing ``long_comment`` and a 0/1 ``dnf`` column
            (``dnf`` must already have been derived).

    Returns:
        Integer 0/1 Series named ``breakdown`` sharing ``df``'s index. A row is
        1 only when ``dnf == 1`` and ``long_comment`` contains the lowercase
        substring ``'vanned'``; rows with a missing comment are 0.
    """
    # na=False keeps missing comments as an explicit False; regex=False makes
    # this a plain substring match.
    # NOTE(review): the match is case-sensitive — confirm comments are lowercased upstream.
    mentions_vanned = df['long_comment'].str.contains('vanned', regex=False, na=False)
    did_not_finish = df['dnf'] == 1

    return pd.Series(np.where(mentions_vanned & did_not_finish, 1, 0), index=df.index, name='breakdown')



In [42]:
# Derive the indicator columns on the loaded frame.
# Order matters: get_badly_beaten and get_breakdown both read df['dnf'],
# so 'dnf' has to be assigned before they are called.
df['dnf'] = get_dnf(df)
df['scratched'], df['vet_scratched'] = get_scratches(df)
df['lasix'], df['bute'] = get_medication(df)
df['badly_beaten'] = get_badly_beaten(df)
df['breakdown'] = get_breakdown(df)

# Display the rows flagged as vet scratches.
df[df['vet_scratched'] == 1]

Unnamed: 0,race_date,track_id,race_number,race_type,distance_id,distance_unit,surface,course_type,track_condition,weather,...,horse_name,sex,age,dnf,scratched,lasix,bute,badly_beaten,breakdown,vet_scratched
74,2023-05-17 00:00:00,AJX,1,SPI,250,Y,D,D,FT,C,...,Pagranas Boy,G,3.175342,0,1,1,0,0,0,1
112,2023-05-17 00:00:00,AJX,1,SPI,250,Y,D,D,FT,C,...,Ruby Sunday,F,3.002739,0,1,0,0,0,0,1
169,2023-05-31 00:00:00,AJX,5,ALW,300,Y,D,D,FT,C,...,Just a Kvn,F,3.200000,0,1,0,0,0,0,1
175,2023-05-17 00:00:00,AJX,5,ALW,300,Y,D,D,FT,C,...,Silver Lyning,F,3.254794,0,1,1,0,0,0,1
296,2023-06-07 00:00:00,AJX,2,SPI,330,Y,D,D,FT,C,...,Maryland Magic,H,9.150684,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9947,2023-04-20 00:00:00,AQU,8,MCL,600,F,D,D,FT,C,...,Eridromos,G,3.928767,0,1,0,0,0,0,1
9951,2023-09-09 00:00:00,ARP,8,ALW,700,F,D,D,FT,C,...,Sue Ettas Ghost,M,4.367123,0,1,1,0,0,0,1
9968,2023-11-24 00:00:00,AQU,7,MCL,700,F,D,D,FT,C,...,Burton Way,G,4.550684,0,1,1,0,0,0,1
9969,2023-12-03 00:00:00,AQU,1,MCL,650,F,D,D,SY,R,...,Burton Way,G,4.575342,0,1,1,0,0,0,1


In [43]:
# Preview the first two rows of the frame.
df.head(2)

Unnamed: 0,race_date,track_id,race_number,race_type,distance_id,distance_unit,surface,course_type,track_condition,weather,...,horse_name,sex,age,dnf,scratched,lasix,bute,badly_beaten,breakdown,vet_scratched
0,2023-03-25 00:00:00,AIK,3,HCP,1700,F,T,M,GD,L,...,Junonia,G,8.865753,0,0,1,0,1,0,0
1,2023-03-25 00:00:00,AIK,3,HCP,1700,F,T,M,GD,L,...,Anticipating,G,7.928767,0,0,1,0,0,0,0


In [7]:
def get_xDNF(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an ``xDNF`` column: model output for starters, NaN for scratches.

    Args:
        df: Frame containing a ``scratch_indicator`` column.

    Returns:
        A new frame with the non-scratched rows first and the scratched rows
        after them (original index labels kept), plus the ``xDNF`` column.
    """
    is_scratch = df['scratch_indicator'] == 'Y'

    # Copy each subset before adding a column so the assignment never targets
    # a view of the caller's frame (avoids SettingWithCopyWarning).
    scratches = df[is_scratch].copy()
    scratches['xDNF'] = np.nan

    racers = df[~is_scratch].copy()
    # NOTE(review): baseline_inference is given the full frame, scratches
    # included. This only lines up with `racers` if it returns an index-aligned
    # Series; a plain array of len(df) would fail whenever scratches exist.
    # Confirm whether `racers` should be passed instead.
    racers['xDNF'] = baseline_inference(df)

    return pd.concat([racers, scratches])

In [8]:
def extract_features(df: pd.DataFrame, badly_beaten_threshold: "float | None" = None) -> pd.DataFrame:
    """Add the per-entry 0/1 indicator columns used by the trainer aggregations.

    Columns added: ``dnf``, ``scratched``, ``lasix``, ``bute``, ``badly_beaten``.

    Args:
        df: Frame containing ``trouble_indicator``, ``length_behind_at_finish``,
            ``scratch_indicator`` and ``medication``.
        badly_beaten_threshold: Cutoff on ``length_behind_at_finish`` for
            ``badly_beaten``. Defaults to the module-level
            ``BADLY_BEATEN_THRESHOLD`` when omitted.

    Returns:
        A copy of ``df`` with the indicator columns added. The input frame is
        left untouched, so passing a filtered slice is safe.
    """
    if badly_beaten_threshold is None:
        badly_beaten_threshold = BADLY_BEATEN_THRESHOLD

    # Work on a copy: callers pass date-filtered slices, and writing columns
    # onto those in place triggers SettingWithCopyWarning.
    df = df.copy()

    lengths_behind = df['length_behind_at_finish']
    medication = df['medication']

    # NOTE(review): this DNF definition also counts trouble_indicator == 'Y',
    # unlike get_dnf (length_behind_at_finish > 9000 only). Confirm that the
    # trouble flag really means "did not finish" rather than in-race trouble.
    df['dnf'] = np.where(
        (df['trouble_indicator'] == 'Y') | (lengths_behind == 9999),
        1,
        0
    )

    df['scratched'] = np.where(df['scratch_indicator'] == 'Y', 1, 0)

    # na=False: a missing medication string would otherwise yield NaN, which
    # np.where treats as truthy and would flag the horse as medicated.
    df['lasix'] = np.where(medication.str.contains('L', regex=False, na=False), 1, 0)
    df['bute'] = np.where(medication.str.contains('B', regex=False, na=False), 1, 0)

    # Values of 9000 and above are excluded from the badly-beaten flag.
    df['badly_beaten'] = np.where(
        (lengths_behind > badly_beaten_threshold) & (lengths_behind < 9000),
        1,
        0
    )

    return df
    
    

In [None]:
# Fit an exponential distribution to df['length_behind_at_finish'].
from scipy.stats import expon

# expon.fit rejects non-finite input, so drop missing margins first.
# NOTE(review): margins above 9000 are treated elsewhere in this notebook as a
# did-not-finish marker; confirm whether they should be excluded from the fit too.
finish_margins = df['length_behind_at_finish'].dropna()
params = expon.fit(finish_margins)

# 95th percentile of the fitted distribution
expon.ppf(0.95, *params)

In [9]:
def get_prev_race_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-horse "previous race" columns and the flags derived from them.

    Rows are ordered by ``registration_number`` then ``race_date`` and
    ``distance_id`` is renamed to ``race_distance``. Each row is then compared
    with the same horse's preceding row.

    Args:
        df: Frame containing ``registration_number``, ``race_date``,
            ``distance_id``, ``dnf``, ``scratched`` and ``surface``.

    Returns:
        A new, sorted frame (the input is not modified) with the added columns
        ``previous_race_date``, ``previous_race_dnf``, ``previous_race_scratch``,
        ``previous_race_distance``, ``previous_surface``, ``days_since_last_race``,
        ``distance_delta``, ``distance_jump``, ``rest_after_dnf``,
        ``rest_after_scratch``, ``surface_change`` and ``long_layoff``.
    """
    # assign() returns a new frame, so the caller's race_date column is not
    # overwritten as a side effect.
    df = (
        df.assign(race_date=pd.to_datetime(df['race_date']))
        .sort_values(by=['registration_number', 'race_date'])
        .rename(columns={'distance_id': 'race_distance'})
    )

    # One grouping reused for every lagged column.
    by_horse = df.groupby('registration_number')
    df['previous_race_date'] = by_horse['race_date'].shift(1)
    df['previous_race_dnf'] = by_horse['dnf'].shift(1)
    df['previous_race_scratch'] = by_horse['scratched'].shift(1)
    df['previous_race_distance'] = by_horse['race_distance'].shift(1)
    df['previous_surface'] = by_horse['surface'].shift(1)
    df['days_since_last_race'] = (df['race_date'] - df['previous_race_date']).dt.days

    df['distance_delta'] = df['race_distance'] - df['previous_race_distance']
    # A jump is an increase of more than 200 over the previous race_distance.
    df['distance_jump'] = np.where(df['distance_delta'] > 200, 1, 0)

    # Rest length is recorded only when the preceding entry was a DNF / scratch.
    df['rest_after_dnf'] = np.where(
        df['previous_race_dnf'] == 1,
        df['days_since_last_race'],
        np.nan
    )
    df['rest_after_scratch'] = np.where(
        df['previous_race_scratch'] == 1,
        df['days_since_last_race'],
        np.nan
    )

    # Require a known previous surface: a horse's first row has none, and
    # `surface != NaN` is always True, which would count it as a change.
    df['surface_change'] = np.where(
        df['previous_surface'].notna() & (df['surface'] != df['previous_surface']),
        1,
        0
    )

    # maybe should account for covid
    df['long_layoff'] = np.where(df['days_since_last_race'] > 365, 1, 0)

    return df

    

In [10]:
def fit_params(grouped_df: pd.DataFrame) -> pd.Series:
    """Fit a log-normal distribution to one group's ``days_since_last_race``.

    Args:
        grouped_df: Rows of a single group (used via ``groupby(...).apply``),
            containing a ``days_since_last_race`` column.

    Returns:
        Series indexed ``lognorm_p1``, ``lognorm_p2``, ``lognorm_p3`` holding
        the three values returned by ``scipy.stats.lognorm.fit``, in that
        order. All three are NaN when the group has no non-missing values or
        when the fit fails.
    """
    # Imported here because the notebook never imports lognorm at the top.
    # Previously the resulting NameError was swallowed by the blanket except,
    # so every group silently came back as (None, None, None).
    import warnings
    from scipy.stats import lognorm

    param_names = ['lognorm_p1', 'lognorm_p2', 'lognorm_p3']
    observed_gaps = grouped_df['days_since_last_race'].dropna()

    # NaN (not None) keeps the resulting columns numeric for later rounding.
    params = (np.nan, np.nan, np.nan)
    if len(observed_gaps) > 0:
        try:
            params = lognorm.fit(observed_gaps)
        except Exception as exc:
            # Best effort: one failed group must not abort the whole apply,
            # but report the failure instead of hiding it.
            warnings.warn(f'lognorm fit failed ({exc!r}); returning NaN parameters', RuntimeWarning)

    return pd.Series(params, index=param_names)



In [11]:
def get_first_long(df: pd.DataFrame) -> pd.DataFrame:
    """Median age at a horse's earliest long race, per trainer.

    A long race is one with ``race_distance`` above 800. For every
    (``registration_number``, ``trainer_id``) pair the earliest such race by
    ``race_date`` is taken, then ``age`` is reduced to its median per trainer.

    Returns:
        Frame with columns ``trainer_id`` and ``first_long_age``; trainers
        with no long race are absent.
    """
    long_races = df[df['race_distance'] > 800]
    chronological = long_races.sort_values(by=['race_date'])

    # first() yields the first non-null value of each column within the pair.
    earliest_per_pair = chronological.groupby(['registration_number', 'trainer_id']).first()
    earliest_per_pair = earliest_per_pair.reset_index()

    median_age = earliest_per_pair.groupby(['trainer_id']).agg({'age': 'median'})
    trainer_first_long = median_age.reset_index().rename(columns={'age': 'first_long_age'})

    return trainer_first_long


In [12]:
def group_trainer_data(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate entry-level rows into one row of counts and rates per trainer.

    Returns:
        Frame with ``trainer_id``, the per-trainer aggregates named below, and
        the per-entry rates ``scratches_per_entry``, ``dnf_per_entry``,
        ``badly_beaten_pct``, ``lasix_pct`` and ``bute_pct`` (each a sum
        divided by ``n_entries``).
    """
    # Named aggregation gives the flat output column names directly.
    trainers = df.groupby(['trainer_id']).agg(
        n_entries=('race_number', 'count'),
        unique_horses=('registration_number', 'nunique'),
        scratched=('scratched', 'sum'),
        dnf=('dnf', 'sum'),
        min_age=('age', 'min'),
        lasix=('lasix', 'sum'),
        bute=('bute', 'sum'),
        days_since_last_race_min=('days_since_last_race', 'min'),
        days_since_last_race_median=('days_since_last_race', 'median'),
        rest_after_dnf_median=('rest_after_dnf', 'median'),
        rest_after_scratch_median=('rest_after_scratch', 'median'),
        distance_jump=('distance_jump', 'sum'),
        surface_changes=('surface_change', 'sum'),
        long_layoffs=('long_layoff', 'sum'),
        badly_beaten=('badly_beaten', 'sum'),
    ).reset_index()

    # Per-entry rates: (output column, summed count column).
    rate_sources = [
        ('scratches_per_entry', 'scratched'),
        ('dnf_per_entry', 'dnf'),
        ('badly_beaten_pct', 'badly_beaten'),
        ('lasix_pct', 'lasix'),
        ('bute_pct', 'bute'),
    ]
    for rate_col, count_col in rate_sources:
        trainers[rate_col] = trainers[count_col] / trainers['n_entries']

    # TODO
    # trainers['dnf_per_entry_smooth'] = None
    # trainers['scratches_per_entry'] = None

    return trainers


In [13]:
def create_features(df: pd.DataFrame, suffix: "str | None" = None) -> pd.DataFrame:
    """Build the per-trainer feature table from entry-level rows.

    Runs ``extract_features`` and ``get_prev_race_features`` on the entries,
    then joins three per-trainer tables on ``trainer_id``: the aggregates from
    ``group_trainer_data``, the ``first_long_age`` from ``get_first_long`` and
    the log-normal parameters from ``fit_params``.

    Args:
        df: Entry-level rows for the feature window.
        suffix: When given (non-empty), ``_<suffix>`` is appended to every
            column name except ``trainer_id``.

    Returns:
        One row per trainer, with lognorm columns rounded to 5 decimals.
    """
    df = extract_features(df)
    df = get_prev_race_features(df)

    trainer_params = df.groupby('trainer_id').apply(fit_params)
    first_long = get_first_long(df)

    trainers = group_trainer_data(df)
    trainers = trainers.merge(first_long, on=['trainer_id'], how='left')
    trainers = trainers.merge(trainer_params, on=['trainer_id'], how='left')

    # Coerce before rounding: a failed fit can leave None in these columns,
    # making them object dtype, which cannot be rounded reliably.
    lognorm_cols = [c for c in trainers.columns if 'lognorm' in c]
    for c in lognorm_cols:
        trainers[c] = pd.to_numeric(trainers[c], errors='coerce').round(5)

    if suffix:
        # The join key keeps its plain name so the result can still be merged.
        trainers.columns = [c if c == 'trainer_id' else f'{c}_{suffix}' for c in trainers.columns]

    return trainers


In [14]:
def create_targets(df: pd.DataFrame) -> pd.DataFrame:
    """Build the per-trainer modelling target: DNFs per entry.

    Args:
        df: Entry-level rows for the target window; ``extract_features`` is
            applied to derive the ``dnf`` indicator.

    Returns:
        Frame with columns ``trainer_id``, ``target_n_entries`` (count of
        non-null ``race_number``) and ``target`` (sum of ``dnf`` divided by
        ``target_n_entries``).
    """
    df = extract_features(df)

    # Only the entry count and DNF total feed the returned columns. The scratch
    # and badly-beaten aggregates computed here previously were discarded.
    per_trainer = (
        df.groupby('trainer_id')
        .agg(
            target_n_entries=('race_number', 'count'),
            dnf_total=('dnf', 'sum'),
        )
        .reset_index()
    )

    # TODO
    # trainers['dnf_per_entry_smooth'] = None
    # trainers['scratches_per_entry'] = None

    per_trainer['target'] = per_trainer['dnf_total'] / per_trainer['target_n_entries']

    return per_trainer[['trainer_id', 'target_n_entries', 'target']]

In [15]:

# Dates at which a prediction is made: features look back from each date,
# targets look forward from the following day.
prediction_dates = [
    '2022-07-01',
    '2022-12-01'
]
FEATURE_DAY_DELTA = 365  # length of the feature look-back window, in days
TARGET_DAY_DELTA = 180   # length of the target window, in days

# load_data requires `load_2020`. The windows below span mid-2021 to mid-2023,
# so the 2020 file is not needed.
df = load_data(local=False, load_2020=False)
df = get_xDNF(df)
df['race_date'] = pd.to_datetime(df['race_date'])

# Collect one frame per prediction date and concatenate once at the end.
model_frames = []
for prediction_date in prediction_dates:
    # pandas timestamps avoid depending on `datetime`, which this notebook
    # never imports.
    feature_end_date = pd.Timestamp(prediction_date)
    feature_start_date = feature_end_date - pd.Timedelta(days=FEATURE_DAY_DELTA)

    target_start_date = feature_end_date + pd.Timedelta(days=1)
    target_end_date = target_start_date + pd.Timedelta(days=TARGET_DAY_DELTA)

    # Both windows are inclusive at each end.
    df_features = df[(df['race_date'] >= feature_start_date) & (df['race_date'] <= feature_end_date)]
    df_target = df[(df['race_date'] >= target_start_date) & (df['race_date'] <= target_end_date)]

    features = create_features(df_features, str(FEATURE_DAY_DELTA))
    targets = create_targets(df_target)

    # Inner join: keep only trainers present in both windows.
    model_frames.append(features.merge(targets, how='inner', on='trainer_id'))

full_model_df = pd.concat(model_frames, ignore_index=True) if model_frames else pd.DataFrame()


# TODO (work in progress): for each n_entries, keep each trainer's most recent
# n_entries rows, then create features and targets from them.
# Sorting by race_date makes "most recent" hold. groupby().tail() also avoids
# the `trainer_id` column collision that groupby.apply(...).reset_index() hits.
df_by_date = df.sort_values('race_date')
for n_entries in [50, 100, 250]:
    df_features = df_by_date.groupby('trainer_id').tail(n_entries).reset_index(drop=True)


NameError: name 'feature_start_dates' is not defined

In [None]:
from models.model_builds import build_xgb_regressor, build_linear_regressor
from baseline_model.preprocessing import create_train_test_split

In [None]:
# Split the modelling frame with split_column='trainer_id' — presumably so that a
# trainer's rows all land in the same split; verify against create_train_test_split.
data = create_train_test_split(full_model_df, test_size=0.2, valid_size=0.1, split_column='trainer_id')

# Build the linear-regression model from the split data.
lin_reg_model = build_linear_regressor(data)
# NOTE(review): the disabled line below calls build_xgb_classifier, but the import
# cell only brings in build_xgb_regressor — confirm which builder is intended
# before re-enabling it.
#xgb_model = build_xgb_classifier(data)


# TODOs


## Baseline Model
- lambda


## Risk Model
- preprocess
    - distance
    - beyer? 
    - smoothed values

- workouts
    




In [None]:

# Clean performance_figure once, then lag it, so the current and previous
# values go through exactly the same conversion.
# '-' is replaced by '0' as before. pd.to_numeric(errors='coerce') turns anything
# still unparseable into NaN, where astype(errors='ignore') silently left strings.
df['performance_figure'] = pd.to_numeric(
    df['performance_figure'].astype(str).str.replace('-', '0', regex=False),
    errors='coerce',
)
# NOTE(review): shift(1) takes the preceding row per horse in the frame's current
# order; confirm df is sorted by race_date within registration_number here.
df['previous_performance_figure'] = df.groupby('registration_number')['performance_figure'].shift(1)

prev_perf = df.dropna(subset=['previous_performance_figure', 'performance_figure'])
# Each comparison is parenthesised: `a > 0 & (b > 0)` binds as `a > (0 & (b > 0))`,
# which dropped the second condition entirely.
both_positive = (prev_perf['previous_performance_figure'] > 0) & (prev_perf['performance_figure'] > 0)
# .copy() so the new column is not written onto a slice of df.
prev_perf = prev_perf[both_positive].copy()
prev_perf['performance_figure_ratio'] = prev_perf['performance_figure'] / prev_perf['previous_performance_figure']


In [None]:
# baseline testing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
from baseline_model.load_data import load_data
from baseline_model.preprocessing import preprocess_data

In [None]:
# NOTE(review): this `load_data` is the one imported from baseline_model.load_data,
# which shadows the notebook-level function of the same name. Its signature is not
# visible here — confirm that a single positional argument is valid.
df = load_data(True)
df = preprocess_data(df)
# Drop the 'dnf' and 'registration_number' columns from the preprocessed frame.
df = df.drop(columns=['dnf', 'registration_number'])

In [None]:
# Load the two saved baseline models from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load model files produced by a trusted run of this project.
with open('output/models/baseline_log_reg_model.pkl', 'rb') as f:
    log_reg_model = pickle.load(f)

with open('output/models/baseline_xgb_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

In [None]:
# Logistic-regression view: pair each input feature name with the matching entry
# in the first row of coef_, sort ascending, and draw a horizontal bar chart.
ft_imp = pd.DataFrame({
    'feature': log_reg_model.feature_names_in_,
    'importance': log_reg_model.coef_[0],
}).sort_values(by='importance')

plt.barh(ft_imp['feature'], ft_imp['importance'])
plt.show()


In [None]:
# XGBoost view: pair each input feature name with its feature_importances_ entry,
# sort ascending, and draw a horizontal bar chart.
ft_imp = pd.DataFrame({
    'feature': xgb_model.feature_names_in_,
    'importance': xgb_model.feature_importances_,
}).sort_values(by='importance')

plt.barh(ft_imp['feature'], ft_imp['importance'])
plt.show()

In [None]:
# Use SHAP to get feature importance for the XGBoost model.
import shap

# initjs() is called before the plots; presumably it loads the JavaScript the
# interactive SHAP plots need in the notebook — confirm against the shap docs.
shap.initjs()
explainer = shap.TreeExplainer(xgb_model)
# SHAP values are computed on exactly the columns the model was fitted with.
shap_values = explainer.shap_values(df[xgb_model.feature_names_in_])



In [None]:
# Bar-style summary plot of the SHAP values over the model's input features.
shap.summary_plot(shap_values, df[xgb_model.feature_names_in_], plot_type='bar')

In [None]:
# Force plot of the SHAP values for a single prediction: the first row of the
# feature frame (shap_values[0, :] paired with .iloc[0, :]).
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], df[xgb_model.feature_names_in_].iloc[0,:])


In [None]:
# SHAP dependence plot for the 'surface_D' feature.
# NOTE(review): the original note said "two features", but no second (interaction)
# feature is passed here; presumably shap selects one automatically — confirm.
shap.dependence_plot('surface_D', shap_values, df[xgb_model.feature_names_in_])

In [None]:
# Predicted vs. actual target, with points coloured by df['target_n_entries'].
X_eval = df[xgb_model.feature_names_in_]
y_actual = df['target']
r2 = xgb_model.score(X_eval, y_actual)

plt.figure()
plt.scatter(xgb_model.predict(X_eval), y_actual, c=df['target_n_entries'], cmap='viridis')
plt.xlabel('Predictions')
plt.ylabel('Actual')
# Dashed y = x reference line.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
# Annotate the plot with the model's score on this data.
plt.text(0.1, 0.9, f'R2: {r2}', fontsize=12)
plt.colorbar()





In [None]:
# NOTE(review): `preds` was never assigned anywhere in this notebook, so this cell
# failed on a fresh kernel. It is taken to be the XGBoost predictions used in the
# predicted-vs-actual plot — confirm.
preds = xgb_model.predict(df[xgb_model.feature_names_in_])
pred_mean = np.mean(preds)
pred_std = np.std(preds)

plt.figure()
plt.hist(preds, bins=30)
# vertical black dotted line for mean
plt.axvline(pred_mean, color='black', linestyle='--')
# red dotted lines at +1 and -1 stdev
plt.axvline(pred_mean + pred_std, color='red', linestyle='--')
plt.axvline(pred_mean - pred_std, color='red', linestyle='--')
plt.xlabel('Predictions')
plt.ylabel('Count')
plt.title('Predictions Histogram')

# Mean and stdev in the upper right, in figure coordinates, to 3 decimal places.
plt.text(0.6, 0.9, f'Mean: {pred_mean:.3f}', fontsize=12, transform=plt.gcf().transFigure)
plt.text(0.6, 0.85, f'Stdev: {pred_std:.3f}', fontsize=12, transform=plt.gcf().transFigure)

In [1]:
import pandas as pd

In [5]:
# Load the 2024 test file.
# NOTE(review): machine-specific absolute path; consider a configurable data
# directory so the notebook runs on other machines.
df = pd.read_csv('/users/jameshull/documents/github/hisa-data/test_2024.csv')

In [6]:
# Preview the first two rows of the loaded file.
df.head(2)

Unnamed: 0,race_date,track_id,race_number,race_type,distance_id,distance_unit,surface,course_type,track_condition,weather,...,trainer_id,owner_id,trouble_indicator,scratch_indicator,scratch_reason,short_comment,long_comment,horse_name,sex,age
0,2024-01-01 00:00:00,AQU,2,ALW,600,F,D,D,FT,L,...,248028,2303261,N,N,,"prompted 3w, weakened","prompted 3w, coaxed 5/16, 4w upper, weakened",Bezos,H,5.876712
1,2024-01-01 00:00:00,AQU,2,ALW,600,F,D,D,FT,L,...,957331,1586302,N,N,,"7w upper, improved","brk in st, chased 2p, coaxed 3/8, angled 7w up...",Who Hoo Thats Me,H,4.802739


In [7]:
# Share of rows per trouble_indicator value (normalize=True gives proportions, not counts).
df['trouble_indicator'].value_counts(normalize=True)

N    0.920389
Y    0.079611
Name: trouble_indicator, dtype: float64

In [8]:
# Long comments of the rows where trouble_indicator is 'Y'.
df.loc[df['trouble_indicator'] == 'Y', 'long_comment']

26       bumped start, set pressured pace inside, kicke...
32       bumped start, pressed pace between horses, won...
35       set pressured pace 3-2 wide, headed turn, bump...
39       tracked between, in tight 7/16, bumped rival 1...
48       bumped start, chased 2 wide, came 5 wide, need...
                               ...                        
26016             chased,4wd turn,forced out 1/8,flattened
26017    bobbled break,stalked 2&3p,challenged 3/8,up 1...
26039      very fractious gate, 2nd flight 2-3w btw, tired
26047    jostled, closed quarters, bumped, swung 6w, gr...
26096    stalked pace, rally 3w, angled in, off heels, ...
Name: long_comment, Length: 2078, dtype: object