In [3]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
import datetime
import warnings
warnings.simplefilter("ignore")

In [4]:
# Cut-off compared against `length_behind_at_finish` in extract_features: a
# runner whose value is above this threshold (and below 9000) is flagged as
# `badly_beaten`.
# NOTE(review): the unit of `length_behind_at_finish` is not shown in this
# notebook — confirm what 2490 represents.
BADLY_BEATEN_THRESHOLD = 2490

In [5]:
!pip install xgboost



In [6]:
from baseline_model.load_data import load_data
from baseline_model.inference import baseline_inference

In [7]:
def get_xDNF(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an ``xDNF`` column produced by the baseline model.

    Rows whose ``scratch_indicator`` is ``'Y'`` receive ``xDNF = NaN``; every
    other row receives the output of ``baseline_inference``.

    Args:
        df: Race entries; must contain a ``scratch_indicator`` column.

    Returns:
        A new frame holding the non-scratched rows first, followed by the
        scratched rows, each with the additional ``xDNF`` column. The input
        frame itself is left untouched.
    """
    # Explicit copies: adding a column to a boolean-mask slice of ``df`` is
    # chained assignment, which pandas only tolerates with a warning.
    scratches = df[df['scratch_indicator'] == 'Y'].copy()
    scratches['xDNF'] = np.nan

    racers = df[df['scratch_indicator'] != 'Y'].copy()
    # NOTE(review): inference is run on the full frame (scratches included)
    # while the result is stored on the non-scratched rows only. This relies on
    # baseline_inference returning something that lines up with `racers`
    # (e.g. an index-aligned Series) — confirm against its implementation.
    racers['xDNF'] = baseline_inference(df)

    return pd.concat([racers, scratches])

In [8]:
def extract_features(df: pd.DataFrame, badly_beaten_threshold: float = None) -> pd.DataFrame:
    """Derive per-entry 0/1 indicator columns from the raw race columns.

    Added columns:
        dnf: ``trouble_indicator == 'Y'`` or ``length_behind_at_finish == 9999``.
        scratched: ``scratch_indicator == 'Y'``.
        lasix: ``medication`` contains ``'L'``.
        bute: ``medication`` contains ``'B'``.
        badly_beaten: ``length_behind_at_finish`` is above the threshold and
            below 9000.

    Args:
        df: Race entries with ``trouble_indicator``, ``length_behind_at_finish``,
            ``scratch_indicator`` and ``medication`` columns.
        badly_beaten_threshold: Cut-off for ``badly_beaten``. Defaults to the
            module-level ``BADLY_BEATEN_THRESHOLD`` when omitted.

    Returns:
        A new frame with the indicator columns added; ``df`` is not modified.
    """
    if badly_beaten_threshold is None:
        badly_beaten_threshold = BADLY_BEATEN_THRESHOLD

    lengths_behind = df['length_behind_at_finish']
    medication = df['medication']

    # na=False: for a missing medication value str.contains yields NaN, and
    # np.where treats NaN as truthy — which would flag the entry as medicated.
    on_lasix = medication.str.contains('L', na=False)
    on_bute = medication.str.contains('B', na=False)

    # assign() builds a new frame, so the caller's frame (often a slice of a
    # larger one) is never written to.
    return df.assign(
        dnf=np.where((df['trouble_indicator'] == 'Y') | (lengths_behind == 9999), 1, 0),
        scratched=np.where(df['scratch_indicator'] == 'Y', 1, 0),
        lasix=np.where(on_lasix, 1, 0),
        bute=np.where(on_bute, 1, 0),
        badly_beaten=np.where(
            (lengths_behind > badly_beaten_threshold) & (lengths_behind < 9000),
            1,
            0,
        ),
    )
    
    

In [9]:
def get_prev_race_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add features describing each entry relative to the horse's previous entry.

    Entries are ordered by ``registration_number`` and ``race_date``; the
    "previous" entry is the preceding row of the same ``registration_number``.

    Args:
        df: Entries with ``registration_number``, ``race_date``, ``distance_id``,
            ``dnf``, ``scratched`` and ``surface`` columns.

    Returns:
        A new, sorted frame in which ``distance_id`` is renamed to
        ``race_distance`` and these columns are added: ``previous_race_date``,
        ``previous_race_dnf``, ``previous_race_scratch``,
        ``previous_race_distance``, ``previous_surface``,
        ``days_since_last_race``, ``distance_delta``, ``distance_jump``,
        ``rest_after_dnf``, ``rest_after_scratch``, ``surface_change`` and
        ``long_layoff``. The input frame is not modified.
    """
    # assign() instead of in-place assignment so the caller's race_date column
    # is not overwritten.
    df = (
        df.assign(race_date=pd.to_datetime(df['race_date']))
        .sort_values(by=['registration_number', 'race_date'])
        .rename(columns={'distance_id': 'race_distance'})
    )

    # One groupby, reused for every lagged column.
    by_horse = df.groupby('registration_number')
    df = df.assign(
        previous_race_date=by_horse['race_date'].shift(1),
        previous_race_dnf=by_horse['dnf'].shift(1),
        previous_race_scratch=by_horse['scratched'].shift(1),
        previous_race_distance=by_horse['race_distance'].shift(1),
        previous_surface=by_horse['surface'].shift(1),
    )
    df['days_since_last_race'] = (df['race_date'] - df['previous_race_date']).dt.days

    df['distance_delta'] = df['race_distance'] - df['previous_race_distance']
    df['distance_jump'] = np.where(df['distance_delta'] > 200, 1, 0)

    # Rest length is only recorded when the previous entry was a DNF / scratch.
    df['rest_after_dnf'] = np.where(
        df['previous_race_dnf'] == 1,
        df['days_since_last_race'],
        np.nan
    )
    df['rest_after_scratch'] = np.where(
        df['previous_race_scratch'] == 1,
        df['days_since_last_race'],
        np.nan
    )

    # `surface != previous_surface` alone is True whenever there is no previous
    # entry (NaN never equals anything), which would count every horse's first
    # entry as a surface change. Require both surfaces to be known, matching
    # how distance_jump and long_layoff stay 0 without history.
    surface_changed = (
        df['surface'].notna()
        & df['previous_surface'].notna()
        & (df['surface'] != df['previous_surface'])
    )
    df['surface_change'] = np.where(surface_changed, 1, 0)

    # maybe should account for covid
    df['long_layoff'] = np.where(df['days_since_last_race'] > 365, 1, 0)

    return df

    

In [10]:
def fit_params(grouped_df: pd.DataFrame) -> pd.Series:
    """Fit a log-normal distribution to a group's ``days_since_last_race``.

    Args:
        grouped_df: Rows of one group (used with ``groupby(...).apply``); must
            contain a ``days_since_last_race`` column. Missing values are
            dropped before fitting.

    Returns:
        A float Series indexed ``lognorm_p1``, ``lognorm_p2``, ``lognorm_p3``
        holding the three values returned by ``scipy.stats.lognorm.fit``, or
        three NaNs when there is nothing to fit or the fit fails.
    """
    param_names = ['lognorm_p1', 'lognorm_p2', 'lognorm_p3']
    rest_days = grouped_df['days_since_last_race'].dropna()

    # NaN rather than None as the "no fit" marker: None would turn the
    # parameter columns into object dtype once groups are combined, which
    # breaks numeric post-processing such as rounding.
    params = (np.nan, np.nan, np.nan)
    if len(rest_days) > 0:
        try:
            params = lognorm.fit(rest_days)
        except Exception:
            # Deliberately best-effort: a group that cannot be fitted keeps
            # the NaN placeholders instead of aborting the whole apply().
            params = (np.nan, np.nan, np.nan)

    return pd.Series(params, index=param_names, dtype=float)



In [11]:
def get_first_long(df: pd.DataFrame) -> pd.DataFrame:
    """Median age at which each trainer's horses first appear in a long race.

    Only rows with ``race_distance > 800`` are considered. They are ordered by
    ``race_date`` and reduced with ``groupby(...).first()`` to one row per
    (``registration_number``, ``trainer_id``) pair; the ``age`` values of those
    rows are then summarised by their median per trainer.

    Returns:
        A frame with the columns ``trainer_id`` and ``first_long_age``.
    """
    long_races = df[df['race_distance'] > 800].sort_values(by=['race_date'])

    debut_per_horse = (
        long_races
        .groupby(['registration_number', 'trainer_id'])
        .first()
        .reset_index()
    )

    median_age = debut_per_horse.groupby(['trainer_id']).agg({'age': 'median'})

    return median_age.reset_index().rename(columns={'age': 'first_long_age'})


In [12]:
def group_trainer_data(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate entry-level features to one row per ``trainer_id``.

    Counts, sums, minima and medians are taken per trainer, after which the
    scratch, DNF, badly-beaten, lasix and bute totals are divided by the
    trainer's number of entries.

    Returns:
        One row per trainer with the aggregated columns followed by
        ``scratches_per_entry``, ``dnf_per_entry``, ``badly_beaten_pct``,
        ``lasix_pct`` and ``bute_pct``.
    """
    per_trainer = df.groupby(['trainer_id']).agg(
        n_entries=('race_number', 'count'),
        unique_horses=('registration_number', 'nunique'),
        scratched=('scratched', 'sum'),
        dnf=('dnf', 'sum'),
        min_age=('age', 'min'),
        lasix=('lasix', 'sum'),
        bute=('bute', 'sum'),
        days_since_last_race_min=('days_since_last_race', 'min'),
        days_since_last_race_median=('days_since_last_race', 'median'),
        rest_after_dnf_median=('rest_after_dnf', 'median'),
        rest_after_scratch_median=('rest_after_scratch', 'median'),
        distance_jump=('distance_jump', 'sum'),
        surface_changes=('surface_change', 'sum'),
        long_layoffs=('long_layoff', 'sum'),
        badly_beaten=('badly_beaten', 'sum'),
    ).reset_index()

    # Rates: each total divided by the trainer's entry count.
    rate_sources = {
        'scratches_per_entry': 'scratched',
        'dnf_per_entry': 'dnf',
        'badly_beaten_pct': 'badly_beaten',
        'lasix_pct': 'lasix',
        'bute_pct': 'bute',
    }
    for rate_name, total_name in rate_sources.items():
        per_trainer[rate_name] = per_trainer[total_name] / per_trainer['n_entries']

    # TODO: smoothed versions of dnf_per_entry / scratches_per_entry

    return per_trainer


In [13]:
def create_features(df: pd.DataFrame, suffix: str = None) -> pd.DataFrame:
    """Build the trainer-level feature table from entry-level rows.

    Entry indicators and previous-race features are derived first; the
    per-trainer aggregates from ``group_trainer_data`` are then left-joined
    with the ``get_first_long`` ages and the per-trainer ``fit_params`` output.
    Columns whose name contains ``lognorm`` are rounded to 5 decimals.

    Args:
        df: Entry-level rows for the feature window.
        suffix: When given (and non-empty), ``_<suffix>`` is appended to every
            column name except ``trainer_id``.

    Returns:
        One row per trainer.
    """
    entries = get_prev_race_features(extract_features(df))

    lognorm_by_trainer = entries.groupby('trainer_id').apply(fit_params)
    first_long_by_trainer = get_first_long(entries)

    trainers = (
        group_trainer_data(entries)
        .merge(first_long_by_trainer, on=['trainer_id'], how='left')
        .merge(lognorm_by_trainer, on=['trainer_id'], how='left')
    )

    lognorm_columns = [name for name in trainers.columns if 'lognorm' in name]
    for name in lognorm_columns:
        trainers[name] = round(trainers[name], 5)

    if suffix:
        # trainer_id stays unsuffixed so it can still be used as the join key.
        trainers = trainers.rename(
            columns=lambda name: name if name == 'trainer_id' else name + f'_{suffix}'
        )

    return trainers


In [14]:
def create_targets(df: pd.DataFrame):
    """Compute each trainer's DNF rate over the given entries.

    Returns:
        A frame with ``trainer_id``, ``target_n_entries`` (count of
        ``race_number`` per trainer) and ``target`` (DNFs divided by that
        count).
    """
    entries = extract_features(df)

    per_trainer = (
        entries.groupby('trainer_id')
        .agg({
            'race_number': 'count',
            'dnf': 'sum',
            'scratched': 'sum',
            'badly_beaten': 'sum',
            #'long_layoff': 'sum',
        })
        .rename(columns={'race_number': 'n_entries'})
        .reset_index()
    )

    entry_counts = per_trainer['n_entries']
    per_trainer['scratches_per_entry'] = per_trainer['scratched'] / entry_counts
    per_trainer['dnf_per_entry'] = per_trainer['dnf'] / entry_counts
    per_trainer['badly_beaten_pct'] = per_trainer['badly_beaten'] / entry_counts

    # TODO: smoothed versions of dnf_per_entry / scratches_per_entry

    # Only the DNF rate is exposed as the modelling target for now.
    selected = per_trainer[['trainer_id', 'n_entries', 'dnf_per_entry']]
    return selected.rename(columns={
        'dnf_per_entry': 'target',
        'n_entries': 'target_n_entries',
    })

In [15]:

# Build the modelling table: for every prediction date, trainer features come
# from the FEATURE_DAY_DELTA days up to and including that date, and the target
# from the TARGET_DAY_DELTA-day window starting the day after it.
prediction_dates = [
    '2022-07-01',
    '2022-12-01'
]

FEATURE_DAY_DELTA = 365
TARGET_DAY_DELTA = 180

# NOTE(review): the meaning of the positional flag passed to load_data is not
# visible in this notebook — confirm what False selects.
df = load_data(False)
df = get_xDNF(df)
df['race_date'] = pd.to_datetime(df['race_date'])

# Collect the per-date frames and concatenate once at the end, instead of
# growing a DataFrame with pd.concat on every iteration.
model_frames = []
for prediction_date in prediction_dates:
    feature_end_date = datetime.datetime.strptime(prediction_date, '%Y-%m-%d')
    feature_start_date = feature_end_date - datetime.timedelta(days=FEATURE_DAY_DELTA)

    target_start_date = feature_end_date + datetime.timedelta(days=1)
    target_end_date = target_start_date + datetime.timedelta(days=TARGET_DAY_DELTA)

    # .copy(): the feature/target builders add columns, so hand them
    # independent frames rather than slices of `df`.
    df_features = df[(df['race_date'] >= feature_start_date) & (df['race_date'] <= feature_end_date)].copy()
    df_target = df[(df['race_date'] >= target_start_date) & (df['race_date'] <= target_end_date)].copy()

    features = create_features(df_features, str(FEATURE_DAY_DELTA))
    targets = create_targets(df_target)

    # Inner join: keep only trainers present in both windows.
    df_date = features.merge(targets, how='inner', on='trainer_id')
    model_frames.append(df_date)

# pd.concat raises on an empty list, so fall back to an empty frame.
full_model_df = pd.concat(model_frames, ignore_index=True) if model_frames else pd.DataFrame()
    

NameError: name 'feature_start_dates' is not defined

In [None]:
from models.model_builds import build_xgb_regressor, build_linear_regressor
from baseline_model.preprocessing import create_train_test_split

In [None]:
# Split the trainer-level table, then fit the linear baseline on the result.
# NOTE(review): split_column='trainer_id' presumably keeps all rows of a trainer
# in the same partition — confirm in create_train_test_split.
data = create_train_test_split(
    full_model_df,
    test_size=0.2,
    valid_size=0.1,
    split_column='trainer_id',
)

lin_reg_model = build_linear_regressor(data)
#xgb_model = build_xgb_classifier(data)


# TODOs


## Baseline Model
- lambda


## Risk Model
- preprocess
    - distance
    - beyer
    - smoothed values

- workouts
    




In [None]:

# Ratio of each entry's performance figure to the same horse's previous one.

# shift(1) only yields the previous race if each horse's rows are in date
# order, which nothing earlier in the notebook guarantees for `df`.
df = df.sort_values(by=['registration_number', 'race_date'])

# '-' is replaced by '0' before conversion (presumably a "no figure" marker —
# TODO confirm); such rows are then removed by the `> 0` filter below.
# astype(str) keeps the cell re-runnable once the column is already numeric,
# and errors='coerce' turns anything unparseable into NaN instead of silently
# leaving the column as strings.
df['performance_figure'] = pd.to_numeric(
    df['performance_figure'].astype(str).str.replace('-', '0', regex=False),
    errors='coerce',
)
df['previous_performance_figure'] = df.groupby('registration_number')['performance_figure'].shift(1)

prev_perf = df.dropna(subset=['previous_performance_figure', 'performance_figure'])
# Each comparison needs its own parentheses: `&` binds tighter than `>`, so
# `(a) > 0 & (b > 0)` evaluates as `a > (0 & (b > 0))` and ignores `b`.
prev_perf = prev_perf[
    (prev_perf['previous_performance_figure'] > 0) & (prev_perf['performance_figure'] > 0)
].copy()
prev_perf['performance_figure_ratio'] = prev_perf['performance_figure'] / prev_perf['previous_performance_figure']


In [None]:
# baseline testing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
from baseline_model.load_data import load_data
from baseline_model.preprocessing import preprocess_data

In [None]:
# Load and preprocess the data, then drop the `dnf` and `registration_number`
# columns before the frame is passed to the model explainers below.
# NOTE(review): this rebinds `df`, replacing the frame used earlier in the notebook.
df = (
    preprocess_data(load_data(True))
    .drop(columns=['dnf', 'registration_number'])
)

In [None]:
# Load the two persisted baseline models.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# artifacts produced by this project.
with open('output/models/baseline_log_reg_model.pkl', 'rb') as log_reg_file:
    log_reg_model = pickle.load(log_reg_file)

with open('output/models/baseline_xgb_model.pkl', 'rb') as xgb_file:
    xgb_model = pickle.load(xgb_file)

In [None]:
# Logistic-regression coefficients (first row of coef_), one bar per input
# feature, sorted ascending.
ft_imp = pd.DataFrame([
    {'feature': name, 'importance': weight}
    for name, weight in zip(log_reg_model.feature_names_in_, log_reg_model.coef_[0])
]).sort_values(by='importance')

plt.barh(ft_imp['feature'], ft_imp['importance'])
plt.show()


In [None]:
# XGBoost feature_importances_, one bar per input feature, sorted ascending.
ft_imp = pd.DataFrame([
    {'feature': name, 'importance': score}
    for name, score in zip(xgb_model.feature_names_in_, xgb_model.feature_importances_)
]).sort_values(by='importance')

plt.barh(ft_imp['feature'], ft_imp['importance'])
plt.show()

In [None]:
# Explain the XGBoost model with SHAP, using the columns the model was fitted on.
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import shap

shap.initjs()

explainer = shap.TreeExplainer(xgb_model)
model_inputs = df[xgb_model.feature_names_in_]
shap_values = explainer.shap_values(model_inputs)



In [None]:
# Bar-type SHAP summary plot across all model features.
shap.summary_plot(
    shap_values,
    df[xgb_model.feature_names_in_],
    plot_type='bar',
)

In [None]:
# SHAP force plot for a single prediction: the first row of the feature frame.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    df[xgb_model.feature_names_in_].iloc[0, :],
)


In [None]:
# SHAP dependence plot for the `surface_D` feature.
# NOTE: only one feature is named here; the choice of any second (interaction)
# feature is left to shap's defaults.
shap.dependence_plot(
    'surface_D',
    shap_values,
    df[xgb_model.feature_names_in_],
)