In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA


In [None]:
# Read ../datasets/preprocessed_races.csv into `race_df`, print its
# (rows, columns) shape and preview the first five rows.
race_df = pd.read_csv(filepath_or_buffer='../datasets/preprocessed_races.csv')
print(race_df.shape)
race_df.head(n=5)


In [None]:
# Read ../datasets/weather.csv into `weather_df`, print its (rows, columns)
# shape and preview the first five rows.
weather_df = pd.read_csv(filepath_or_buffer='../datasets/weather.csv')
print(weather_df.shape)
weather_df.head(n=5)


In [None]:
# Read ../datasets/hkjc_horses.csv into `horse_df`, print its (rows, columns)
# shape and preview the first five rows.
horse_df = pd.read_csv(filepath_or_buffer='../datasets/hkjc_horses.csv')
print(horse_df.shape)
horse_df.head(n=5)


### Merge Weather data into Race data

In [None]:
# Inner-join the weather table onto the races using `race_date` as the key:
# only dates present in both tables survive. reset_index() turns `race_date`
# back into an ordinary column and gives the result a fresh 0..n-1 index.
agg_race_df = (
    race_df.set_index('race_date')
    .join(weather_df.set_index('race_date'), how='inner')
    .reset_index()
)
print(agg_race_df.shape)
agg_race_df.head(n=5)


### Merge Horse data into Race data

In [None]:
# Inner-join the horse table onto the merged races using `horse_id` as the key:
# rows whose horse has no entry in `horse_df` are dropped.
# NOTE: this cell rebinds `agg_race_df` from itself, so it is not safe to run
# twice in a row -- re-run the weather merge first.
agg_race_df = (
    agg_race_df.set_index('horse_id')
    .join(horse_df.set_index('horse_id'), how='inner')
    .reset_index()
)
print(agg_race_df.shape)
agg_race_df.head(n=5)


In [None]:
# Number of missing values in each column of the merged frame.
agg_race_df.isnull().sum()


# Feature Engineering

### Analysing Weather Features

In [None]:
# Weather columns followed by the three outcome columns
# (finishing_position, finish_time_s, race_speed) they are compared against.
WEATHER_FEATURES = [
    'temperature_2m_max',
    'temperature_2m_min',
    'temperature_2m_mean',
    'precipitation_sum',
    'rain_sum',
    'wind_speed_10m_max',
    'wind_gusts_10m_max',
    'wind_direction_10m_dominant',
    'finishing_position',
    'finish_time_s',
    'race_speed',
]

weather_features_df = agg_race_df.loc[:, WEATHER_FEATURES]
print(weather_features_df.shape)
weather_features_df.head(n=5)


In [None]:
# Heatmap of the pairwise correlations (DataFrame.corr) between the raw
# weather columns and the outcome columns.
fig = px.imshow(
    img=weather_features_df.corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


In [None]:
# Weather columns to standardise (the outcome columns are left unscaled).
SCALED_WEATHER_FEATURES = [
    'temperature_2m_max',
    'temperature_2m_min',
    'temperature_2m_mean',
    'precipitation_sum',
    'rain_sum',
    'wind_speed_10m_max',
    'wind_gusts_10m_max',
    'wind_direction_10m_dominant',
]

# Fit the scaler on the selected columns, transform them, and wrap the
# resulting array back into a DataFrame with the same column names.
scaler = StandardScaler().fit(weather_features_df[SCALED_WEATHER_FEATURES])
scaled_weather_features_np = scaler.transform(weather_features_df[SCALED_WEATHER_FEATURES])
scaled_weather_features_df = pd.DataFrame(
    data=scaled_weather_features_np,
    columns=SCALED_WEATHER_FEATURES,
)
print(scaled_weather_features_df.shape)
scaled_weather_features_df.head(n=5)


In [None]:
# Append the (unscaled) outcome columns to the scaled weather columns.
# The left-hand side is restricted to SCALED_WEATHER_FEATURES so that running
# this cell more than once does not append a second copy of the outcomes.
scaled_weather_features_df = pd.concat([
    scaled_weather_features_df[SCALED_WEATHER_FEATURES],
    weather_features_df[['finishing_position', 'finish_time_s', 'race_speed']]
], axis=1)

print(scaled_weather_features_df.shape)
scaled_weather_features_df.head()


In [None]:
# Heatmap of the pairwise correlations between the standardised weather
# columns and the outcome columns.
fig = px.imshow(
    img=scaled_weather_features_df.corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


### Analysing Race Features

In [None]:
# Numeric race columns plus the three outcome columns.
RACE_FEATURES = [
    'race_distance', 'additional_weight', 'proportion_of_additional_weight', 
    'horse_number', 'draw', 'finishing_position', 'finish_time_s', 'race_speed'
]

# Categorical race columns (plus horse_number/draw and the outcome columns).
RACE_CAT_FEATURES = [
    'race_course', 'race_class', 'track_condition', 'track', 'horse_number', 'draw',
    'finishing_position', 'finish_time_s', 'race_speed'
]

# Explicit copy: later cells add columns to this frame, and assigning into a
# bare slice of `agg_race_df` triggers pandas' SettingWithCopyWarning.
race_features_df = agg_race_df[RACE_FEATURES].copy()
print(race_features_df.shape)
race_features_df.head()


In [None]:
# Explicit copy: later cells add columns to this frame, and assigning into a
# bare slice of `agg_race_df` triggers pandas' SettingWithCopyWarning.
race_cat_features_df = agg_race_df[RACE_CAT_FEATURES].copy()
# Preview the frame that was just created (not `race_features_df`).
print(race_cat_features_df.shape)
race_cat_features_df.head()


In [None]:
# Heatmap of the pairwise correlations between the numeric race columns and
# the outcome columns.
fig = px.imshow(
    img=race_features_df.corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


In [None]:
# Numeric race columns to standardise.
SCALED_RACE_FEATURES = [
    'race_distance', 'additional_weight', 'proportion_of_additional_weight',
]

scaler = StandardScaler()
scaled_race_features_np = scaler.fit_transform(race_features_df[SCALED_RACE_FEATURES])
# Label the scaled array with the columns that were actually scaled
# (the previous `FEATURES` name is not defined anywhere in this notebook).
scaled_race_features_df = pd.DataFrame(scaled_race_features_np, columns=SCALED_RACE_FEATURES)
scaled_race_features_df.head()


In [None]:
# Heatmap of the pairwise correlations between the standardised race columns.
fig = px.imshow(
    img=scaled_race_features_df.corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


### Analysing categorical race features

In [None]:
# Keep only the rows of race winners (finishing_position equal to 1).
race_cat_winner_df = race_cat_features_df.loc[
    race_cat_features_df['finishing_position'].eq(1)
]
print(race_cat_winner_df.shape)
race_cat_winner_df.head(n=5)


In [None]:
# Number of wins recorded for each draw value, as a two-column table.
draw_count = race_cat_winner_df['draw'].value_counts()
draw_count_df = draw_count.to_frame().reset_index()
draw_count_df.columns = ['draw', 'wins']

fig = px.bar(
    data_frame=draw_count_df,
    x='draw',
    y='wins',
    title='Wins by Draw Position',
)
fig.show()


In [None]:
# Fraction of starts from each draw that ended in a win: wins per draw divided
# by all starts per draw (the two Series align on the draw value).
total_draw_count = race_cat_features_df['draw'].value_counts()
win_count = race_cat_winner_df['draw'].value_counts()
win_percentage = win_count.div(total_draw_count)
win_percentage_df = win_percentage.to_frame().reset_index()
win_percentage_df.columns = ['draw', 'win_percentage']

fig = px.bar(
    data_frame=win_percentage_df,
    x='draw',
    y='win_percentage',
    title='Probability of Winning given a Draw Position',
)
fig.show()


In [None]:
# Number of wins recorded for each horse number, as a two-column table.
horse_number_count = race_cat_winner_df['horse_number'].value_counts()
horse_number_count_df = horse_number_count.to_frame().reset_index()
horse_number_count_df.columns = ['horse_number', 'wins']

fig = px.bar(
    data_frame=horse_number_count_df,
    x='horse_number',
    y='wins',
    title='Wins by Horse Number',
)
fig.show()


In [None]:
# Fraction of starts with each horse number that ended in a win: wins per
# number divided by all starts per number (aligned on the horse number).
total_horse_number_count = race_cat_features_df['horse_number'].value_counts()
win_count = race_cat_winner_df['horse_number'].value_counts()
win_percentage = win_count.div(total_horse_number_count)
win_percentage_df = win_percentage.to_frame().reset_index()
win_percentage_df.columns = ['horse_number', 'win_percentage']

fig = px.bar(
    data_frame=win_percentage_df,
    x='horse_number',
    y='win_percentage',
    title='Probability of Winning given a Horse Number',
)
fig.show()


In [None]:
# Box plot of finish_time_s grouped by race_class.
fig = px.box(
    data_frame=race_cat_features_df,
    x='race_class',
    y='finish_time_s',
    title='Finish Time Distribution for Different Race Classes',
)
fig.show()


In [None]:
# Box plot of finish_time_s grouped by track.
fig = px.box(
    data_frame=race_cat_features_df,
    x='track',
    y='finish_time_s',
    title='Finish Time Distribution for Different Tracks',
)
fig.show()


In [None]:
# Box plot of finish_time_s grouped by track_condition.
fig = px.box(
    data_frame=race_cat_features_df,
    x='track_condition',
    y='finish_time_s',
    title='Finish Time Distribution for Different Track Conditions',
)
fig.show()


In [None]:
def interpret_track_widths(row):
    """Return the numeric track width for one race row.

    Dispatches on ``row['race_course']`` to the course-specific lookup and
    passes it ``row['track']``. Returns ``None`` when the race course is
    neither 'Sha Tin' nor 'Happy Valley'.
    """
    course = row['race_course']
    if course == 'Sha Tin':
        return interpret_sha_tin_track_widths(row['track'])
    if course == 'Happy Valley':
        return interpret_happy_valley_track_widths(row['track'])
    return None

def interpret_sha_tin_track_widths(row):
    """Map a Sha Tin ``track`` label to its width value.

    ``row`` is the track label itself (not a DataFrame row). Labels that are
    not in the table, including NaN, yield ``None``.
    """
    widths = {
        'TURF - "A" COURSE': 30.5,
        'TURF - "A+3" COURSE': 27.5,
        'TURF - "B" COURSE': 26,
        'TURF - "B+2" COURSE': 24,
        'TURF - "C" COURSE': 21.3,
        'TURF - "C+3" COURSE': 18.3,
        'ALL WEATHER TRACK': 22.8,
    }
    return widths.get(row)

def interpret_happy_valley_track_widths(row):
    """Map a Happy Valley ``track`` label to its width value.

    ``row`` is the track label itself (not a DataFrame row). Labels that are
    not in the table, including 'ALL WEATHER TRACK' and NaN, yield ``None``.
    """
    widths = {
        'TURF - "A" COURSE': 30.5,
        'TURF - "A+3" COURSE': 27.5,
        'TURF - "B" COURSE': 26.5,
        'TURF - "B+2" COURSE': 24.5,
        'TURF - "C" COURSE': 22.5,
        'TURF - "C+3" COURSE': 19.5,
    }
    return widths.get(row)

# Derive a numeric `track_width` column from (race_course, track).
# `assign` returns a new frame instead of writing into what may be a slice of
# `agg_race_df`, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
race_cat_features_df = race_cat_features_df.assign(
    track_width=race_cat_features_df.apply(interpret_track_widths, axis=1)
)
race_cat_features_df.head()


In [None]:
def interpret_track_conditions(row):
    """Return the numeric track-moisture value for one race row.

    Rows whose ``track`` is 'ALL WEATHER TRACK' use the all-weather lookup;
    every other row uses the turf lookup. Both receive
    ``row['track_condition']``.
    """
    interpret = (
        interpret_all_weather_track_conditions
        if row['track'] == 'ALL WEATHER TRACK'
        else interpret_turf_track_conditions
    )
    return interpret(row['track_condition'])

def interpret_turf_track_conditions(row):
    """Map a turf ``track_condition`` label to its numeric value.

    ``row`` is the condition label itself. 'YIELDING TO SOFT' and 'SOFT'
    share the value 3.5. Labels not in the table, including NaN, yield
    ``None``.
    """
    moisture = {
        'GOOD TO FIRM': 2.5,
        'GOOD': 2.75,
        'GOOD TO YIELDING': 3,
        'YIELDING': 3.25,
        'YIELDING TO SOFT': 3.5,
        'SOFT': 3.5,
    }
    return moisture.get(row)

def interpret_all_weather_track_conditions(row):
    """Map an all-weather ``track_condition`` label to its numeric value.

    ``row`` is the condition label itself. Labels that are not recognised,
    including missing values (NaN), yield ``None``.

    The last branch used to be ``row in 'WET SLOW'``, a substring test that
    also matched '', 'WET', 'W', ... and raised ``TypeError`` for NaN. It is
    now an exact membership test. 'SLOW' is kept alongside 'WET SLOW' because
    the substring test mapped it to 3.5 as well -- TODO confirm that is the
    intended value for 'SLOW'.
    """
    if row == 'FAST':
        return 2.5
    elif row == 'GOOD':
        return 2.75
    elif row == 'WET FAST':
        return 3.25
    elif isinstance(row, str) and row in {'WET SLOW', 'SLOW'}:
        return 3.5
    return None

# Derive a numeric `track_moisture` column from (track, track_condition).
# `assign` returns a new frame instead of writing into what may be a slice of
# `agg_race_df`, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
race_cat_features_df = race_cat_features_df.assign(
    track_moisture=race_cat_features_df.apply(interpret_track_conditions, axis=1)
)
race_cat_features_df.head()


In [None]:
# Rows for which no width / moisture value could be derived (the lookups
# return None for labels they do not know).
race_cat_features_df.loc[:, ['track_width', 'track_moisture']].isnull().sum()


In [None]:
# Heatmap of the pairwise correlations between the derived track columns and
# the outcome columns.
fig = px.imshow(
    img=race_cat_features_df[
        ['track_width', 'track_moisture', 'finishing_position', 'finish_time_s', 'race_speed']
    ].corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


In [None]:
# Standardise the two derived track columns and wrap the result back into a
# DataFrame with the same column names.
scaler = StandardScaler().fit(race_cat_features_df[['track_width', 'track_moisture']])
scaled_race_cat_features_np = scaler.transform(race_cat_features_df[['track_width', 'track_moisture']])
scaled_race_cat_features_df = pd.DataFrame(
    data=scaled_race_cat_features_np,
    columns=['track_width', 'track_moisture'],
)
scaled_race_cat_features_df.head(n=5)


In [None]:
# Append the (unscaled) outcome columns to the scaled track columns.
# The left-hand side is restricted to the two scaled columns so that running
# this cell more than once does not append a second copy of the outcomes.
scaled_race_cat_features_df = pd.concat([
    scaled_race_cat_features_df[['track_width', 'track_moisture']],
    race_cat_features_df[['finishing_position', 'finish_time_s', 'race_speed']]
], axis=1)

# Heatmap of the pairwise correlations of the combined frame.
fig = px.imshow(
    scaled_race_cat_features_df.corr(),
    labels=dict(color="Correlation")
)

fig.update_layout(
    title="Correlation Heatmap",
)

fig.show()


### Clustering similar value classes for categorical race features based on conditional probability

In [None]:
def segment_draw(row):
    """Bucket a draw value into one of three segments.

    Returns 1 for values up to 4, 2 for values up to 8, and 3 for anything
    else (which includes NaN, since both comparisons are then false).
    """
    return 1 if row <= 4 else (2 if row <= 8 else 3)

# Add the `draw_segments` column. `assign` returns a new frame instead of
# writing into what may be a slice of `agg_race_df`, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
race_features_df = race_features_df.assign(
    draw_segments=race_features_df['draw'].apply(segment_draw)
)

# Fraction of starts in each segment that ended in a win: wins per segment
# divided by all starts per segment (aligned on the segment value).
total_draw_count = race_features_df['draw_segments'].value_counts()
win_count = race_features_df[race_features_df['finishing_position'] == 1]['draw_segments'].value_counts()
win_percentage = (win_count / total_draw_count)
win_percentage_df = pd.DataFrame(win_percentage).reset_index()
win_percentage_df.columns = ['draw_segments', 'win_percentage']

fig = px.bar(
    win_percentage_df, x='draw_segments', y='win_percentage',
    title='Probability of Winning given a Draw Position by Segment',
)
fig.show()


In [None]:
def segment_horse_number(row):
    """Bucket a horse number into one of three segments.

    Returns 1 for values up to 4, 2 for values up to 8, and 3 for anything
    else (which includes NaN, since both comparisons are then false).
    """
    return 1 if row <= 4 else (2 if row <= 8 else 3)

# Add the `horse_number_segments` column. `assign` returns a new frame instead
# of writing into what may be a slice of `agg_race_df`, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
race_features_df = race_features_df.assign(
    horse_number_segments=race_features_df['horse_number'].apply(segment_horse_number)
)

# Fraction of starts in each segment that ended in a win: wins per segment
# divided by all starts per segment (aligned on the segment value).
total_horse_number_count = race_features_df['horse_number_segments'].value_counts()
win_count = race_features_df[race_features_df['finishing_position'] == 1]['horse_number_segments'].value_counts()
win_percentage = (win_count / total_horse_number_count)
win_percentage_df = pd.DataFrame(win_percentage).reset_index()
win_percentage_df.columns = ['horse_number_segments', 'win_percentage']

fig = px.bar(
    win_percentage_df, x='horse_number_segments', y='win_percentage',
    title='Probability of Winning given a Horse Number by Segment',
)
fig.show()


### Analysing Horse Features

In [None]:
# Horse attributes (categorical and numeric) plus the three outcome columns.
HORSE_FEATURES = [
    'country', 'colour', 'sex', 'import_type', 'total_stakes', 'wins', 
    'places', 'total_races', 'win_freq', 'place_freq', 'average_rating', 
    'average_placing', 'average_race_class', 'average_speed', 
    'finishing_position', 'finish_time_s', 'race_speed'
]

# Explicit copy: later cells add columns to this frame, and assigning into a
# bare slice of `agg_race_df` triggers pandas' SettingWithCopyWarning.
horse_features_df = agg_race_df[HORSE_FEATURES].copy()
print(horse_features_df.shape)
horse_features_df.head()


In [None]:
# Rows of race winners (finishing_position equal to 1), re-indexed 0..n-1.
# reset_index is chained rather than applied with inplace=True, because
# mutating a filtered slice in place can raise SettingWithCopyWarning.
horse_winner_df = (
    horse_features_df[horse_features_df['finishing_position'] == 1]
    .reset_index(drop=True)
)
print(horse_winner_df.shape)
horse_winner_df.head()


In [None]:
# Number of wins recorded for each country value, as a two-column table.
country_count = horse_winner_df['country'].value_counts()
country_count_df = country_count.to_frame().reset_index()
country_count_df.columns = ['country', 'wins']

fig = px.bar(
    data_frame=country_count_df,
    x='country',
    y='wins',
    title='Wins by Country',
)
fig.show()


In [None]:
# Fraction of starts per country that ended in a win: `country_count` (wins,
# computed in the previous cell) divided by all starts per country. Countries
# with no wins come out as NaN because they are missing from `country_count`.
total_country_count = horse_features_df['country'].value_counts()
win_percentage = country_count.div(total_country_count)
win_percentage_df = win_percentage.to_frame().reset_index()
win_percentage_df.columns = ['country', 'win_percentage']

# Plot in ascending order of win fraction.
win_percentage_df_sorted = win_percentage_df.sort_values(by='win_percentage')
fig = px.bar(
    data_frame=win_percentage_df_sorted,
    x='country',
    y='win_percentage',
    title='Probability of Winning given a Country',
)
fig.show()


In [None]:
gender_count = horse_winner_df['gender'].value_counts()
gender_count_df = pd.DataFrame(gender_count).reset_index()
gender_count_df.columns = ['gender', 'wins']

fig = px.bar(
    sex_count_df, x='gender', y='wins',
    title='Wins by Gender',
)

fig.show()


In [None]:
total_gender_count = horse_features_df['gender'].value_counts()
win_percentage = (gender_count / total_gender_count)
win_percentage_df = pd.DataFrame(win_percentage).reset_index()
win_percentage_df.columns = ['gender', 'win_percentage']

win_percentage_df_sorted = win_percentage_df.sort_values(by='win_percentage')
fig = px.bar(
    win_percentage_df_sorted, x='gender', y='win_percentage',
    title='Probability of Winning given a Gender',
)

fig.show()


In [None]:
# Number of wins recorded for each colour value, as a two-column table.
colour_count = horse_winner_df['colour'].value_counts()
colour_count_df = colour_count.to_frame().reset_index()
colour_count_df.columns = ['colour', 'wins']

fig = px.bar(
    data_frame=colour_count_df,
    x='colour',
    y='wins',
    title='Wins by Colour',
)
fig.show()


In [None]:
# Fraction of starts per colour that ended in a win: `colour_count` (wins,
# computed in the previous cell) divided by all starts per colour.
total_colour_count = horse_features_df['colour'].value_counts()
win_percentage = colour_count.div(total_colour_count)
win_percentage_df = win_percentage.to_frame().reset_index()
win_percentage_df.columns = ['colour', 'win_percentage']

# Plot in ascending order of win fraction.
win_percentage_df_sorted = win_percentage_df.sort_values(by='win_percentage')
fig = px.bar(
    data_frame=win_percentage_df_sorted,
    x='colour',
    y='win_percentage',
    title='Probability of Winning given a Colour',
)
fig.show()


### Clustering similar value classes for categorical horse features based on conditional probability

In [None]:
def segment_colour(row):
    """Map a colour label to an integer segment.

    'Dark Bay' -> 4, 'Black' -> 3, 'Grey' -> 1; every other value (including
    NaN) -> 2. The ordering presumably follows the win rates plotted above --
    TODO confirm.
    """
    segments = {'Dark Bay': 4, 'Black': 3, 'Grey': 1}
    return segments.get(row, 2)

def segment_sex(row):
    """Map a sex label to an integer segment.

    'Horse' -> 4, 'Gelding' -> 3, 'Mare' -> 2; every other value (including
    NaN) -> 1. The ordering presumably follows the win rates plotted above --
    TODO confirm.
    """
    segments = {'Horse': 4, 'Gelding': 3, 'Mare': 2}
    return segments.get(row, 1)

def segment_country(row):
    """Map a country code to an integer segment.

    {'AUS', 'NZ', 'SAF'} -> 4, {'USA', 'GB', 'IRE'} -> 3,
    {'ARG', 'GER', 'FR'} -> 2; every other value (including NaN) -> 1.
    The grouping presumably follows the win rates plotted above --
    TODO confirm.
    """
    groups = (
        (4, {'AUS', 'NZ', 'SAF'}),
        (3, {'USA', 'GB', 'IRE'}),
        (2, {'ARG', 'GER', 'FR'}),
    )
    for segment, countries in groups:
        if row in countries:
            return segment
    return 1

# Add the three integer segment columns. `assign` returns a new frame instead
# of writing into what may be a slice of `agg_race_df`, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
horse_features_df = horse_features_df.assign(
    colour_segment=horse_features_df['colour'].apply(segment_colour),
    sex_segment=horse_features_df['sex'].apply(segment_sex),
    country_segment=horse_features_df['country'].apply(segment_country),
)

horse_features_df.head()


In [None]:
# Heatmap of the pairwise correlations between the numeric horse columns.
# `horse_features_df` also holds string columns (country, colour, sex,
# import_type); recent pandas versions raise when corr() meets non-numeric
# data, so restrict the frame to numeric dtypes first.
fig = px.imshow(
    horse_features_df.select_dtypes(include='number').corr(),
    labels=dict(color="Correlation")
)

fig.update_layout(
    title="Correlation Heatmap",
)

fig.show()


In [None]:
# Show the (rows, columns) shape and the column labels after adding segments.
print(f"{horse_features_df.shape}")
horse_features_df.columns


### Scaling numerical horse features

In [None]:
# Numeric horse columns to standardise.
SCALED_HORSE_FEATURES = [
    'total_stakes',
    'wins',
    'places',
    'total_races',
    'win_freq',
    'place_freq',
    'average_rating',
    'average_placing',
    'average_race_class',
    'average_speed',
]

# Fit the scaler on the selected columns, transform them, and wrap the
# resulting array back into a DataFrame with the same column names.
scaler = StandardScaler().fit(horse_features_df[SCALED_HORSE_FEATURES])
scaled_horse_features_np = scaler.transform(horse_features_df[SCALED_HORSE_FEATURES])
scaled_horse_features_df = pd.DataFrame(
    data=scaled_horse_features_np,
    columns=SCALED_HORSE_FEATURES,
)
print(scaled_horse_features_df.shape)
scaled_horse_features_df.head(n=5)


In [None]:
# Append the (unscaled) outcome columns to the scaled horse columns.
# The left-hand side is restricted to SCALED_HORSE_FEATURES so that running
# this cell more than once does not append a second copy of the outcomes.
scaled_horse_features_df = pd.concat([
    scaled_horse_features_df[SCALED_HORSE_FEATURES],
    horse_features_df[['finishing_position', 'finish_time_s', 'race_speed']]
], axis=1)

print(scaled_horse_features_df.shape)
scaled_horse_features_df.head()


In [None]:
# Heatmap of the pairwise correlations between the standardised horse columns
# and the outcome columns.
fig = px.imshow(
    img=scaled_horse_features_df.corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


# Final Features

In [None]:
FINAL_SCALED_WEATHER_FEATURES = ['temperature_2m_max', 'wind_speed_10m_max']

FINAL_RACE_FEATURES = ['draw_segments', 'horse_number_segments']
FINAL_SCALED_RACE_CAT_FEATURES = ['track_width', 'track_moisture']
FINAL_SCALED_RACE_FEATURES = [
    'race_distance', 'proportion_of_additional_weight', 'track_width', 'track_moisture'
]

FINAL_HORSE_FEATURES = ['colour_segment', 'sex_segment', 'country_segment']
FINAL_SCALED_HORSE_FEATURES = [
    'total_stakes', 'win_freq', 'place_freq', 'average_placing', 'average_speed', 
    'average_rating', 'average_race_class'
]

OUTCOMES = ['finish_time_s', 'race_speed']


In [None]:
final_df = pd.concat([
    scaled_weather_features_df[FINAL_SCALED_WEATHER_FEATURES],
    race_features_df[FINAL_RACE_FEATURES],
    scaled_race_cat_features[FINAL_SCALED_RACE_CAT_FEATURES],
    scaled_race_features_df[FINAL_SCALED_RACE_FEATURES],
    horse_features_df[FINAL_HORSE_FEATURES],
    scaled_horse_features_df[FINAL_SCALED_HORSE_FEATURES],
    agg_race_df[OUTCOMES]
], axis=1)

print(final_df.shape)
final_df.head()


In [None]:
# Write the final feature table to final.csv without the row index.
final_df.to_csv(path_or_buf='final.csv', index=False)


# Dimension Reduction using PCA

In [None]:
# Feature matrix for PCA: every selected feature column, without the outcomes.
# - The weather columns are selected with FINAL_SCALED_WEATHER_FEATURES
#   (`FINAL_WEATHER_FEATURES` is not defined anywhere in this notebook).
# - track_width / track_moisture are taken from `scaled_race_cat_features_df`,
#   the only frame that holds them; any such names listed in
#   FINAL_SCALED_RACE_FEATURES are skipped when indexing
#   `scaled_race_features_df` so they are neither missing nor duplicated.
final_features_df = pd.concat([
    scaled_weather_features_df[FINAL_SCALED_WEATHER_FEATURES],
    race_features_df[FINAL_RACE_FEATURES],
    scaled_race_cat_features_df[FINAL_SCALED_RACE_CAT_FEATURES],
    scaled_race_features_df[
        [col for col in FINAL_SCALED_RACE_FEATURES if col not in FINAL_SCALED_RACE_CAT_FEATURES]
    ],
    horse_features_df[FINAL_HORSE_FEATURES],
    scaled_horse_features_df[FINAL_SCALED_HORSE_FEATURES]
], axis=1)


In [None]:
# Project the feature matrix onto its first three principal components and
# report the share of variance each component explains.
pca = PCA(n_components=3)
transformed_data = pca.fit_transform(final_features_df)
print(f"Variance explained by each component: {pca.explained_variance_ratio_}")

# One column per component, labelled as in the exported CSV.
pca_df = pd.DataFrame(
    data=transformed_data,
    columns=["PCA 1", "PCA 2", "PCA3"],
)
pca_df.head(n=5)


In [None]:
# Put the outcome columns next to the principal components (aligned on the
# shared row index).
final_pca_df = pd.concat([pca_df, agg_race_df.loc[:, OUTCOMES]], axis=1)
final_pca_df.head(n=5)


In [None]:
# Heatmap of the pairwise correlations between the principal components and
# the outcome columns.
fig = px.imshow(
    img=final_pca_df.corr(),
    labels={"color": "Correlation"},
)
fig.update_layout(title="Correlation Heatmap")
fig.show()


In [None]:
# Write the PCA components plus outcomes to pca.csv without the row index.
final_pca_df.to_csv(path_or_buf='pca.csv', index=False)
