In [4]:
import pickle
from pathlib import Path

import pandas as pd

In [5]:
from scripts.flatten_day_batch_pkl import run_batch_flatten
from utils.clean_flattened_df import clean_flattened_dataframe
from utils.add_embedding_indices import add_embedding_indices
from utils.process_text_fields import process_text_fields
from utils.process_text_fields import extract_race_phrases
from utils.embedding_io import save_embeddings_npz
from utils.save_model_ready_df import save_model_ready_pickle

In [6]:
# Pipeline locations, given relative to the notebook's working directory.
# NOTE(review): none of these three names is referenced by the cells visible
# in this notebook — later cells hard-code dated paths such as
# 'data/processed/text_embeddings_2025-03-31.npz'. Confirm whether these are
# still needed or whether the later cells should use them.
input_raw_path = 'data/raw/'
output_df_path = 'data/processed/model_ready_flat.pkl'
output_npz_path = 'data/processed/text_embeddings.npz'

In [7]:
# Run the flatten/merge step. It is called with no arguments, so it uses the
# function's own defaults. Per the log printed by this cell, it processed four
# raw JSON files (1704 runners) and wrote a timestamped CSV and pickle under
# data/processed/.
# NOTE(review): `input_raw_path` from the config cell is not passed in —
# confirm the function's default input directory is the intended one.
run_batch_flatten()

[94m[INFO] 2025-03-28T04-59.json: 362 runners processed[0m
[94m        Valid horse IDs: 360/362[0m
[94m        class_num present: 277/362[0m
[94m[INFO] 2025-03-26T08-58.json: 296 runners processed[0m
[94m        Valid horse IDs: 295/296[0m
[94m        class_num present: 163/296[0m
[94m[INFO] 2025-03-27T07-37.json: 387 runners processed[0m
[94m        Valid horse IDs: 383/387[0m
[94m        class_num present: 256/387[0m
[94m[INFO] 2025-03-29T04-58.json: 659 runners processed[0m
[94m        Valid horse IDs: 653/659[0m
[94m        class_num present: 561/659[0m
[92m[SUCCESS] Flattened 1704 runners from 4 files[0m
[92m[✓] CSV saved to data/processed/2025-03-31T17-22.csv[0m
[92m[✓] Pickle saved to data/processed/2025-03-31T17-22.pkl[0m


In [8]:
# Create a new dataframe from the pickle written by the flatten step.
# The flatten step names its output `YYYY-MM-DDTHH-MM.pkl`, so a hard-coded
# timestamp goes stale as soon as that step is re-run and an older file is
# loaded silently. Pick the newest matching file instead: the zero-padded
# timestamp sorts chronologically as text, so the last sorted entry is the
# latest run. The pattern's fixed length excludes the other pickles kept in
# the same directory (encoders, model-ready frame).
flattened_pickles = sorted(Path('data/processed').glob('????-??-??T??-??.pkl'))
if not flattened_pickles:
    raise FileNotFoundError(
        "No flattened pickle (YYYY-MM-DDTHH-MM.pkl) found in data/processed/; "
        "run the flatten cell first."
    )
flattened_pkl_path = flattened_pickles[-1]  # kept so the loaded file can be inspected
new_df = pd.read_pickle(flattened_pkl_path)

In [9]:
# Inspect column dtypes and non-null counts before cleaning; this is the
# baseline the post-cleaning info() output is compared against.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 59 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   horse_id                 1704 non-null   object 
 1   horse_id_valid           1704 non-null   int64  
 2   course                   1704 non-null   object 
 3   country                  1704 non-null   object 
 4   going                    1704 non-null   object 
 5   GoingStick               0 non-null      object 
 6   distance_f               1704 non-null   float64
 7   field_size               1704 non-null   int64  
 8   race_class               1704 non-null   object 
 9   class_num                1257 non-null   float64
 10  class_label              1257 non-null   object 
 11  type                     1704 non-null   object 
 12  name                     1704 non-null   object 
 13  draw                     1590 non-null   float64
 14  age                     

In [10]:
# Clean the flattened frame and bind the result to a new name.
# Judging by the info() outputs before and after this cell, the column count
# drops from 59 to 58 (the all-null GoingStick column disappears) and several
# object columns (country, going, race_class, class_label, type) become
# `category`.
df_clean = clean_flattened_dataframe(new_df)

In [11]:
# Re-inspect dtypes and non-null counts to confirm what the cleaning changed.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 58 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   horse_id                 1704 non-null   object        
 1   horse_id_valid           1704 non-null   int64         
 2   course                   1704 non-null   object        
 3   country                  1704 non-null   category      
 4   going                    1704 non-null   category      
 5   distance_f               1704 non-null   float64       
 6   field_size               1704 non-null   int64         
 7   race_class               1704 non-null   category      
 8   class_num                1257 non-null   float64       
 9   class_label              1257 non-null   category      
 10  type                     1704 non-null   category      
 11  name                     1704 non-null   object        
 12  draw                     1590 non-

In [12]:
# Add integer `*_idx` columns for the categorical fields and keep the
# encoders that produced them. `encoders` is a dict of LabelEncoder objects
# keyed by field name, as shown where it is displayed in this notebook.
df_indexed, encoders = add_embedding_indices(df_clean)

In [13]:
# Preview only the newly added integer index columns (every column whose name
# contains '_idx', e.g. country_idx, going_idx, venue_idx).
df_indexed.filter(like='_idx').head()

Unnamed: 0,country_idx,going_idx,sex_idx,type_idx,class_label_idx,headgear_idx,race_class_idx,venue_idx
0,4,4,2,1,1,11,0,4
1,4,4,2,1,1,0,0,4
2,4,4,2,1,1,16,0,4
3,4,4,2,1,1,0,0,4
4,4,4,1,1,1,4,0,4


In [14]:
# Confirm the index columns were appended: the frame should now report 66
# columns (58 after cleaning plus the 8 `*_idx` columns).
df_indexed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 66 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   horse_id                 1704 non-null   object        
 1   horse_id_valid           1704 non-null   int64         
 2   course                   1704 non-null   object        
 3   country                  1704 non-null   category      
 4   going                    1704 non-null   category      
 5   distance_f               1704 non-null   float64       
 6   field_size               1704 non-null   int64         
 7   race_class               1704 non-null   category      
 8   class_num                1257 non-null   float64       
 9   class_label              1257 non-null   category      
 10  type                     1704 non-null   category      
 11  name                     1704 non-null   object        
 12  draw                     1590 non-

In [15]:
# Display the encoder mapping returned by add_embedding_indices (one entry per
# encoded field).
encoders

{'country': LabelEncoder(),
 'going': LabelEncoder(),
 'sex': LabelEncoder(),
 'type': LabelEncoder(),
 'class_label': LabelEncoder(),
 'headgear': LabelEncoder(),
 'race_class': LabelEncoder(),
 'venue': LabelEncoder()}

In [16]:
# Persist the encoders dict so the same category -> index mapping can be
# reloaded later. joblib.dump returns the list of file names it wrote, which
# is the value this cell displays.
# NOTE(review): the import lives in this cell rather than with the other
# imports at the top of the notebook, and the run date is hard-coded in the
# file name — both need manual attention when the notebook is re-run.
import joblib
joblib.dump(encoders, 'data/processed/embedding_encoders_2025-03-31.pkl')

['data/processed/embedding_encoders_2025-03-31.pkl']

In [17]:
# Process the two free-text fields. The call returns three values: the
# augmented frame, a dict of embeddings, and the regex features.
# From the outputs in this notebook: each field is embedded in 54 batches,
# and df_nlp ends up with 102 columns, including `mentions_*_comment` /
# `mentions_*_spotlight` flag columns (presumably produced because
# enable_regex=True — confirm against process_text_fields).
# NOTE(review): `regex_features` is not used by any cell visible here.
df_nlp, embeddings_dict, regex_features = process_text_fields(
    df_indexed,
    fields=["comment", "spotlight"],
    enable_regex=True
)

Embedding field: comment


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Embedding field: spotlight


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

In [20]:
# List every column of the text-augmented frame. verbose=True is requested
# explicitly because the frame has 102 columns, which is above pandas' default
# `display.max_info_columns` (100); without it info() would print only a
# truncated summary. (Passing a truthy object positionally has the same effect
# but hides the intent.)
df_nlp.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 102 columns):
 #    Column                                    Dtype         
---   ------                                    -----         
 0    horse_id                                  object        
 1    horse_id_valid                            int64         
 2    course                                    object        
 3    country                                   category      
 4    going                                     category      
 5    distance_f                                float64       
 6    field_size                                int64         
 7    race_class                                category      
 8    class_num                                 float64       
 9    class_label                               category      
 10   type                                      category      
 11   name                                      object        
 12   draw

In [28]:
# Save the embeddings dict to a single .npz archive; the helper reports how
# many embedding blocks it wrote (2 here, one per text field).
# NOTE(review): the target path is hard-coded with the run date and differs
# from `output_npz_path` defined in the config cell — confirm which is intended.
save_embeddings_npz(
    embeddings_dict,
    'data/processed/text_embeddings_2025-03-31.npz'
)

[✓] Saved 2 embedding blocks to data/processed/text_embeddings_2025-03-31.npz


In [30]:
# Reload the saved embeddings from disk and validate their layout.
from utils.embedding_io import load_embeddings_npz

# Optional shape check: every text field is expected to hold one 384-wide
# vector for each of the 1704 runners.
expected = {field: (1704, 384) for field in ("comment", "spotlight")}
embeddings = load_embeddings_npz(
    "data/processed/text_embeddings_2025-03-31.npz", expected_schema=expected
)

[✓] Loaded 2 embedding blocks from data/processed/text_embeddings_2025-03-31.npz
[✓] Schema validation passed.


In [21]:
# Eyeball the first ten rows of the final frame, including the regex flag
# columns at the right-hand end.
df_nlp.head(10)

Unnamed: 0,horse_id,horse_id_valid,course,country,going,distance_f,field_size,race_class,class_num,class_label,...,mentions_fitness_query_comment,mentions_fitness_query_spotlight,mentions_positive_trainer_note_comment,mentions_positive_trainer_note_spotlight,mentions_jockey_combo_comment,mentions_jockey_combo_spotlight,mentions_improver_flag_comment,mentions_improver_flag_spotlight,mentions_loser_flag_comment,mentions_loser_flag_spotlight
0,3679956,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
1,5148102,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
2,2455830,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
3,5301548,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
4,5021592,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
5,5329156,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
6,3279638,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
7,2645561,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
8,4274148,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0
9,5237784,1,Dundalk (AW),IRE,Standard,7.0,11,,,,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Persist the model-ready frame twice: a pickle (keeps dtypes such as
# `category`) and a CSV without the index column for inspection.
# NOTE(review): the dated paths are hard-coded; `output_df_path` from the
# config cell and the imported `save_model_ready_pickle` helper are not used
# here — confirm which is intended.
df_nlp.to_pickle("data/processed/model_ready_2025-03-31.pkl")
df_nlp.to_csv("data/processed/model_ready_2025-03-31.csv", index=False)

In [None]:
# MISCELLANEOUS VALIDATION

In [25]:
# Trainer/jockey stats have failed to populate correctly in the past, so count
# the missing values in every trainer_* and jockey_* column.
column_names = df_nlp.columns
trainer_cols = column_names[column_names.str.startswith("trainer_")].tolist()
jockey_cols = column_names[column_names.str.startswith("jockey_")].tolist()

stats_cols = trainer_cols + jockey_cols
df_nlp[stats_cols].isna().sum()


trainer_id                   0
trainer_ovr_runs            49
trainer_ovr_wins            49
trainer_ovr_win_pct         97
trainer_ovr_profit          49
trainer_last_14_runs        49
trainer_last_14_wins        49
trainer_last_14_win_pct    275
trainer_last_14_profit      49
jockey_id                    0
jockey_ovr_runs             49
jockey_ovr_wins             49
jockey_ovr_win_pct          87
jockey_ovr_profit           49
jockey_last_14_runs         49
jockey_last_14_wins         49
jockey_last_14_win_pct     160
jockey_last_14_profit       49
dtype: int64

In [26]:
# Sanity check: both counts printed below should be zero. A row is flagged
# when its overall run count is positive but its overall win percentage is
# missing; rows with zero or missing runs are deliberately not flagged.
trainer_has_runs = df_nlp['trainer_ovr_runs'].gt(0)
trainer_pct_missing = df_nlp['trainer_ovr_win_pct'].isna()
trainer_issues = df_nlp.loc[trainer_has_runs & trainer_pct_missing]
print(f"Trainer win % missing despite having runs: {len(trainer_issues)}")

jockey_has_runs = df_nlp['jockey_ovr_runs'].gt(0)
jockey_pct_missing = df_nlp['jockey_ovr_win_pct'].isna()
jockey_issues = df_nlp.loc[jockey_has_runs & jockey_pct_missing]
print(f"Jockey win % missing despite having runs: {len(jockey_issues)}")


Trainer win % missing despite having runs: 0
Jockey win % missing despite having runs: 0
