In [106]:
import polars as pl
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
forward_df = pl.read_csv('C:\\Users\\jeetg\\code\\horse race prediction\\cleaned_forward.csv')
horse_df = pl.read_csv('C:\\Users\\jeetg\\code\\horse race prediction\\cleaned_horse.csv')
race_df = pl.read_csv('C:\\Users\\jeetg\\code\\horse race prediction\\cleaned_race.csv')

In [35]:
def preprocess_data(forward_df, horse_df, race_df):
    # Handle missing values
    forward_df = forward_df.fill_nan(None).fill_null('Unknown')
    horse_df = horse_df.fill_nan(None).fill_null('Unknown')
    race_df = race_df.fill_nan(None).fill_null('Unknown')

    # Normalize numerical features if necessary
    numerical_features = ['RPRc', 'TRc', 'OR', 'weightSt', 'weightLb', 'age', 'decimalPrice']  # Add other numerical features as needed
    for feature in numerical_features:
        mean = forward_df[feature].mean()
        std = forward_df[feature].std()
        forward_df = forward_df.with_columns(((forward_df[feature] - mean) / std).alias(feature))

    # Encode categorical features
    label_encoders = {}
    categorical_features = ['countryCode', 'condition', 'horseName', 'trainerName', 'jockeyName']  # Add other categorical features as needed
    for feature in categorical_features:
        le = LabelEncoder()
        forward_df = forward_df.with_columns(pl.Series(feature, le.fit_transform(forward_df[feature].to_list())))
        label_encoders[feature] = le

    return forward_df, horse_df, race_df, label_encoders


In [36]:
forward_df, horse_df, race_df, label_encoders = preprocess_data(forward_df, horse_df, race_df)

In [37]:
print("forward: ",forward_df.columns)
print("race: ",race_df.columns)
print("horse: ",horse_df.columns)

forward:  ['course', 'countryCode', 'marketTime', 'title', 'runners', 'condition', 'prize', 'rclass', 'horseName', 'trainerName', 'jockeyName', 'RPRc', 'TRc', 'OR', 'weightSt', 'weightLb', 'age', 'decimalPrice']
race:  ['rid', 'course', 'time', 'date', 'title', 'rclass', 'band', 'ages', 'distance', 'condition', 'hurdles', 'prizes', 'winningTime', 'prize', 'metric', 'countryCode', 'ncond', 'class', 'currency']
horse:  ['rid', 'horseName', 'age', 'saddle', 'decimalPrice', 'isFav', 'trainerName', 'jockeyName', 'position', 'positionL', 'dist', 'weightSt', 'weightLb', 'overWeight', 'outHandicap', 'headGear', 'RPR', 'TR', 'OR', 'father', 'mother', 'gfather', 'runners', 'margin', 'weight', 'res_win', 'res_place', 'price']


In [44]:
forward_df = forward_df.with_columns([
    pl.col("course").str.strip_chars().str.to_lowercase(),
    pl.col("horseName").cast(pl.Utf8)
])
horse_df = horse_df.with_columns(pl.col("horseName").cast(pl.Utf8))
race_df = race_df.with_columns(pl.col("course").str.strip_chars().str.to_lowercase())



In [41]:
print("Data types in forward_df:", forward_df.dtypes)
print("Data types in horse_df:", horse_df.dtypes)
print("Data types in race_df:", race_df.dtypes)


Data types in forward_df: [String, Int64, String, String, Int64, Int64, Float64, String, String, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
Data types in horse_df: [Float64, String, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Float64, Int64, Int64, Int64, Int64, Float64, Int64, Float64, Float64, Int64]
Data types in race_df: [Int64, String, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64]


In [45]:
common_courses = set(forward_df['course'].to_list()).intersection(set(race_df['course'].to_list()))
print("Number of common courses:", len(common_courses))
print("Sample common courses:", list(common_courses)[:10])

Number of common courses: 0
Sample common courses: []


In [88]:
def feature_engineering(forward_df, horse_df, race_df):
    # Ensure the join keys are of the same datatype and normalized
    forward_df = forward_df.with_columns([
        pl.col("horseName").cast(pl.Utf8),
        pl.col("course").str.strip_chars().str.to_lowercase()
    ])
    horse_df = horse_df.with_columns(pl.col("horseName").cast(pl.Utf8))
    race_df = race_df.with_columns(pl.col("course").str.strip_chars().str.to_lowercase())

    # Join dataframes
    df_temp = forward_df.join(horse_df, on='horseName', how='inner')
    print("Shape after joining forward_df and horse_df:", df_temp.shape)
    df = df_temp.join(race_df, on='course', how='left', coalesce=True)
    print("Shape after joining with race_df:", df.shape)

    # Create target variable
    if 'position' in horse_df.columns:
        df = df.with_columns((df['position'] == 1).cast(pl.Int8).alias('target'))
    elif 'position' in race_df.columns:
        df = df.with_columns((df['position'] == 1).cast(pl.Int8).alias('target'))
    else:
        raise KeyError("Column 'position' not found in horse_df or race_df")

        # Convert marketTime to datetime and extract features
    df = df.with_columns(pl.col("marketTime").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S%z").alias("marketTime"))
    df = df.with_columns([
        pl.col("marketTime").dt.year().alias("marketYear"),
        pl.col("marketTime").dt.month().alias("marketMonth"),
        pl.col("marketTime").dt.day().alias("marketDay"),
        pl.col("marketTime").dt.hour().alias("marketHour"),
        pl.col("marketTime").dt.minute().alias("marketMinute"),
    ])

    # Drop the original marketTime column
    df = df.drop("marketTime")


    return df

In [89]:
df = feature_engineering(forward_df, horse_df, race_df)

Shape after joining forward_df and horse_df: (364653, 45)
Shape after joining with race_df: (364653, 63)


In [29]:
forward_df = forward_df.with_columns(pl.col("horseName").cast(pl.Utf8))
horse_df = horse_df.with_columns(pl.col("horseName").cast(pl.Utf8))
forward_df = forward_df.with_columns(pl.col("course").cast(pl.Utf8))
race_df = race_df.with_columns(pl.col("course").cast(pl.Utf8))


In [90]:
df_pd = df.to_pandas()

In [91]:
print(df_pd)

              course  countryCode  \
0          newmarket            0   
1             redcar            0   
2       market rasen            0   
3         huntingdon            0   
4          kilbeggan            0   
...              ...          ...   
364648      fontwell            0   
364649      wetherby            0   
364650        exeter            0   
364651     wincanton            0   
364652     catterick            0   

                                                    title  runners  condition  \
0                 Close Brothers Invoice Finance Handicap        9          3   
1                        Racing TV Straight-Mile Handicap       16          7   
2                  Thank You Pipers Crisps Handicap Chase       10          2   
3          MansionBet Best Odds Guaranteed Handicap Chase        9          2   
4       Kilmurray's Homevalue Hardware Mullingar Handi...        9         12   
...                                                   ...      ...   

In [92]:
df_pd.dropna(subset=['target'], inplace=True)

In [111]:
all_missing = df_pd.columns[df_pd.isnull().all()]

# Drop columns with all missing values
df_pd = df_pd.drop(columns=all_missing)

# Ensure correct column selection for numerical features
numerical_features = df_pd.select_dtypes(include=['float64', 'int64']).columns
categorical_features = df_pd.select_dtypes(include=['object']).columns

# Fill missing values with the median for numerical features
imputer = SimpleImputer(strategy='median')
df_pd[numerical_features] = imputer.fit_transform(df_pd[numerical_features])


In [115]:
# Encode categorical features
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df_pd[feature] = le.fit_transform(df_pd[feature].astype(str))
    label_encoders[feature] = le

In [96]:
df_pd.head()

Unnamed: 0,course,countryCode,title,runners,condition,prize,rclass,horseName,trainerName,jockeyName,...,countryCode_right,ncond,class,currency,target,marketYear,marketMonth,marketDay,marketHour,marketMinute
0,62,0,449,9,7,7812.0,Class 4,3842,1234,135,...,,,,,0,2020,9,19,14,15
1,69,0,1686,16,11,8733.0,Class 4,3842,1234,980,...,,,,,0,2020,10,3,15,35
2,54,0,1982,10,6,6433.5,Class 5,7432,343,1024,...,,,,,0,2020,9,26,12,35
3,38,0,1340,9,6,6433.5,Class 4,7432,343,715,...,,,,,0,2020,9,30,12,30
4,44,0,1213,9,4,14160.0,Class 5,12907,312,814,...,,,,,1,2020,9,11,17,50


In [116]:
X = df_pd.drop(columns=['target'])
y = df_pd['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [117]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [118]:
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65827
           1       1.00      1.00      1.00      7104

    accuracy                           1.00     72931
   macro avg       1.00      1.00      1.00     72931
weighted avg       1.00      1.00      1.00     72931



In [232]:
from datetime import datetime
def predict_outcome(horse_names, race_conditions, jockeys, course):
    # Create input data in a pandas DataFrame
    input_data = {
        'horseName': horse_names,
        'condition': race_conditions,
        'jockeyName': jockeys,
        'course': [course] * len(horse_names)
    }
    input_df = pd.DataFrame(input_data)
    
    # Initialize label encoders
    label_encoders = {}
    
    # Encode categorical variables using LabelEncoder
    for feature in ['horseName', 'condition', 'jockeyName', 'course']:
        le = LabelEncoder()
        input_df[feature] = le.fit_transform(input_df[feature])
        label_encoders[feature] = le
    
    # Create marketTime features (using current time as placeholder)
    current_time = datetime.now()
    input_df['marketYear'] = current_time.year
    input_df['marketMonth'] = current_time.month
    input_df['marketDay'] = current_time.day
    input_df['marketHour'] = current_time.hour
    input_df['marketMinute'] = current_time.minute
    
    # Ensure all features match the training data (assuming X_train is defined somewhere)
    X_train_columns = ['horseName', 'condition', 'jockeyName', 'course', 'marketYear', 'marketMonth', 'marketDay', 'marketHour', 'marketMinute']
    missing_cols = set(X_train_columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0
    
    # Reorder columns to match training data
    input_df = input_df[X_train_columns]

    # Dummy predictions (replace with your actual model prediction)
    predictions = np.random.randint(0, 2, size=len(input_df))  # Example random predictions
    
    return predictions

In [None]:
categorical_features = ['horseName', 'condition', 'jockeyName', 'course']
label_encoders = {}

for feature in categorical_features:
    le = LabelEncoder()
    le.fit(df_pd[feature].astype(str).fillna('missing'))
    label_encoders[feature] = le

# Transform the categorical features in the training data
for feature in categorical_features:
    X_train[feature] = label_encoders[feature].transform(X_train[feature].astype(str).fillna('missing'))

def predict_outcomes(horse_names, race_condition, jockeys, course):
    input_data = {
        'horseName': horse_names,
        'condition': [race_condition] * len(horse_names),
        'jockeyName': jockeys,
        'course': [course] * len(horse_names)
    }

    input_df = pd.DataFrame(input_data)

    # Encode input data using the same encoders used during training
    for feature in ['horseName', 'condition', 'jockeyName', 'course']:
        le = label_encoders.get(feature)
        if le:
            # Handle unseen labels
            input_df[feature] = input_df[feature].map(
                lambda s: '<unknown>' if s not in le.classes_ else s
            )
            # Add '<unknown>' to the classes if not already present
            if '<unknown>' not in le.classes_:
                le.classes_ = np.append(le.classes_, '<unknown>')
            input_df[feature] = le.transform(input_df[feature].astype(str))
        else:
            raise ValueError(f"No label encoder found for feature '{feature}'")

    # Create marketTime features (using current time as placeholder)
    current_time = pd.Timestamp.now()
    input_df['marketYear'] = current_time.year
    input_df['marketMonth'] = current_time.month
    input_df['marketDay'] = current_time.day
    input_df['marketHour'] = current_time.hour
    input_df['marketMinute'] = current_time.minute

    # Ensure all features match the training data
    missing_cols = set(X_train.columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0

    # Reorder columns to match training data
    input_df = input_df[X_train.columns]

    predictions = clf.predict(input_df)
    return predictions


In [174]:
courses_list = ['Fairyhouse (IRE)' 'Cheltenham' 'Lingfield (AW)' 'Leicester' 'Windsor'
 'Catterick' 'Tramore (IRE)' 'Exeter' 'Ayr' 'Southwell (AW)' 'Sedgefield'
 'Thurles (IRE)' 'Nottingham' 'Musselburgh' 'Sandown' 'Market Rasen'
 'Naas (IRE)' 'Haydock' 'Wolverhampton' 'Wincanton' 'Wetherby' 'Ascot'
 'Newcastle' 'Leopardstown (IRE)' 'Warwick' 'Fontwell' 'Gowran Park (IRE)'
 'Taunton' 'Kempton' 'Punchestown (IRE)' 'Navan (IRE)' 'Chepstow'
 'Huntingdon' 'Newton Abbot' 'Doncaster' 'Plumpton' 'Towcester'
 'Lingfield' 'Kelso' 'Bangor-on-Dee' 'Stratford' 'Down Royal (IRE)'
 'Downpatrick (IRE)' 'Folkestone' 'Wexford (RH) (IRE)' 'Worcester'
 'Ludlow' 'Tipperary (IRE)' 'Newbury' 'Hereford' 'Southwell'
 'Clonmel (IRE)' 'Carlisle' 'Hexham' 'Fakenham' 'Limerick (IRE)'
 'Uttoxeter' 'Brighton' 'Mallow (IRE)' 'Curragh (IRE)' 'Hamilton'
 'Aintree' 'Roscommon (IRE)' 'Pontefract' 'Ripon' 'Beverley' 'Newmarket'
 'Ballinrobe (IRE)' 'Thirsk' 'Sligo (IRE)' 'Perth' 'Bath' 'Salisbury'
 'Redcar' 'Killarney (IRE)' 'Chester' 'York' 'Dundalk (IRE)' 'Goodwood'
 'Cartmel' 'Kilbeggan (IRE)' 'Tralee (IRE)' 'Yarmouth' 'Chantilly (FR)'
 'Epsom' 'Newmarket (July)' 'Bellewstown (IRE)' 'Galway (IRE)'
 'Laytown (IRE)' 'Listowel (IRE)' 'Longchamp (FR)' 'Saint-Cloud (FR)'
 'Churchill Downs (USA)']

In [231]:
horse_names = ['Strong Suspicion', 'Baltray','Culleendubh']
race_conditions ='Good'
jockeys = ['P L Malone', 'K Morgan','G Kilfeather']
course = 'Fairyhouse (IRE)'

predictions = predict_outcome(horse_names, race_conditions, jockeys, course)
print(predictions)

[0 0 0]


In [167]:
df_pd.columns

Index(['course', 'countryCode', 'title', 'runners', 'condition', 'prize',
       'rclass', 'horseName', 'trainerName', 'jockeyName', 'RPRc', 'TRc', 'OR',
       'weightSt', 'weightLb', 'age', 'decimalPrice', 'rid', 'age_right',
       'saddle', 'decimalPrice_right', 'isFav', 'trainerName_right',
       'jockeyName_right', 'position', 'positionL', 'dist', 'weightSt_right',
       'weightLb_right', 'overWeight', 'outHandicap', 'headGear', 'RPR', 'TR',
       'OR_right', 'father', 'mother', 'gfather', 'runners_right', 'margin',
       'weight', 'res_win', 'res_place', 'price', 'target', 'marketYear',
       'marketMonth', 'marketDay', 'marketHour', 'marketMinute'],
      dtype='object')

In [256]:
import os
os.chdir("C:\\Users\\jeetg\\code\\horse race prediction\\data")
r_d =  pd.read_csv("race.csv")
h_d = pd.read_csv("horse.csv")
f = pd.read_csv("forward.csv")

  r_d =  pd.read_csv("race.csv")
  h_d = pd.read_csv("horse.csv")


In [261]:
horse_lists = h_d['horseName'].unique().tolist()

In [262]:
horse_lists

['Combermere',
 'Royal Battery',
 'Just So',
 'Mandraki Shuffle',
 'Turnberry Dawn',
 'Yreka Bay',
 "Hi' Upham",
 'Lauderdale Lad',
 'Providence Lodge',
 'Duke Of Abson',
 'Valassy',
 'Clyffe Haze',
 'Bickfield',
 'Saucy Minstrel',
 'Loving Way I',
 'Glenastar VI',
 'Wellknown Coraly',
 'Galmoy Girl',
 'Blue Ring',
 'Lady Owen',
 'Ballymaclode Rose',
 'Haughty-Ha',
 'Gaiety Lass',
 'Mary Alice',
 'Masnoon',
 'Pura Money',
 'Impany',
 'Milford Quay',
 'Protection I',
 'Highland Bounty',
 'Wahiba',
 'Fairfields Cone',
 'Boca Chimes',
 'Bravo Star',
 'Mineral Dust',
 'Rusty Law',
 'Self Aid',
 'Carfax',
 'Copper Streak',
 'Mrs Peopleater',
 'Final Alma',
 'Mr Caractacus',
 'Slightly Gone',
 'Chelsea Man',
 'Solitaire',
 'Rakes Lane',
 'Whitewash',
 'Santo Boy',
 'Failiq',
 'Shady Road',
 "Loren's Courage",
 'Tharros',
 'Escribana',
 'Military Salute',
 'Golden Lantern',
 "Annie'll Do",
 'Forest Lord I',
 'Ardoran',
 'Rain-N-Sun',
 'Garcia I',
 'Pullover',
 'Whiskey Blues',
 'Millmerran',


In [257]:
courses_list = r_d['course'].unique().tolist()

In [258]:
courses_list

['Exeter',
 'Tramore (IRE)',
 'Catterick',
 'Cheltenham',
 'Windsor',
 'Leicester',
 'Southwell (AW)',
 'Fairyhouse (IRE)',
 'Ayr',
 'Sedgefield',
 'Lingfield (AW)',
 'Thurles (IRE)',
 'Nottingham',
 'Haydock',
 'Musselburgh',
 'Folkestone',
 'Market Rasen',
 'Naas (IRE)',
 'Sandown',
 'Chepstow',
 'Wolverhampton',
 'Newton Abbot',
 'Plumpton',
 'Kelso',
 'Punchestown (IRE)',
 'Wincanton',
 'Ascot',
 'Wetherby',
 'Leopardstown (IRE)',
 'Newcastle',
 'Warwick',
 'Fontwell',
 'Carlisle',
 'Worcester',
 'Ludlow',
 'Gowran Park (IRE)',
 'Kempton',
 'Towcester',
 'Navan (IRE)',
 'Doncaster',
 'Clonmel (IRE)',
 'Newbury',
 'Uttoxeter',
 'Down Royal (IRE)',
 'Fakenham',
 'Huntingdon',
 'Downpatrick (IRE)',
 'Tipperary (IRE)',
 'Stratford',
 'Hereford',
 'Wexford (RH) (IRE)',
 'Bangor-on-Dee',
 'Hexham',
 'Limerick (IRE)',
 'Ballinrobe (IRE)',
 'Taunton',
 'Beverley',
 'Curragh (IRE)',
 'Southwell',
 'Roscommon (IRE)',
 'Hamilton',
 'Aintree',
 'Brighton',
 'Phoenix Park (IRE)',
 'Pontefract',

In [253]:
jockey_names = h_d['jockeyName'].unique().tolist()

In [255]:
jockey_names

['J Frost',
 'S Earle',
 'S Burrough',
 'M Richards',
 'P Richards',
 'W Irvine',
 'S McNeill',
 'B Powell Snr',
 'R Chapman',
 'Carl Llewellyn',
 'D Tegg',
 'W McFarland',
 'C Cox',
 'N Dawe',
 'T J Ryan',
 "K F O'Brien",
 "M E O'Callaghan",
 'W Slattery',
 'Mr J A Nash',
 "A J O'Brien",
 'J Collins',
 'Jason Titley',
 'F J Flood',
 'A J Slattery',
 'Mr P McMahon',
 'M Moloney',
 'Lorcan Wyer',
 'J Lower',
 'R Dunwoody',
 'S Turner',
 'N Coleman',
 'Martin Jones',
 'Jamie Osborne',
 'D Gallagher',
 'J Bryan',
 'D Skyrme',
 'Gee Armytage',
 'M Hoad',
 'H Davies',
 'M Furlong',
 'Mr R Teal',
 'Miss Z Davison',
 'S Pearson',
 'G Moore',
 'S McKeever',
 'T Morgan',
 'Steve Smith-Eccles',
 'S D Williams',
 'Susan Kersey',
 'J Railton',
 'M Perrett',
 'B De Haan',
 'M Ahern',
 'S Keightley',
 'M Pitman',
 'Mr P Harding Jones',
 'G McCourt',
 'S Old Davies',
 'J A Harris',
 'B Doran',
 'Derek Byrne',
 'R Bellamy',
 'J Shortt',
 'A Webb',
 'Mr M Jackson',
 'Dale McKeown',
 'S Mitchell',
 'Mr 

In [259]:
race_condition = r_d['condition'].unique().tolist()

In [260]:
race_condition

['Soft',
 'Good To Firm',
 'Good',
 'Standard',
 'Yielding',
 'Good To Soft',
 'Heavy',
 'Good To Yielding',
 'Slow',
 'Firm',
 'Yielding To Soft',
 'Hard',
 'Fast',
 'Soft To Heavy',
 'Very Soft',
 nan,
 'Holding',
 'Sloppy',
 'Muddy',
 'Standard To Slow',
 'Standard To Fast',
 'Frozen',
 'Abandoned']