In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


In [2]:
# Load the dataset
merged_df = pd.read_csv('C:\\Users\\sujit\\OneDrive - Sri Lanka Institute of Information Technology\\Research SLIIT\\Research Project\\Dataset\\Final_horse_racing_dataset.csv')

# Display the information about the dataset
print(merged_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75162 entries, 0 to 75161
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   horse_id          75162 non-null  object 
 1   race_id           75162 non-null  object 
 2   race_name         75162 non-null  object 
 3   horse_name        75162 non-null  object 
 4   number            75162 non-null  object 
 5   sex               75070 non-null  object 
 6   age               75162 non-null  object 
 7   handicap_weight   75162 non-null  float64
 8   jockey            75090 non-null  object 
 9   trainer           75161 non-null  object 
 10  win_odd_live      75162 non-null  float64
 11  reference_odd     75162 non-null  float64
 12  min_place_odd     75162 non-null  float64
 13  max_place_odd     75162 non-null  float64
 14  ze_show_odd       75162 non-null  float64
 15  ze_4th_odd        75162 non-null  float64
 16  music             75162 non-null  object

  merged_df = pd.read_csv('C:\\Users\\sujit\\OneDrive - Sri Lanka Institute of Information Technology\\Research SLIIT\\Research Project\\Dataset\\Final_horse_racing_dataset.csv')


In [3]:
print(merged_df.columns)

Index(['horse_id', 'race_id', 'race_name', 'horse_name', 'number', 'sex',
       'age', 'handicap_weight', 'jockey', 'trainer', 'win_odd_live',
       'reference_odd', 'min_place_odd', 'max_place_odd', 'ze_show_odd',
       'ze_4th_odd', 'music', 'corde', 'position', 'date', 'Start',
       'event_name', 'race_type', 'distance', 'prize', 'field', 'track',
       'corde.1', 'penetrometer', 'number of horses', 'race time', 'jockey_id',
       'trainer_id', 'final_result_id', '1st', '2nd', '3rd', '4th', '5th',
       '6th', '7th'],
      dtype='object')


In [4]:
# Load dataset
import pandas as pd

# Assuming `merged_df` is your DataFrame
# Define columns to drop
columns_to_drop = [
    'race_id', 'horse_id', 'jockey_id', 'trainer_id', 'final_result_id',
    'race_name', 'horse_name', 'event_name', 'corde.1', '1st', '2nd', '3rd',
    '4th', '5th', '6th', '7th'
]

# Drop columns
merged_df = merged_df.drop(columns=columns_to_drop)

# Check the remaining columns
print("Remaining columns:", merged_df.columns)


Remaining columns: Index(['number', 'sex', 'age', 'handicap_weight', 'jockey', 'trainer',
       'win_odd_live', 'reference_odd', 'min_place_odd', 'max_place_odd',
       'ze_show_odd', 'ze_4th_odd', 'music', 'corde', 'position', 'date',
       'Start', 'race_type', 'distance', 'prize', 'field', 'track',
       'penetrometer', 'number of horses', 'race time'],
      dtype='object')


In [5]:
# Print the first few rows to verify the scaling
print(merged_df.head())

  number sex age  handicap_weight         jockey                      trainer  \
0      1   H   2             57.0      Demuro C.                     Brogi S.   
1      2   F   2             55.5   Roussel Ale.                  Monfort Ed.   
2      3   M   2             55.5  Murzabayev B.  Janackova Koplikova Mlle I.   
3      4   M   2             55.5       Guyon M.                 Vermeulen F.   
4      5   M   2             55.5     Besnier H.  Janackova Koplikova Mlle I.   

   win_odd_live  reference_odd  min_place_odd  max_place_odd  ...        date  \
0           3.1            3.0            1.3            1.8  ...  26/08/2023   
1           4.4            5.1            1.8            2.4  ...  26/08/2023   
2           2.9            3.1            1.3            1.8  ...  26/08/2023   
3           6.7            6.1            3.0            4.2  ...  26/08/2023   
4          15.4           14.6            5.5            7.6  ...  26/08/2023   

   Start race_type  distan

In [6]:
# Check for missing values
print(merged_df.isnull().sum())

number               0
sex                 92
age                  0
handicap_weight      0
jockey              72
trainer              1
win_odd_live         0
reference_odd        0
min_place_odd        0
max_place_odd        0
ze_show_odd          0
ze_4th_odd           0
music                0
corde                0
position             0
date                 0
Start                0
race_type            0
distance             0
prize               12
field                0
track                0
penetrometer         0
number of horses     0
race time            0
dtype: int64


In [7]:
# Check for missing values
missing_values = merged_df.isnull().sum()
print(missing_values[missing_values > 0])  # Display columns with missing values

sex        92
jockey     72
trainer     1
prize      12
dtype: int64


In [8]:
# Impute missing values in numerical columns with the mean
merged_df['prize'].fillna(merged_df['prize'].mean(), inplace=True)

In [9]:
# Impute missing values in categorical columns with the mode
merged_df['sex'].fillna(merged_df['sex'].mode()[0], inplace=True)
merged_df['jockey'].fillna(merged_df['jockey'].mode()[0], inplace=True)

In [10]:
# Drop rows where 'trainer' is missing (only 1 value here, but for illustration)
merged_df.dropna(subset=['trainer'], inplace=True)

# Alternatively, drop entire column if it has too many missing values:
if missing_values['jockey'] / len(merged_df) > 0.3:  # Example threshold of 30%
    merged_df.drop(columns=['jockey'], inplace=True)

In [11]:
# Check again for any remaining missing values
missing_values_after = merged_df.isnull().sum()
print(missing_values_after[missing_values_after > 0])  # Should return an empty series if all handled

Series([], dtype: int64)


In [12]:
# Remove duplicates
merged_df.drop_duplicates(inplace=True)

# Check for outliers or erroneous data (e.g., negative ages)
# Example: Check age column
print(merged_df['age'].describe())

count     75161
unique       14
top           3
freq      19007
Name: age, dtype: object


In [13]:
# Convert 'age' to numeric if it's not already (handle errors by coercing them to NaN)
merged_df['age'] = pd.to_numeric(merged_df['age'], errors='coerce')

# Remove rows with negative ages or NaN values after conversion
merged_df = merged_df[merged_df['age'] >= 0]

# Check the shape after cleaning
print(f"DataFrame shape after removing erroneous entries: {merged_df.shape}")

DataFrame shape after removing erroneous entries: (75051, 25)


In [14]:
# Check for any remaining missing values
missing_values_after = merged_df.isnull().sum()
print(missing_values_after[missing_values_after > 0])  # Should return an empty series if all handled

Series([], dtype: int64)


In [15]:
# Display basic information about the cleaned DataFrame
print(merged_df.info())

# Display the first few rows of the cleaned DataFrame
print(merged_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 75051 entries, 0 to 75161
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   number            75051 non-null  object 
 1   sex               75051 non-null  object 
 2   age               75051 non-null  float64
 3   handicap_weight   75051 non-null  float64
 4   jockey            75051 non-null  object 
 5   trainer           75051 non-null  object 
 6   win_odd_live      75051 non-null  float64
 7   reference_odd     75051 non-null  float64
 8   min_place_odd     75051 non-null  float64
 9   max_place_odd     75051 non-null  float64
 10  ze_show_odd       75051 non-null  float64
 11  ze_4th_odd        75051 non-null  float64
 12  music             75051 non-null  object 
 13  corde             75051 non-null  int64  
 14  position          75051 non-null  int64  
 15  date              75051 non-null  object 
 16  Start             75051 non-null  object 
 17

In [16]:
print(merged_df.columns)

Index(['number', 'sex', 'age', 'handicap_weight', 'jockey', 'trainer',
       'win_odd_live', 'reference_odd', 'min_place_odd', 'max_place_odd',
       'ze_show_odd', 'ze_4th_odd', 'music', 'corde', 'position', 'date',
       'Start', 'race_type', 'distance', 'prize', 'field', 'track',
       'penetrometer', 'number of horses', 'race time'],
      dtype='object')


In [17]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_cols = ['sex', 'jockey', 'trainer', 'race_type', 'track', 'music', 'corde']

# Dictionary to store encoders for each categorical column
label_encoders = {}

# Encode each categorical column
for col in categorical_cols:
    le = LabelEncoder()
    merged_df[col] = le.fit_transform(merged_df[col].astype(str))  # Encode and convert to string if needed
    label_encoders[col] = le  # Save encoder if needed for future use


In [18]:
# Convert date to datetime format and extract year, month, and day
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df['year'] = merged_df['date'].dt.year
merged_df['month'] = merged_df['date'].dt.month
merged_df['day'] = merged_df['date'].dt.day

# Drop the original 'date' column if it's no longer needed
merged_df = merged_df.drop(columns=['date'])


  merged_df['date'] = pd.to_datetime(merged_df['date'])


In [19]:
# Replace 'Unknown' in the 'age' column with 0
merged_df['age'] = merged_df['age'].replace('Unknown', 0)

# Ensure 'age' column is in numeric format
merged_df['age'] = merged_df['age'].astype(float)


In [20]:
from sklearn.preprocessing import StandardScaler

# List of numerical columns to scale
numerical_cols = ['age', 'handicap_weight', 'distance', 'prize', 'penetrometer', 'number of horses', 'race time']

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the numerical columns and transform them
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols])


In [21]:
from sklearn.preprocessing import StandardScaler

# List of numerical columns to scale
numerical_cols = ['age', 'handicap_weight', 'distance', 'prize', 'penetrometer', 'number of horses', 'race time']

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the numerical columns and transform them
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols])


In [22]:
# Create a binary target for predicting top 3 horses
merged_df['top_3'] = merged_df['position'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

# Define features (X) and target (y)
X = merged_df.drop(columns=['position', 'top_3'])  # Drop 'position' and other unused columns
y = merged_df['top_3']


In [23]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)


Categorical columns: Index(['number', 'Start', 'field'], dtype='object')


In [25]:
# Perform one-hot encoding on categorical columns
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure that the train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)


In [27]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
model.fit(X_train, y_train)


In [31]:
# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7630404370128573
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.93      0.85     11092
           1       0.60      0.28      0.38      3919

    accuracy                           0.76     15011
   macro avg       0.69      0.61      0.62     15011
weighted avg       0.74      0.76      0.73     15011

Confusion Matrix:
 [[10361   731]
 [ 2826  1093]]
