In [1]:
# Import our dependencies
 
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

### For data clean up pls refer to CleanUp file 

# **Preprocessing Model 2 - Characteristics of a Horse winner**

 Step 1: Read the filtered_df_tree.csv data from the Resources folder into a Pandas DataFrame.

In [2]:
random_forest_df = pd.read_csv('Resources/coverted_tree_df.csv')
random_forest_df.head(5)


Unnamed: 0,rid,winningTime,metric,ncond,class,condition,horseName,trainerName,jockeyName,age,...,outHandicap,RPR,saddle,father,TR,OR,runners,weight,res_win,res_place
0,10312,253.88,3218.0,10,4,Good To Soft,Waterproof,Shaun Keightley,Brendan Powell,4.0,...,2.541359,103.0,9.0,Pour Moi,103.0,80.446685,9.0,66.0,1.0,1.0
1,10312,253.88,3218.0,10,4,Good To Soft,Eva's Diva,Phil Middleton,Mr Sam Lee,6.0,...,2.541359,96.0,7.0,Getaway,89.0,110.0,9.0,65.0,0.0,1.0
2,10312,253.88,3218.0,10,4,Good To Soft,Incredible Dream,Conrad Allen,Joshua Moore,7.0,...,2.541359,97.0,5.0,Vale Of York,88.0,80.446685,9.0,72.0,0.0,1.0
3,10312,253.88,3218.0,10,4,Good To Soft,Hats Off To Larry,Mick Channon,Marc Goldstein,6.0,...,2.541359,89.0,4.0,Sixties Icon,75.0,80.446685,9.0,72.0,0.0,0.0
4,10312,253.88,3218.0,10,4,Good To Soft,Taqwaa,Laura Morgan,Richie McLernon,7.0,...,2.541359,82.0,6.0,Iffraaj,67.0,80.446685,9.0,72.0,0.0,0.0


In [3]:
random_forest_df.columns

Index(['rid', 'winningTime', 'metric', 'ncond', 'class', 'condition',
       'horseName', 'trainerName', 'jockeyName', 'age', 'isFav', 'position',
       'distance', 'outHandicap', 'RPR', 'saddle', 'father', 'TR', 'OR',
       'runners', 'weight', 'res_win', 'res_place'],
      dtype='object')

### Further steps in the model-building process: feature selection, model selection, training, and evaluation ###

Prediction based on:
"class": The class of the race as it can be an important predictor for the outcome.
"ages": The age of the horse can be a significant factor in horse racing outcomes.
"distance": The distance of the race could impact the horse's performance.
"condition": The condition of the track may influence the results.
"winningTime": The winning time in the previous races can be a predictor.
"RPR" (Rating Performance Ranking): The horse's performance rating can be informative.

In [4]:
# List of relevant columns for prediction
columns_to_keep = ['class', 'age', 'distance', 'condition', 'winningTime', 'RPR', 'res_win']

# Create a new DataFrame with only the selected columns
horse_characteristics_df = random_forest_df[columns_to_keep]

# Display the first few rows of the new DataFrame
horse_characteristics_df.head()

Unnamed: 0,class,age,distance,condition,winningTime,RPR,res_win
0,4,4.0,2.0,Good To Soft,253.88,103.0,1.0
1,4,6.0,2.0,Good To Soft,253.88,96.0,0.0
2,4,7.0,2.0,Good To Soft,253.88,97.0,0.0
3,4,6.0,2.0,Good To Soft,253.88,89.0,0.0
4,4,7.0,2.0,Good To Soft,253.88,82.0,0.0


In [5]:
# Determine the number of unique values in each column.
unique_application = horse_characteristics_df.nunique()
print(unique_application)

class             8
age              16
distance          8
condition        18
winningTime    8692
RPR             178
res_win           2
dtype: int64


In [6]:
# Look at res_win value counts for binning - where 0=did not win; 1=won
application_type_counts = horse_characteristics_df['res_win'].value_counts()
application_type_counts

res_win
0.0    128817
1.0     14094
Name: count, dtype: int64

In [7]:
# Check for missing values (NaN)
nan_columns = horse_characteristics_df.columns[horse_characteristics_df.isnull().any()].tolist()
print("Columns with NaN values:")
print(nan_columns)

Columns with NaN values:
[]


In [8]:
num_rows = horse_characteristics_df.shape[0]
num_columns = horse_characteristics_df.shape[1]
print("Number of Rows:", num_rows)
print("Number of Columns:", num_columns)

Number of Rows: 142911
Number of Columns: 7


In [9]:
# sampled_df = horse_characteristics_df.sample(n=30000, random_state=42)
# num_rows = sampled_df.shape[0]
# num_columns = sampled_df.shape[1]
# print("Number of Rows:", num_rows)
# print("Number of Columns:", num_columns)

Number of Rows: 30000
Number of Columns: 7


In [26]:
encoded_df = pd.get_dummies(horse_characteristics_df)
encoded_df.head(5)

Unnamed: 0,class,age,distance,winningTime,RPR,res_win,condition_Fast,condition_Firm,condition_Frozen,condition_Good,...,condition_Muddy,condition_Sloppy,condition_Slow,condition_Soft,condition_Soft To Heavy,condition_Standard,condition_Standard To Slow,condition_Very Soft,condition_Yielding,condition_Yielding To Soft
0,4,4.0,2.0,253.88,103.0,1.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,6.0,2.0,253.88,96.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4,7.0,2.0,253.88,97.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,6.0,2.0,253.88,89.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,7.0,2.0,253.88,82.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
encoded_df.columns

Index(['class', 'age', 'distance', 'winningTime', 'RPR', 'res_win',
       'condition_Fast', 'condition_Firm', 'condition_Frozen',
       'condition_Good', 'condition_Good To Firm', 'condition_Good To Soft',
       'condition_Good To Yielding', 'condition_Heavy', 'condition_Muddy',
       'condition_Sloppy', 'condition_Slow', 'condition_Soft',
       'condition_Soft To Heavy', 'condition_Standard',
       'condition_Standard To Slow', 'condition_Very Soft',
       'condition_Yielding', 'condition_Yielding To Soft'],
      dtype='object')

In [28]:
# Define features set
features = encoded_df.copy()
features.drop("res_win", axis=1, inplace=True)
features.head()

Unnamed: 0,class,age,distance,winningTime,RPR,condition_Fast,condition_Firm,condition_Frozen,condition_Good,condition_Good To Firm,...,condition_Muddy,condition_Sloppy,condition_Slow,condition_Soft,condition_Soft To Heavy,condition_Standard,condition_Standard To Slow,condition_Very Soft,condition_Yielding,condition_Yielding To Soft
0,4,4.0,2.0,253.88,103.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,6.0,2.0,253.88,96.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4,7.0,2.0,253.88,97.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,6.0,2.0,253.88,89.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,7.0,2.0,253.88,82.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [29]:
# Define target vector
target = encoded_df["res_win"].ravel()
target[:5]

array([1., 0., 0., 0., 0.])

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=78)

In [32]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=78)

In [33]:
# Creating StandardScaler instance
scaler = StandardScaler()


In [34]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)


In [35]:
 # Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

 ## Fitting the RandomForest

In [36]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
# Reshape the target vector y using ravel()
y_train = y_train.ravel()

In [37]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

##  Making Predictions Using the RandomForest

In [38]:
 # Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

 ## Model Evaluation

In [21]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [22]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7712,324
Actual 1,883,81


Accuracy Score : 0.8658888888888889
Classification Report
              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93      8036
         1.0       0.20      0.08      0.12       964

    accuracy                           0.87      9000
   macro avg       0.55      0.52      0.52      9000
weighted avg       0.82      0.87      0.84      9000



## Feature Importance

In [23]:
random_forest_df.columns

Index(['rid', 'winningTime', 'metric', 'ncond', 'class', 'condition',
       'horseName', 'trainerName', 'jockeyName', 'age', 'isFav', 'position',
       'distance', 'outHandicap', 'RPR', 'saddle', 'father', 'TR', 'OR',
       'runners', 'weight', 'res_win', 'res_place'],
      dtype='object')

In [24]:
# Filter the top 10 winner horses
top_10_winners = random_forest_df[random_forest_df['res_win'] == 1].head(10)

# Select the desired features for printing
features_to_print = ['jockeyName', 'father', 'isFav', 'RPR','class', 'age' ]



In [25]:
import matplotlib.pyplot as plt

# Filter the top 50 winner horses
top_50_winners = random_forest_df[random_forest_df['res_win'] == 1].head(50)

# # Select the desired features for printing
# features_to_print = ['position', 'res_place', 'isFav', 'RPR','class']

# Plot the features for the top 10 winner horses
top_10_winners[features_to_print].plot(kind='bar', x='position', figsize=(10, 6))

# Set the plot title and axis labels
plt.title('Features of Top 50 Winner Horses')
plt.xlabel('Features')
plt.ylabel('Feature Values')

# Display the plot
plt.show()


KeyError: 'position'

In [None]:
# Define a dictionary to map the old column names to the new ones
column_name_mapping = {
    'rid': 'Race ID',
    'winningTime': 'Winning Time',
    'metric': 'Metric',
    'ncond': 'Condition',
    'class': 'Class',
    'horseName': 'HorseName',
    'trainerName': 'TrainerName',
    'jockeyName': 'JockeyName',
    'age': 'Age',
    'isFav': 'IsFavorite',
    'position': 'Position',
    'distance': 'Distance',
    'outHandicap': 'OutHandicap',
    'RPR': 'Racing Post Rating',
    'saddle': 'Saddle',
    'father': 'Father',
    'TR': 'Timeform Rating',
    'OR': 'Official Rating',
    'runners': 'Runners',
    'weight': 'Weight',
    'res_win': 'Result_Win',
    'res_place': 'Result_Place',
}

# Rename the columns using the dictionary
renamed_df= random_forest.rename(columns=column_name_mapping)


In [None]:
import matplotlib.pyplot as plt

# Assuming you have 'rf_model' and 'df' with your features as defined in your code

# Get the feature importances from the random forest model
importances = rf_model.feature_importances_

# Create a list of feature names
feature_names = renamed_df.columns

# Sort the features and their importances in descending order
sorted_features = sorted(zip(importances, feature_names), reverse=False)
importances, feature_names = zip(*sorted_features)

# Plot the top 10 features in a bar chart
plt.figure(figsize=(10, 6))
plt.barh(feature_names[:10], importances[:10])
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Top 10 Features Importance in Horse Race')
plt.tight_layout()
plt.show()


In [None]:
random_forest.columns

In [None]:

# Group by 'trainerName' and 'jockeyName', and count the number of winners for each group
trainer_winners = random_forest[random_forest['Result_Win'] == 1].groupby('TrainerName')['Result_Win'].sum()
jockey_winners = random_forest[random_forest['Result_Win'] == 1].groupby('JockeyName')['Result_Win'].sum()

# Sort the results in descending order to find the best trainer and jockey
best_trainer = trainer_winners.idxmax()
best_jockey = jockey_winners.idxmax()

# Display the total winner horses under every trainer and jockey
print("Total Winner Horses under Every Trainer:")
print(trainer_winners)
print("\nTotal Winner Horses under Every Jockey:")
print(jockey_winners)

# Display the best trainer and jockey
print("\nBest Trainer:", best_trainer)
print("Best Jockey:", best_jockey)



In [None]:
# Group by 'trainerName' and 'jockeyName', and count the number of winners for each group
trainer_winners = random_forest[random_forest['Result_Win'] == 1].groupby('TrainerName')['Result_Win'].sum()
jockey_winners = random_forest[random_forest['Result_Win'] == 1].groupby('JockeyName')['Result_Win'].sum()

# Sort the results in descending order to find the top 10 trainers and jockeys
top_10_trainers = trainer_winners.nlargest(10)
top_10_jockeys = jockey_winners.nlargest(10)

# Plot the bar chart for top 10 trainers with the highest number of winners
plt.figure(figsize=(10, 6))
top_10_trainers.plot(kind='bar')
plt.xlabel('Trainer Name')
plt.ylabel('Total Winners')
plt.title('Top 10 Trainers with the Highest Number of Winner Horses')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Plot the bar chart for top 10 jockeys with the highest number of winners
plt.figure(figsize=(10, 6))
top_10_jockeys.plot(kind='bar')
plt.xlabel('Jockey Name')
plt.ylabel('Total Winners')
plt.title('Top 10 Jockeys with the Highest Number of Winner Horses')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Display the names of the best trainer and jockey
print("\nBest Trainer:", top_10_trainers.idxmax())
print("Best Jockey:", top_10_jockeys.idxmax())
