Note: see Clean_up file for Mongo DF

In [37]:
 # Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

 ## Loading and Preprocessing the Horse Racing Data

In [61]:
# Absolute, machine-specific location of the input CSV.
# NOTE(review): this path only resolves on the original author's machine;
# consider a path relative to the repository before sharing.
file_path = r'C:\Users\aumek\OneDrive\Desktop\Git_Personnal_Rep\Resource_mustang\coverted_tree_df.csv'
# Load the whole file into a DataFrame, then preview its first three rows
random_forest = pd.read_csv(filepath_or_buffer=file_path)
random_forest.head(n=3)

Unnamed: 0,rid,winningTime,metric,ncond,class,condition,horseName,trainerName,jockeyName,age,...,outHandicap,RPR,saddle,father,TR,OR,runners,weight,res_win,res_place
0,10312,253.88,3218.0,10,4,Good To Soft,Waterproof,Shaun Keightley,Brendan Powell,4.0,...,2.541359,103.0,9.0,Pour Moi,103.0,80.446685,9.0,66.0,1.0,1.0
1,10312,253.88,3218.0,10,4,Good To Soft,Eva's Diva,Phil Middleton,Mr Sam Lee,6.0,...,2.541359,96.0,7.0,Getaway,89.0,110.0,9.0,65.0,0.0,1.0
2,10312,253.88,3218.0,10,4,Good To Soft,Incredible Dream,Conrad Allen,Joshua Moore,7.0,...,2.541359,97.0,5.0,Vale Of York,88.0,80.446685,9.0,72.0,0.0,1.0


In [62]:
# Show the column labels of the loaded frame to see which fields are available
random_forest.columns

Index(['rid', 'winningTime', 'metric', 'ncond', 'class', 'condition',
       'horseName', 'trainerName', 'jockeyName', 'age', 'isFav', 'position',
       'distance', 'outHandicap', 'RPR', 'saddle', 'father', 'TR', 'OR',
       'runners', 'weight', 'res_win', 'res_place'],
      dtype='object')

In [63]:
# Columns retained for this model (the selection includes the `res_win` column)
columns_to_keep = [
    'class', 'age', 'distance', 'condition', 'winningTime', 'RPR', 'res_win',
]
# NOTE(review): `winningTime` looks like a post-race quantity - confirm it
# would actually be known at prediction time.

# Column-wise subset of the raw frame, in the order listed above
horse_characteristics_df = random_forest.loc[:, columns_to_keep]

# Preview the first five rows of the subset
horse_characteristics_df.head()


Unnamed: 0,class,age,distance,condition,winningTime,RPR,res_win
0,4,4.0,2.0,Good To Soft,253.88,103.0,1.0
1,4,6.0,2.0,Good To Soft,253.88,96.0,0.0
2,4,7.0,2.0,Good To Soft,253.88,97.0,0.0
3,4,6.0,2.0,Good To Soft,253.88,89.0,0.0
4,4,7.0,2.0,Good To Soft,253.88,82.0,0.0


In [64]:
# Inspect per-column dtypes to spot non-numeric columns that need encoding
horse_characteristics_df.dtypes

class            int64
age            float64
distance       float64
condition       object
winningTime    float64
RPR            float64
res_win        float64
dtype: object

In [81]:
# Flag each column that holds at least one missing value, then keep only
# the labels of the flagged columns
column_has_nan = horse_characteristics_df.isna().any()
nan_columns = column_has_nan[column_has_nan].index.tolist()
print("Columns with NaN values:")
print(nan_columns)

Columns with NaN values:
[]


In [82]:
# One-hot encode the non-numeric column(s); numeric columns are carried over as-is
horse_data_df = pd.get_dummies(horse_characteristics_df)

In [83]:
# Show the column labels after encoding, including the generated dummy columns
horse_data_df.columns

Index(['class', 'age', 'distance', 'winningTime', 'RPR', 'res_win',
       'condition_Fast', 'condition_Firm', 'condition_Frozen',
       'condition_Good', 'condition_Good To Firm', 'condition_Good To Soft',
       'condition_Good To Yielding', 'condition_Heavy', 'condition_Muddy',
       'condition_Sloppy', 'condition_Slow', 'condition_Soft',
       'condition_Soft To Heavy', 'condition_Standard',
       'condition_Standard To Slow', 'condition_Very Soft',
       'condition_Yielding', 'condition_Yielding To Soft'],
      dtype='object')

In [84]:
 # Feature matrix: every encoded column except `res_win`
# (`drop` returns a new frame, so `horse_data_df` itself is left untouched)
features = horse_data_df.drop(columns="res_win")
features.head()

Unnamed: 0,class,age,distance,winningTime,RPR,condition_Fast,condition_Firm,condition_Frozen,condition_Good,condition_Good To Firm,...,condition_Muddy,condition_Sloppy,condition_Slow,condition_Soft,condition_Soft To Heavy,condition_Standard,condition_Standard To Slow,condition_Very Soft,condition_Yielding,condition_Yielding To Soft
0,4,4.0,2.0,253.88,103.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,6.0,2.0,253.88,96.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4,7.0,2.0,253.88,97.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,6.0,2.0,253.88,89.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,7.0,2.0,253.88,82.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [85]:
# Define target vector: the `res_win` column as a 1-D NumPy array.
# `Series.to_numpy()` is used instead of `Series.ravel()`, which is deprecated
# in pandas 2.2+; a Series is already one-dimensional, so the values are the same.
target = horse_data_df["res_win"].to_numpy()
# Peek at the first five labels
target[:5]

array([1., 0., 0., 0., 0.])

In [86]:
# Partition features and labels into train/test subsets using
# train_test_split's default proportions; the fixed seed makes the shuffle repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(features, target, random_state=78)


In [87]:
# Creating StandardScaler instance (unfitted; statistics are learned in a later cell)
scaler = StandardScaler()

In [88]:
 # Fitting the StandardScaler on the training features only, so that no
# statistics from the test partition are used
X_scaler = scaler.fit(X_train)

In [89]:
# Apply the train-fitted scaling to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

 ## Fitting the Random Forest Model

In [90]:
 # Create a random forest classifier: 500 trees, fixed seed for repeatability.
# n_jobs=-1 builds (and later queries) the trees on all available CPU cores
# instead of a single one; the fixed random_state still seeds every tree, so
# the fitted forest does not depend on the number of workers.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78, n_jobs=-1)

In [91]:
# Fitting the model on the scaled training features and their labels.
# The result is assigned back to `rf_model`, which also keeps the cell from
# echoing the estimator as output.
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Random Forest Model

In [92]:
 # Making predictions using the testing data (scaled with the train-fitted scaler)
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

In [93]:
# Fraction of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_test, predictions)

# Confusion counts (rows: true labels, columns: predicted labels),
# wrapped in a labelled frame for display
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [97]:
# Report: confusion table, overall accuracy, then per-class metrics
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,31336,869
Actual 1,3210,313


Accuracy Score : 0.8858318405732198
Classification Report
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94     32205
         1.0       0.26      0.09      0.13      3523

    accuracy                           0.89     35728
   macro avg       0.59      0.53      0.54     35728
weighted avg       0.84      0.89      0.86     35728



## Changing target to "isFav"

In [98]:
# Columns retained for the second model: the earlier selection plus `isFav`
columns_to_keep = [
    'class', 'age', 'distance', 'condition', 'winningTime', 'RPR',
    'res_win', 'isFav',
]
# NOTE(review): `res_win` stays in the selection here; if `isFav` is the
# prediction target, confirm that using the race result as a predictor is intended.

# Column-wise subset of the raw frame, in the order listed above
horse_fav_df = random_forest.loc[:, columns_to_keep]

# Preview the first five rows of the subset
horse_fav_df.head()


Unnamed: 0,class,age,distance,condition,winningTime,RPR,res_win,isFav
0,4,4.0,2.0,Good To Soft,253.88,103.0,1.0,0.0
1,4,6.0,2.0,Good To Soft,253.88,96.0,0.0,1.0
2,4,7.0,2.0,Good To Soft,253.88,97.0,0.0,0.0
3,4,6.0,2.0,Good To Soft,253.88,89.0,0.0,0.0
4,4,7.0,2.0,Good To Soft,253.88,82.0,0.0,0.0


In [99]:
# Flag each column that holds at least one missing value, then keep only
# the labels of the flagged columns
column_has_nan = horse_fav_df.isna().any()
nan_columns = column_has_nan[column_has_nan].index.tolist()
print("Columns with NaN values:")
print(nan_columns)

Columns with NaN values:
[]


In [100]:
# One-hot encode the non-numeric column(s) of the `isFav` selection.
# NOTE: this rebinds `horse_data_df`, replacing the frame built for the first model.
horse_data_df = pd.get_dummies(horse_fav_df)

In [106]:
 # Feature matrix: every encoded column except `isFav`
# (`drop` returns a new frame, so `horse_data_df` itself is left untouched)
features = horse_data_df.drop(columns="isFav")
features.head()

Unnamed: 0,class,age,distance,winningTime,RPR,res_win,condition_Fast,condition_Firm,condition_Frozen,condition_Good,...,condition_Muddy,condition_Sloppy,condition_Slow,condition_Soft,condition_Soft To Heavy,condition_Standard,condition_Standard To Slow,condition_Very Soft,condition_Yielding,condition_Yielding To Soft
0,4,4.0,2.0,253.88,103.0,1.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4,6.0,2.0,253.88,96.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4,7.0,2.0,253.88,97.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,6.0,2.0,253.88,89.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,7.0,2.0,253.88,82.0,0.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [107]:
# Define target vector: the `isFav` column as a 1-D NumPy array.
# `Series.to_numpy()` is used instead of `Series.ravel()`, which is deprecated
# in pandas 2.2+; a Series is already one-dimensional, so the values are the same.
target = horse_data_df["isFav"].to_numpy()
# Peek at the first five labels
target[:5]

array([0., 1., 0., 0., 0.])

In [108]:
# Partition features and labels into train/test subsets using
# train_test_split's default proportions; the fixed seed makes the shuffle repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(features, target, random_state=78)

In [109]:
# Creating a fresh, unfitted StandardScaler instance for the `isFav` feature set
scaler = StandardScaler()

In [110]:
 # Fitting the StandardScaler on the training features only, so that no
# statistics from the test partition are used
X_scaler = scaler.fit(X_train)

In [111]:
# Apply the train-fitted scaling to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

 ## Fitting the Random Forest Model

In [112]:
 # Create a random forest classifier: 500 trees, fixed seed for repeatability.
# n_jobs=-1 builds (and later queries) the trees on all available CPU cores
# instead of a single one; the fixed random_state still seeds every tree, so
# the fitted forest does not depend on the number of workers.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78, n_jobs=-1)

In [113]:
# Fitting the model on the scaled training features and their labels.
# The result is assigned back to `rf_model`, which also keeps the cell from
# echoing the estimator as output.
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Random Forest Model


In [114]:
# Making predictions using the testing data (scaled with the train-fitted scaler)
predictions = rf_model.predict(X_test_scaled)


## Model Evaluation

In [115]:
# Fraction of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_test, predictions)

# Confusion counts (rows: true labels, columns: predicted labels),
# wrapped in a labelled frame for display
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [116]:
# Report: confusion table, overall accuracy, then per-class metrics
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,30765,1172
Actual 1,3408,383


Accuracy Score : 0.8718092252575012
Classification Report
              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93     31937
         1.0       0.25      0.10      0.14      3791

    accuracy                           0.87     35728
   macro avg       0.57      0.53      0.54     35728
weighted avg       0.83      0.87      0.85     35728

