In [1]:
# Import our dependencies
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import datasets
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

**Preparing the data For Model1 - where we are predicting if the 'favorite' horse wins, we used two datasets from "Kaggle" as listed below:**

*dataset 1 = races_* columns description: 
- rid - Race id; 
- Distance
- winningTime - Best time shown
- metric - Distance in meters
- class - class type (created from rclass feature).

*dataset 2 = horses_* columns description: 
- rid - Race id (column used to merge the 2 datasets)
- age - Horse age; 
- saddle - Saddle # where horse starts; 
- isFav - Was horse favorite before start? Can be more then one fav in a race;  
- position - Finishing position, 40 if horse didn't finish; 
- positionL - how far a horse has finished from the pursued horse, horses corpses; 
- outHandicap - Handicap; 
- RPR - RP Rating; 
- TR - Topspeed; 
- OR - Official Rating father - Horse's Father name; 
- runners - Runners total; 
- weight - Horse weight in kg; 
- res_win - Horse won or not; 


### For data clean up pls refer to CleanUp file 

# **Preprocessing Model 2 - Characteristics of a Horse winner**

 Step 1: Read the filtered_df_tree.csv data from the Resources folder into a Pandas DataFrame.

In [2]:
decision_tree = pd.read_csv('Resources/coverted_tree_df.csv')
decision_tree.head(5)

Unnamed: 0,rid,winningTime,metric,ncond,class,condition,horseName,trainerName,jockeyName,age,...,outHandicap,RPR,saddle,father,TR,OR,runners,weight,res_win,res_place
0,10312,253.88,3218.0,10,4,Good To Soft,Waterproof,Shaun Keightley,Brendan Powell,4.0,...,2.541359,103.0,9.0,Pour Moi,103.0,80.446685,9.0,66.0,1.0,1.0
1,10312,253.88,3218.0,10,4,Good To Soft,Eva's Diva,Phil Middleton,Mr Sam Lee,6.0,...,2.541359,96.0,7.0,Getaway,89.0,110.0,9.0,65.0,0.0,1.0
2,10312,253.88,3218.0,10,4,Good To Soft,Incredible Dream,Conrad Allen,Joshua Moore,7.0,...,2.541359,97.0,5.0,Vale Of York,88.0,80.446685,9.0,72.0,0.0,1.0
3,10312,253.88,3218.0,10,4,Good To Soft,Hats Off To Larry,Mick Channon,Marc Goldstein,6.0,...,2.541359,89.0,4.0,Sixties Icon,75.0,80.446685,9.0,72.0,0.0,0.0
4,10312,253.88,3218.0,10,4,Good To Soft,Taqwaa,Laura Morgan,Richie McLernon,7.0,...,2.541359,82.0,6.0,Iffraaj,67.0,80.446685,9.0,72.0,0.0,0.0


In [3]:
# Determine the number of unique values in each column.
unique_application = decision_tree.nunique()
print(unique_application)

rid            14077
winningTime     8692
metric            53
ncond             18
class              8
condition         18
horseName      44803
trainerName     4250
jockeyName      2822
age               16
isFav              2
position          35
distance           8
outHandicap       27
RPR              178
saddle            36
father          2691
TR               158
OR               171
runners           27
weight            34
res_win            2
res_place          2
dtype: int64


In [4]:
if "res_win" in decision_tree.columns:
    print("Column 'res_win' is present in the dataset.")
else:
    print("Column 'res_win' is not present in the dataset.")

Column 'res_win' is present in the dataset.


In [5]:
# Look at res_win value counts for binning - where 0=did not win; 1=won
application_type_counts = decision_tree['res_win'].value_counts()
application_type_counts

res_win
0.0    128817
1.0     14094
Name: count, dtype: int64

In [6]:
# Check for missing values (NaN)
nan_columns = decision_tree.columns[decision_tree.isnull().any()].tolist()
print("Columns with NaN values:")
print(nan_columns)

Columns with NaN values:
[]


In [7]:
# columns_with_null = ['distance']

# # Replace null values with the mean of each column
# for column in columns_with_null:
#     decision_tree[column].fillna(decision_tree[column].mean(), inplace=True)

# # Verify that null values are replaced
# nan_columns = decision_tree.columns[decision_tree.isnull().any()].tolist()
# print("Columns with NaN values after replacement:")
# print(nan_columns)

In [8]:
num_rows = decision_tree.shape[0]
num_columns = decision_tree.shape[1]
print("Number of Rows:", num_rows)
print("Number of Columns:", num_columns)

Number of Rows: 142911
Number of Columns: 23


In [9]:
sampled_df = decision_tree.sample(n=15000, random_state=42)
num_rows = sampled_df.shape[0]
num_columns = sampled_df.shape[1]
print("Number of Rows:", num_rows)
print("Number of Columns:", num_columns)

Number of Rows: 15000
Number of Columns: 23


In [10]:
encoded_df = pd.get_dummies(sampled_df)


### Further steps in the model-building process: feature selection, model selection, training, and evaluation ###

In [11]:
 # Define features set
features = encoded_df.copy()
features.drop("res_win", axis=1, inplace=True)
features.head()


Unnamed: 0,rid,winningTime,metric,ncond,class,age,isFav,position,distance,outHandicap,...,father_Zambezi Sun,father_Zamindar,father_Zanzibari,father_Zebedee,father_Zed,father_Zenno Rob Roy,father_Zerpour,father_Zivo,father_Zoffany,father_Zoustar
101353,410815,222.4,3218.0,1,4,6.0,0.0,5.0,2.0,2.541359,...,False,False,False,False,False,False,False,False,False,False
29275,401605,93.58,1407.0,12,0,3.0,0.0,11.0,1407.0,2.541359,...,False,False,False,False,False,False,False,False,False,False
92053,409839,107.79,1810.0,8,0,4.0,0.0,1.0,1.0,2.541359,...,False,False,False,False,False,False,False,False,False,False
38706,404193,300.77,4323.5,3,0,6.0,0.0,40.0,2.0,2.541359,...,False,False,False,False,False,False,False,False,False,False
86312,409139,88.24,1407.0,0,5,9.0,0.0,11.0,1407.0,2.541359,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Define target vector
target = encoded_df["res_win"].values.reshape(-1, 1)
target[:5]


array([[0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [14]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=78)

In [15]:
# Creating StandardScaler instance
scaler = StandardScaler()


In [16]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)


In [17]:
 # Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

 ## Fitting the Decision Tree Model

In [18]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [19]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

##  Making Predictions Using the Tree Model

In [20]:
 # Making predictions using the testing data
predictions = model.predict(X_test_scaled)

 ## Model Evaluation

In [26]:
 # Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [22]:
 # Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3374,0
Actual 1,0,376


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3374
         1.0       1.00      1.00      1.00       376

    accuracy                           1.00      3750
   macro avg       1.00      1.00      1.00      3750
weighted avg       1.00      1.00      1.00      3750



In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris


# Load the dataset
iris = load_iris()
X, y = iris.data, iris.target

# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Perform cross-validation
scores = cross_val_score(clf, X, y, cv=5)  # cv=5 specifies 5-fold cross-validation

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)
print("Mean Score:", scores.mean())

Cross-Validation Scores: [0.96666667 0.96666667 0.9        1.         1.        ]
Mean Score: 0.9666666666666668
