In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the sire/dam training sheet from CSV (the path is relative to the working directory).
rdata = pd.read_csv("HorseJump/playground/ml_code/datasheet_horse.csv")
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant;
# `df` is not referenced in the cells visible here — confirm before removing.
df = pd.DataFrame(rdata)

In [3]:
# Inspect the column dtypes; the displayed labels also expose stray leading spaces in the headers.
rdata.dtypes

Sire         int64
 Dam         int64
 y_actual    int64
dtype: object

In [4]:
# Remove every space character from the column labels (the raw headers carry leading
# spaces), so the columns can be addressed as 'Sire', 'Dam' and 'y_actual'.
rdata.columns = rdata.columns.str.replace(' ', '')

In [5]:
# Select the two predictor columns and the target column.
# NOTE(review): lowercase `x` is not referenced in the cells visible here, and the next
# cell rebuilds the same selection as `X` — confirm whether this cell is still needed.
x = rdata[['Sire', 'Dam']]
y = rdata['y_actual']

In [6]:
# Features are the sire and dam columns; the target is the recorded y_actual column.
X = rdata.loc[:, ['Sire', 'Dam']]
y = rdata.loc[:, 'y_actual']

# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion only (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Function to predict y value from user's input independent variables
def predict_jump_value(sire_value, dam_value):
    """Predict the jump value for one sire/dam pair using the fitted ``model``.

    Parameters
    ----------
    sire_value, dam_value : float
        Numeric inputs for the 'Sire' and 'Dam' features, in that order.

    Returns
    -------
    int
        The model's single prediction passed through the built-in ``round()``.
    """
    # The model was fitted on a DataFrame, so hand the same column labels back in;
    # a bare ndarray makes scikit-learn warn that X has no valid feature names.
    feature_names = list(getattr(model, "feature_names_in_", ["Sire", "Dam"]))
    input_values = pd.DataFrame([[sire_value, dam_value]], columns=feature_names)
    y_pred = model.predict(input_values)
    return round(y_pred[0])


In [7]:
# Prompt for the two model inputs; float() raises ValueError on non-numeric text.
# NOTE(review): interactive input blocks unattended "Run All" executions.
sire_value = float(input("Enter Sire value: "))
dam_value = float(input("Enter Dam value: "))

In [8]:
# Run the entered values through the fitted model and show the rounded prediction.
predicted_jump_value = predict_jump_value(sire_value, dam_value)
print("Predicted Jump Value:", predicted_jump_value)

Predicted Jump Value: 126




In [9]:
# Score the fitted model against every row of X (not only the held-out split).
# NOTE(review): X includes the rows used for fitting, so these figures are not a
# pure out-of-sample estimate — consider reporting X_test / y_test as well.
y_actual = np.array(y)
y_pred = np.array(model.predict(X))

# Error metrics: MAE, MSE, RMSE (square root of MSE) and the R-squared score.
mae = mean_absolute_error(y_actual, y_pred)
mse = mean_squared_error(y_actual, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_actual, y_pred)

# Report each metric rounded to three decimals (MSE itself is not printed).
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R-squared (R2) Score:", r2),
):
    print(metric_label, round(metric_value, 3))

Mean Absolute Error (MAE): 4.742
Root Mean Squared Error (RMSE): 6.108
R-squared (R2) Score: 0.827


In [10]:
from joblib import dump

In [11]:
# Persist the fitted model with joblib; dump() returns the list of file names it wrote,
# which is what this cell displays.
dump(model, 'HorseJump/playground/ml_code/savedModels/model_by_height.joblib')

['HorseJump/playground/ml_code/savedModels/model_by_height.joblib']

In [12]:
# Load the horse table (horse, sire and dam names with their levels);
# skipinitialspace=True drops the spaces that follow each delimiter.
horse_name = pd.read_csv('HorseJump/playground/ml_code/Horse Data.csv', skipinitialspace = True)

In [13]:
# Delete every space from the column labels, which is why later cells use
# labels such as "YearofBirth" and "Sire'slevel".
horse_name.columns = horse_name.columns.str.replace(' ', '')

In [14]:
# Preview the first rows to check the cleaned column labels and sample values.
horse_name.head()

Unnamed: 0,Horse,YearofBirth,Level,Sire,Sire'slevel,Dam,Dam'slevel
0,Diamant de Semilly ...,1991.0,170,Le Tot de Semilly,170,Venise des Cresles,150.0
1,Best MIX Joie De Toulon (aka: JOIE DE TOULON) ...,2009.0,170,Toulon,160,Best MIX Cigane Z,170.0
2,Casall (aka: CASALL IV) ...,2005.0,170,Casall ASK,170,F-Cecilia Bb,140.0
3,"Casall (aka: CASALL III, RUBNER'S CASALL) ...",2004.0,170,Casall ASK,170,Jenga,130.0
4,"VDL Cardento (aka: Cardento 933, Cardento vdl)...",1992.0,170,Capitol I,170,B-Estelle,150.0


In [15]:
# List the column labels after the space removal.
horse_name.columns

Index(['Horse', 'YearofBirth', 'Level', 'Sire', 'Sire'slevel', 'Dam',
       'Dam'slevel'],
      dtype='object')

In [16]:
def white_space_remover(df):
    """Strip leading and trailing whitespace from every text cell of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object/string columns are cleaned. It is modified in place.

    Returns
    -------
    None
        The frame is updated in place; nothing is returned.
    """
    for column in df.columns:
        series = df[column]
        is_text = pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)
        if not is_text:
            continue
        # Pass the callable itself to map(); calling str.strip() with no argument raises
        # TypeError. Only real strings are stripped, so NaN/None and other non-text
        # values in an object column pass through untouched.
        df[column] = series.map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

In [17]:
# Trim leading/trailing whitespace from the column labels.
# NOTE(review): an earlier cell already removes every space from these labels, so this
# presumably has no further effect unless other whitespace (tabs, newlines) is present.
horse_name.columns = horse_name.columns.str.strip()

In [18]:
# Distinct sire names, whitespace-trimmed, as a plain Python list.
sire_horse_unique = horse_name.Sire.str.strip().unique().tolist()

In [19]:
# Distinct dam names, whitespace-trimmed, as a plain Python list.
dam_horse_unique = horse_name.Dam.str.strip().unique().tolist()

In [20]:
# Every sire entry (duplicates kept), whitespace-trimmed, as a plain Python list.
sire_horse = horse_name.Sire.str.strip().tolist()

In [21]:
# Compare the number of distinct sires with the total number of sire entries.
display(len(sire_horse_unique), len(sire_horse))

997

1986

In [22]:
# Delete every space inside the parent names so each name becomes a single token.
for parent_column in ('Sire', 'Dam'):
    horse_name[parent_column] = horse_name[parent_column].str.replace(' ', '')

In [23]:
# Average recorded level for each sire and each dam, indexed by the parent's name.
mean_height_by_horse_male = horse_name["Sire'slevel"].groupby(horse_name['Sire']).mean()
mean_height_by_horse_female = horse_name["Dam'slevel"].groupby(horse_name['Dam']).mean()

In [24]:
# Turn both per-parent mean Series into plain {name: mean level} dictionaries.
male_horse_height, female_horse_height = (
    mean_levels.to_dict()
    for mean_levels in (mean_height_by_horse_male, mean_height_by_horse_female)
)

In [25]:
# Display the sire-name -> mean "Sire'slevel" mapping built in the previous cell.
# NOTE(review): this prints the whole dictionary; a short sample would keep the output readable.
male_horse_height

{'A-DurI': 140.0,
 'APikachudeMuze': 140.0,
 'Abajo': 110.0,
 'Abdullah': 170.0,
 'Abisko': 110.0,
 'Abke': 160.0,
 'AbleAlbert': 100.0,
 'Acadius': 110.0,
 'AcajouDeLaMarchette': 140.0,
 'Acamani': 170.0,
 'AcobatII': 170.0,
 'AcoradoII': 140.0,
 'AcordII': 140.0,
 'ActionBreaker': 170.0,
 'AdlantusAs': 140.0,
 'Adorado': 170.0,
 'AerlineH949': 140.0,
 'AganixduSeigneur': 150.0,
 'Ahorn': 140.0,
 'AhornZ': 120.0,
 'AirJordan': 135.0,
 'AlCapone': 170.0,
 'Aldato': 140.0,
 'AldatusZ': 140.0,
 'Aletto': 110.0,
 'AliandroB': 130.0,
 'AllWins': 110.0,
 'Allegreto': 160.0,
 'Almoretto': 130.0,
 'AlmoxPrintsJ': 170.0,
 'AlpenFürst517': 110.0,
 'Amadeus': 170.0,
 'Amethist': 140.0,
 'Amsterdam': 170.0,
 'Andiamo': 150.0,
 'Apache': 170.0,
 "Apached'Adriers": 160.0,
 'Aram': 110.0,
 'ArezzoVDL': 160.0,
 'ArgentanI': 125.0,
 'Argentinus': 150.0,
 'Argus': 150.0,
 'Arizona': 110.0,
 'ArkoIII': 170.0,
 'ArmaniCode': 140.0,
 'Armitage': 140.0,
 'Aromats': 170.0,
 'Arpeggio': 150.0,
 'Art': 140.0,