In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

import pickle

from sklearn.model_selection import train_test_split

# Regression Models
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

import tensorflow as tf

# Regression Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [None]:
# Read the training table from the EDA folder, report its
# (rows, columns) shape, and preview the first rows as the cell output.
TRAIN_CSV = '../eda/train.csv'

final_df = pd.read_csv(TRAIN_CSV)
print(final_df.shape)
final_df.head()


In [None]:
# Heatmap of the pairwise correlations between the columns of final_df.
#
# DataFrame.corr() is only defined for numeric data. Recent pandas versions
# raise if the frame contains a non-numeric column (older ones dropped such
# columns silently), so the numeric/bool columns are selected explicitly to
# get the same result on every version.
numeric_df = final_df.select_dtypes(include=['number', 'bool'])

fig = px.imshow(
    numeric_df.corr(),
    labels=dict(color="Correlation")
)

fig.update_layout(
    title="Correlation Heatmap",
)

fig.show()


In [None]:
# Predictor columns, one per line. The order is kept fixed because it
# determines the column order of X that every model is fitted on.
FEATURES = [
    'draw_segments',
    'horse_number_segments',
    'race_distance',
    'temperature_2m_max',
    'wind_speed_10m_max',
    'proportion_of_additional_weight',
    'track_width',
    'track_moisture',
    'colour_segment',
    'sex_segment',
    'country_segment',
    'win_freq',
    'place_freq',
    'average_placing',
    'average_speed',
]

# Regression target column.
FINISH_TIME = 'finish_time_s'

X, y = final_df[FEATURES], final_df[FINISH_TIME]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Shapes of the full (pre-split) feature matrix and target, one per line.
print(X.shape, y.shape, sep='\n')


In [None]:
# Fit every candidate regressor on the training split, then predict on the
# held-out test split. Each fit prints a short progress message.

# --- Linear models -----------------------------------------------------
linear = LinearRegression()
linear.fit(X_train, y_train)
print("Linear Regression fitted")

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
print("Lasso Regression fitted")

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
print("Ridge Regression fitted")

# --- Tree ensembles (seeded through random_state) ----------------------
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=123)
gbr.fit(X_train, y_train)
print("Gradient Boosting Regression fitted")

rfr = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=123)
rfr.fit(X_train, y_train)
print("Random Forest Regression fitted")

# --- Neural network ----------------------------------------------------
# Seed the Python, NumPy and TensorFlow RNGs before the network is built so
# that weight initialisation and batch shuffling are repeatable across
# "Restart & Run All", in line with the seeded tree ensembles above.
# NOTE: this also reseeds the global `random` / `numpy.random` state.
tf.keras.utils.set_random_seed(123)

# Two hidden ReLU layers of 64 units and a single linear output unit.
nn = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

nn.compile(optimizer='adam', loss='mse', metrics=['mae'])
# 20% of the training rows are set aside by Keras for per-epoch validation.
history = nn.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
print("Neural Network fitted")

# --- Test-set predictions ----------------------------------------------
y_pred_linear = linear.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_gbr = gbr.predict(X_test)
y_pred_rfr = rfr.predict(X_test)
# The network ends in Dense(1), so this is expected to be a column of shape
# (n_samples, 1) rather than a flat vector like the sklearn predictions.
y_pred_nn = nn.predict(X_test)


In [None]:
# Display names of the models, in the order used for every bar trace below.
MODELS = [
    'Linear Regression', 'Lasso Regression', 'Ridge Regression', 
    'Gradient Boosting Regression', 'Random Forest Regression', 
    'Neural Network'
]


def _rmse_r2_mae(y_true, y_pred):
    """Return ``(rmse, r2, mae)`` for one set of predictions.

    RMSE is computed as ``sqrt(MSE)`` instead of passing ``squared=False`` to
    ``mean_squared_error``: that keyword is deprecated in recent scikit-learn
    releases and no longer accepted by the newest ones.

    Predictions are flattened first so that a column-shaped array (such as
    the output of a Keras model with a single output unit) is scored the
    same way as a flat sklearn prediction vector.
    """
    y_pred = np.ravel(y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred)


# NOTE: the mse_* variables hold the ROOT mean squared error (as they always
# did); the names are kept unchanged so any later cell using them still works.
mse_linear, r2_linear, mae_linear = _rmse_r2_mae(y_test, y_pred_linear)
mse_lasso, r2_lasso, mae_lasso = _rmse_r2_mae(y_test, y_pred_lasso)
mse_ridge, r2_ridge, mae_ridge = _rmse_r2_mae(y_test, y_pred_ridge)
mse_gbr, r2_gbr, mae_gbr = _rmse_r2_mae(y_test, y_pred_gbr)
mse_rfr, r2_rfr, mae_rfr = _rmse_r2_mae(y_test, y_pred_rfr)
mse_nn, r2_nn, mae_nn = _rmse_r2_mae(y_test, y_pred_nn)

# One grouped bar trace per metric; each bar is labelled with its value
# rounded to five decimals. Values are listed in the same order as MODELS.
metric_traces = [
    ('Root Mean Squared Error', [mse_linear, mse_lasso, mse_ridge, mse_gbr, mse_rfr, mse_nn]),
    ('Mean Absolute Error', [mae_linear, mae_lasso, mae_ridge, mae_gbr, mae_rfr, mae_nn]),
    ('R-squared', [r2_linear, r2_lasso, r2_ridge, r2_gbr, r2_rfr, r2_nn]),
]

fig = go.Figure()
for metric_name, metric_values in metric_traces:
    fig.add_trace(go.Bar(
        x=MODELS,
        y=metric_values,
        name=metric_name,
        text=[f'{value:.5f}' for value in metric_values],
        textposition='auto'
    ))

fig.update_layout(
    title='Regression Models Performance Metrics',
    xaxis_title='Regression Model',
    yaxis_title='Value'
)

fig.show()


#### Save Regression Models

In [None]:
# Pickle the fitted Lasso, Ridge, gradient-boosting and random-forest
# regressors, one file per model, in this order. Only these four models are
# written here; `linear` and `nn` are not saved by this cell.
models_to_save = (
    ("regression/finish_time_lasso_model.pkl", lasso),
    ("regression/finish_time_ridge_model.pkl", ridge),
    ("regression/finish_time_gbr_model.pkl", gbr),
    ("regression/finish_time_rfr_model.pkl", rfr),
)

for model_path, fitted_model in models_to_save:
    # Binary write mode is required for pickle output.
    with open(model_path, "wb") as file:
        pickle.dump(fitted_model, file)
