In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

import pickle

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

# Classification Metrics
from sklearn.metrics import cohen_kappa_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support


In [None]:
# Read the training CSV that lives in the sibling ``eda`` directory.
final_df = pd.read_csv(filepath_or_buffer='../eda/train.csv')

# Echo the (rows, columns) tuple, then let the cell render the leading rows.
print(final_df.shape)
final_df.head(n=5)


In [None]:
# Binary target: 1 when finishing_position is 3 or lower, otherwise 0.
# The comparison is vectorised rather than calling a Python lambda once per
# row; missing positions compare False and therefore still map to 0.
final_df['finishing_position_class'] = (
    final_df['finishing_position'].le(3).astype('int64')
)

# Display how many rows fall into each of the two classes.
final_df['finishing_position_class'].value_counts()


In [None]:
# Columns selected from final_df as the model inputs (X).
FEATURES = [
    'draw_segments',
    'horse_number_segments',
    'race_distance',
    'temperature_2m_max',
    'wind_speed_10m_max',
    'proportion_of_additional_weight',
    'track_width',
    'track_moisture',
    'colour_segment',
    'sex_segment',
    'country_segment',
    'win_freq',
    'place_freq',
    'average_placing',
    'average_speed',
    'average_rating',
    'average_race_class',
]

# Name of the 0/1 target column.
FINISHING_POSITION_CLASS = 'finishing_position_class'

# Feature matrix and target vector taken from the full table.
X = final_df.loc[:, FEATURES]
y = final_df.loc[:, FINISHING_POSITION_CLASS]

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified on y — confirm that is intended
# given the class-balancing section further down the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Shapes of the full (pre-split) feature matrix and target.
print(X.shape)
print(y.shape)


In [None]:
# Every classifier below is fitted on the training split and then used to
# label the held-out test rows; the y_pred_* arrays feed the metrics cell.

# Logistic regression: at most 300 solver iterations, seeded.
lr = LogisticRegression(random_state=42, max_iter=300)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random forest: 10 trees split on entropy, seeded.
rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

# XGBoost: 100 estimators with learning rate 0.1.
xgbc = xgb.XGBClassifier(learning_rate=0.1, n_estimators=100)
xgbc.fit(X_train, y_train)
y_pred_xgbc = xgbc.predict(X_test)

# CatBoost: 100 iterations, learning rate 0.1, tree depth 6, training log off.
cbc = CatBoostClassifier(
    depth=6,
    iterations=100,
    learning_rate=0.1,
    verbose=False,
)
cbc.fit(X_train, y_train)
y_pred_cbc = cbc.predict(X_test)


In [None]:
# Test-set predictions in the fixed order LR, RFC, XGBC, CBC; every tuple
# unpacking below relies on this order.
_test_predictions = (y_pred_lr, y_pred_rfc, y_pred_xgbc, y_pred_cbc)

# Overall accuracy: one scalar per model.
lr_acc, rfc_acc, xgbc_acc, cbc_acc = [
    accuracy_score(y_test, _pred) for _pred in _test_predictions
]

# average=None yields one score per class instead of a single aggregate,
# so each of the names below holds an array.
lr_prec, rfc_prec, xgbc_prec, cbc_prec = [
    precision_score(y_test, _pred, average=None) for _pred in _test_predictions
]
lr_rec, rfc_rec, xgbc_rec, cbc_rec = [
    recall_score(y_test, _pred, average=None) for _pred in _test_predictions
]
lr_f1, rfc_f1, xgbc_f1, cbc_f1 = [
    f1_score(y_test, _pred, average=None) for _pred in _test_predictions
]

# Printed report: one group of four lines per metric, with a blank line
# between consecutive groups.
_model_tags = ("LR", "RFC", "XGBC", "CBC")
_report_groups = (
    ("Accuracy", (lr_acc, rfc_acc, xgbc_acc, cbc_acc)),
    ("Precision", (lr_prec, rfc_prec, xgbc_prec, cbc_prec)),
    ("Recall", (lr_rec, rfc_rec, xgbc_rec, cbc_rec)),
    ("F1", (lr_f1, rfc_f1, xgbc_f1, cbc_f1)),
)
for _group_index, (_metric_label, _scores) in enumerate(_report_groups):
    if _group_index > 0:
        print()
    for _tag, _score in zip(_model_tags, _scores):
        print(f"{_tag} {_metric_label}:", _score)


In [None]:
# Display names for the x-axis, in the same order as the metric lists below.
MODELS = [
    'Logistic Regression', 'Random Forest Classifier',
    'XGBoost Classifier', 'CatBoost Classifier'
]

# One bar trace per metric. The legend names now describe what is actually
# plotted; they previously carried regression-metric labels (RMSE, MAE,
# R-squared) although the values are classification scores.
# Precision, recall and F1 are per-class arrays; index 1 is taken, i.e. the
# second class in sorted label order (label 1 for a 0/1 target).
METRIC_SERIES = {
    'Accuracy': [lr_acc, rfc_acc, xgbc_acc, cbc_acc],
    'Precision (class 1)': [lr_prec[1], rfc_prec[1], xgbc_prec[1], cbc_prec[1]],
    'Recall (class 1)': [lr_rec[1], rfc_rec[1], xgbc_rec[1], cbc_rec[1]],
    'F1 (class 1)': [lr_f1[1], rfc_f1[1], xgbc_f1[1], cbc_f1[1]],
}

fig = go.Figure()
for metric_name, metric_values in METRIC_SERIES.items():
    fig.add_trace(go.Bar(
        x=MODELS,
        y=metric_values,
        name=metric_name,
        # Bar labels show each score rounded to five decimals.
        text=[f'{value:.5f}' for value in metric_values],
        textposition='auto'
    ))
fig.update_layout(
    title='Classification Models Performance Metrics',
    xaxis_title='Classification Model',
    yaxis_title='Value'
)

fig.show()


#### Perform SMOTE to balance classes on training data

In [None]:
# smote = SMOTE()

# X_train_s, y_train_s = smote.fit_resample(X_train, y_train)


In [None]:
# lr = LogisticRegression(max_iter=300, random_state=42)
# lr.fit(X_train_s, y_train_s)

# rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
# rfc.fit(X_train_s, y_train_s)

# xgbc = xgb.XGBClassifier(
#     n_estimators=100,
#     learning_rate=0.1
# )
# xgbc.fit(X_train_s, y_train_s)

# cbc = CatBoostClassifier(
#     iterations=100,
#     learning_rate=0.1,
#     depth=6,
#     verbose=False
# )
# cbc.fit(X_train_s, y_train_s)

# y_pred_lr = lr.predict(X_test)
# y_pred_rfc = rfc.predict(X_test)
# y_pred_xgbc = xgbc.predict(X_test)
# y_pred_cbc = cbc.predict(X_test)


In [None]:
# lr_acc = accuracy_score(y_test, y_pred_lr)
# rfc_acc = accuracy_score(y_test, y_pred_rfc)
# xgbc_acc = accuracy_score(y_test, y_pred_xgbc)
# cbc_acc = accuracy_score(y_test, y_pred_cbc)

# lr_prec = precision_score(y_test, y_pred_lr, average=None)
# rfc_prec = precision_score(y_test, y_pred_rfc, average=None)
# xgbc_prec = precision_score(y_test, y_pred_xgbc, average=None)
# cbc_prec = precision_score(y_test, y_pred_cbc, average=None)

# lr_rec = recall_score(y_test, y_pred_lr, average=None)
# rfc_rec = recall_score(y_test, y_pred_rfc, average=None)
# xgbc_rec = recall_score(y_test, y_pred_xgbc, average=None)
# cbc_rec = recall_score(y_test, y_pred_cbc, average=None)

# lr_f1 = f1_score(y_test, y_pred_lr, average=None)
# rfc_f1 = f1_score(y_test, y_pred_rfc, average=None)
# xgbc_f1 = f1_score(y_test, y_pred_xgbc, average=None)
# cbc_f1 = f1_score(y_test, y_pred_cbc, average=None)

# print("LR Accuracy:", lr_acc)
# print("RFC Accuracy:", rfc_acc)
# print("XGBC Accuracy:", xgbc_acc)
# print("CBC Accuracy:", cbc_acc)
# print()
# print("LR Precision:", lr_prec)
# print("RFC Precision:", rfc_prec)
# print("XGBC Precision:", xgbc_prec)
# print("CBC Precision:", cbc_prec)
# print()
# print("LR Recall:", lr_rec)
# print("RFC Recall:", rfc_rec)
# print("XGBC Recall:", xgbc_rec)
# print("CBC Recall:", cbc_rec)
# print()
# print("LR F1:", lr_f1)
# print("RFC F1:", rfc_f1)
# print("XGBC F1:", xgbc_f1)
# print("CBC F1:", cbc_f1)


#### Save Classification Models

In [None]:
# Serialise each fitted classifier to its own pickle file under classification/.
# The XGBoost filename used to be misspelled ("finish_positionn_..."); it now
# follows the same finish_position_<model>_model.pkl pattern as the others, so
# any loader that still points at the old name must be updated to match.
# NOTE: only unpickle these files from trusted locations, since pickle.load
# can execute arbitrary code.
MODEL_FILES = (
    ("classification/finish_position_lr_model.pkl", lr),
    ("classification/finish_position_rfc_model.pkl", rfc),
    ("classification/finish_position_xgbc_model.pkl", xgbc),
    ("classification/finish_position_cbc_model.pkl", cbc),
)

for model_path, fitted_model in MODEL_FILES:
    # The context manager closes the file even if pickling fails.
    with open(model_path, "wb") as file:
        pickle.dump(fitted_model, file)
