In [425]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, make_scorer

In [426]:
# define mean error scorer

def mean_error_(y_true, y_pred):
    """It assumes y_true and y_pred are numpy arrays. 
   
    Parameters
    ----------
    y_true : numpy array 
        Ground truth labels
    y_pred : numpy array 
        model predictions
    

    Returns
    -------
    MA : float

        MA(Mean Error): over-predicting when MA > 0 or under-predicting when MA < 0.

    """
    return np.mean(y_pred-y_true)

# define score for model trainer
mean_error_scorer = make_scorer(mean_error_, greater_is_better=False)


In [427]:
# load dataframe dataset and print first rows
df = pd.read_csv("MarathonData.csv")
df.head()

Unnamed: 0,id,Marathon,Name,Category,km4week,sp4week,CrossTraining,Wall21,MarathonTime,CATEGORY
0,1,Prague17,Blair MORGAN,MAM,132.8,14.434783,,1.16,2.37,A
1,2,Prague17,Robert Heczko,MAM,68.6,13.674419,,1.23,2.59,A
2,3,Prague17,Michon Jerome,MAM,82.7,13.520436,,1.3,2.66,A
3,4,Prague17,Daniel Or lek,M45,137.5,12.258544,,1.32,2.68,A
4,5,Prague17,Luk ? Mr zek,MAM,84.6,13.945055,,1.36,2.74,A


In [428]:
# Print shape
df.shape

(87, 10)

In [429]:
# Print a summary of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             87 non-null     int64  
 1   Marathon       87 non-null     object 
 2   Name           87 non-null     object 
 3   Category       81 non-null     object 
 4   km4week        87 non-null     float64
 5   sp4week        87 non-null     float64
 6   CrossTraining  13 non-null     object 
 7   Wall21         87 non-null     object 
 8   MarathonTime   87 non-null     float64
 9   CATEGORY       87 non-null     object 
dtypes: float64(3), int64(1), object(6)
memory usage: 6.9+ KB


In [430]:
# Print statistics of the dataframe
df.describe()

Unnamed: 0,id,km4week,sp4week,MarathonTime
count,87.0,87.0,87.0,87.0
mean,44.0,62.347126,139.840706,3.31908
std,25.258662,26.956019,1191.427864,0.376923
min,1.0,17.9,8.031414,2.37
25%,22.5,44.2,11.498168,3.045
50%,44.0,58.8,12.163424,3.32
75%,65.5,77.5,12.854036,3.605
max,87.0,137.5,11125.0,3.98


In [431]:
# Unique values for Wall21 Colunmns
# Here we can see some '-' values
df['Wall21'].unique()

array(['1.16', '1.23', '1.30', '1.32', '1.36', '1.38', '1.41', '1.35',
       '1.42', '1.40', '1.37', '1.44', '1.45', '1.50', '1.48', '1.52',
       ' -   ', '1.54', '1.57', '1.62', '1.60', '1.51', '1.58', '1.56',
       '1.61', '1.59', '1.68', '1.78', '1.67', '1.66', '1.55', '1.64',
       '1.65', '1.77', '1.71', '1.72', '1.76', '1.63', '1.69', '1.88',
       '1.93', '1.74', '1.75', '1.80', '1.81', '1.94', '1.90', '1.85',
       '1.97', '1.98', '2.05', '2.02'], dtype=object)

In [432]:
# Print unique categories values from this column
# we have a nan rows values
df['Category'].unique()

array(['MAM', 'M45', 'M40', 'M50', 'M55', nan, 'WAM'], dtype=object)

In [433]:
# Drop unnecessary data (NaN info and string columns)
drop_columns = ['id', 'Marathon', 'Name', 'CrossTraining']
df.drop(columns=drop_columns, inplace=True)

# Print first rows
df.head()

Unnamed: 0,Category,km4week,sp4week,Wall21,MarathonTime,CATEGORY
0,MAM,132.8,14.434783,1.16,2.37,A
1,MAM,68.6,13.674419,1.23,2.59,A
2,MAM,82.7,13.520436,1.3,2.66,A
3,M45,137.5,12.258544,1.32,2.68,A
4,MAM,84.6,13.945055,1.36,2.74,A


In [434]:
# Remove nan rows
df.dropna(inplace=True)

In [435]:
# Print unique categories values from this column
# this time nan unique rows are removed from dataframe
df['Category'].unique()

array(['MAM', 'M45', 'M40', 'M50', 'M55', 'WAM'], dtype=object)

In [436]:
# Get the mode to replace '-' values
mod_value = df["Wall21"].mode()
mod_value

0    1.67
Name: Wall21, dtype: object

In [437]:
#  Replace '-' values
df.replace({"-":mod_value}, inplace=True)

In [438]:
# change datatype to float64 for Wall21 Column
df = df.astype({'Wall21' : 'float64'}, copy=True)

In [439]:
# Print Wall21 values
df['Wall21'].unique()

array([1.16, 1.23, 1.3 , 1.32, 1.36, 1.38, 1.41, 1.35, 1.42, 1.4 , 1.37,
       1.44, 1.45, 1.5 , 1.48, 1.52, 1.54, 1.57, 1.62, 1.6 , 1.51, 1.58,
       1.56, 1.61, 1.59, 1.68, 1.78, 1.67, 1.66, 1.55, 1.64, 1.65, 1.77,
       1.71, 1.72, 1.76, 1.63, 1.69, 1.88, 1.93, 1.74, 1.75, 1.8 , 1.81,
       1.94, 1.9 , 1.85, 1.97, 1.98, 2.05, 2.02])

In [440]:
# Get Ground truth , Target labels: Marathon Time
y = df.iloc[:, 4].values

In [441]:
# Remove MarathonTime Column from DataFrame
X = df.drop(columns="MarathonTime")
X = X.values

In [442]:
# Print X features vector
X

array([['MAM', 132.8, 14.43478261, 1.16, 'A'],
       ['MAM', 68.6, 13.6744186, 1.23, 'A'],
       ['MAM', 82.7, 13.52043597, 1.3, 'A'],
       ['M45', 137.5, 12.25854383, 1.32, 'A'],
       ['MAM', 84.6, 13.94505495, 1.36, 'A'],
       ['M40', 42.2, 13.61290323, 1.32, 'A'],
       ['M40', 89.0, 12.59433962, 1.38, 'A'],
       ['M45', 106.0, 12.69461078, 1.41, 'A'],
       ['MAM', 70.0, 13.7704918, 1.38, 'A'],
       ['M45', 84.2, 13.36507937, 1.35, 'A'],
       ['MAM', 93.5, 13.2, 1.42, 'A'],
       ['M50', 65.7, 13.36271186, 1.4, 'A'],
       ['M45', 53.5, 14.07894737, 1.37, 'A'],
       ['M40', 84.4, 13.83606557, 1.41, 'A'],
       ['MAM', 76.8, 12.94382022, 1.44, 'A'],
       ['MAM', 76.1, 14.9704918, 1.45, 'A'],
       ['M55', 112.3, 12.59439252, 1.44, 'A'],
       ['M40', 49.7, 14.33653846, 1.44, 'A'],
       ['MAM', 84.5, 12.54950495, 1.45, 'A'],
       ['MAM', 76.7, 8.031413613, 1.41, 'A'],
       ['MAM', 94.5, 11.88679245, 1.45, 'A'],
       ['M40', 67.3, 13.23934426, 1.5, 'B'

In [443]:
# Print y target labels
y

array([2.37, 2.59, 2.66, 2.68, 2.74, 2.78, 2.81, 2.84, 2.83, 2.86, 2.87,
       2.87, 2.88, 2.88, 2.89, 2.9 , 2.91, 2.91, 2.93, 2.94, 2.99, 3.04,
       3.05, 3.09, 3.1 , 3.12, 3.14, 3.15, 3.16, 3.19, 3.19, 3.21, 3.21,
       3.22, 3.23, 3.24, 3.24, 3.25, 3.28, 3.28, 3.32, 3.32, 3.32, 3.33,
       3.33, 3.35, 3.36, 3.39, 3.4 , 3.46, 3.47, 3.5 , 3.5 , 3.51, 3.52,
       3.52, 3.55, 3.55, 3.56, 3.56, 3.59, 3.62, 3.64, 3.64, 3.65, 3.67,
       3.68, 3.69, 3.69, 3.76, 3.78, 3.8 , 3.87, 3.89, 3.9 , 3.92, 3.93,
       3.93, 3.94, 3.95, 3.98])

In [444]:
X[:,0]

array(['MAM', 'MAM', 'MAM', 'M45', 'MAM', 'M40', 'M40', 'M45', 'MAM',
       'M45', 'MAM', 'M50', 'M45', 'M40', 'MAM', 'MAM', 'M55', 'M40',
       'MAM', 'MAM', 'MAM', 'M40', 'M40', 'M45', 'MAM', 'WAM', 'MAM',
       'M45', 'M50', 'MAM', 'MAM', 'M45', 'M50', 'MAM', 'MAM', 'M45',
       'MAM', 'WAM', 'M45', 'M55', 'MAM', 'MAM', 'M40', 'MAM', 'WAM',
       'M45', 'M45', 'MAM', 'MAM', 'M40', 'M40', 'MAM', 'MAM', 'MAM',
       'MAM', 'M40', 'M40', 'MAM', 'M45', 'MAM', 'M40', 'M45', 'MAM',
       'MAM', 'MAM', 'MAM', 'M45', 'M50', 'WAM', 'MAM', 'MAM', 'MAM',
       'MAM', 'MAM', 'MAM', 'MAM', 'M55', 'M40', 'M40', 'M45', 'M40'],
      dtype=object)

In [445]:
# Use Label Encoder for encode string labels with values between 0 and 0_classes-1
label_encoder = LabelEncoder()
# Encode Category column values
X[:,0] = label_encoder.fit_transform(X[:,0])

X[:,-1] = label_encoder.fit_transform(X[:,-1])

In [446]:
# Print shape of features vector
X.shape

(81, 5)

In [447]:
# Print shape of target vector
y.shape

(81,)

In [448]:
# Create a split for training a CatBoostRegressor
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=15)


In [449]:

# Initialize CatBoostRegressor 
model = CatBoostRegressor(iterations=50,
                          learning_rate=0.3,
                          depth=3)

# Fit model
model.fit(X_train, y_train)

# Get Predictions
preds = model.predict(X_test)


0:	learn: 0.3177347	total: 346us	remaining: 17ms
1:	learn: 0.2645547	total: 581us	remaining: 13.9ms
2:	learn: 0.2157343	total: 798us	remaining: 12.5ms
3:	learn: 0.1819373	total: 923us	remaining: 10.6ms
4:	learn: 0.1576943	total: 1.04ms	remaining: 9.41ms
5:	learn: 0.1392897	total: 1.16ms	remaining: 8.49ms
6:	learn: 0.1221455	total: 1.38ms	remaining: 8.49ms
7:	learn: 0.1106106	total: 1.5ms	remaining: 7.89ms
8:	learn: 0.1024631	total: 1.62ms	remaining: 7.39ms
9:	learn: 0.0959535	total: 1.73ms	remaining: 6.94ms
10:	learn: 0.0932585	total: 1.84ms	remaining: 6.54ms
11:	learn: 0.0884180	total: 2.08ms	remaining: 6.59ms
12:	learn: 0.0835765	total: 2.2ms	remaining: 6.26ms
13:	learn: 0.0813244	total: 2.31ms	remaining: 5.95ms
14:	learn: 0.0792845	total: 2.44ms	remaining: 5.7ms
15:	learn: 0.0764639	total: 2.57ms	remaining: 5.46ms
16:	learn: 0.0743575	total: 2.71ms	remaining: 5.27ms
17:	learn: 0.0730390	total: 2.87ms	remaining: 5.11ms
18:	learn: 0.0721670	total: 3ms	remaining: 4.89ms
19:	learn: 0.07

In [450]:
# Print predictios
preds


array([2.90327744, 3.61399269, 3.60089444, 3.43673058, 2.88092411,
       3.56651449, 3.76089332, 3.46050017, 3.30419952, 3.86751116,
       3.40947514, 3.38357176, 3.12854908, 2.68188027, 2.82812938])

In [451]:
# Print targets
y_test

array([2.91, 3.52, 3.5 , 3.55, 2.88, 3.67, 3.76, 3.32, 3.33, 3.89, 3.52,
       3.32, 3.14, 2.66, 2.87])

In [452]:
# Compute metrics
# R2 score, me, mse, mae
r2_score_res = r2_score(y_test, preds)
mean_error_res = mean_error_(y_test, preds)
mae_res = mean_absolute_error(y_test, preds)
mse_res = mean_squared_error(y_test, preds)

In [453]:
r2_score_res

0.9553263473836324

In [454]:
mean_error_res

-0.0008637626821678938

In [455]:
mae_res

0.057217996390011225

In [456]:
mse_res

0.0054754808340560695