# Fourth task: Prediction

We begin, as always, by importing all the necessary libraries and loading the dataset.

In [27]:
import warnings
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import unidecode

from collections import defaultdict

from scipy.stats import pearsonr, mode
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN, spectral_clustering, OPTICS
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterSampler
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

from pyclustering.cluster.optics import optics
from pyclustering.cluster.cure import cure
from pyclustering.utils import timedcall

import os

# Read the cyclist and race tables from the local ./dataset folder
# (presumably the transformed outputs of the earlier tasks — confirm).
cyclists, races = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/cyclists_trasformed.csv',
        './dataset/races_trasformed.csv',
    )
)

Following the specification, races from 2022 onward are held out as the test set, while all earlier races form the training/validation set. We define the learning task as a binary classification task based on top-20 placement: one class indicates a top-20 finish, the other indicates any lower placement.

In [28]:
# Convert 'date' to a datetime dtype in place, then print the frame's
# column / non-null / dtype summary to confirm the conversion.
races['date'] = races['date'].pipe(pd.to_datetime)
races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   _url               589865 non-null  object        
 1   name               589865 non-null  object        
 2   points             589865 non-null  int64         
 3   length             589865 non-null  float64       
 4   climb_total        589865 non-null  int64         
 5   profile            589865 non-null  int64         
 6   startlist_quality  589865 non-null  int64         
 7   date               589865 non-null  datetime64[ns]
 8   position           589865 non-null  int64         
 9   cyclist            589865 non-null  object        
 10  cyclist_age        589865 non-null  int64         
 11  is_tarmac          589865 non-null  bool          
 12  cyclist_team       589865 non-null  object        
 13  delta              589865 non-null  float64 

In [None]:
# Build the training/validation and test splits and attach the binary target.
#
# Rows dated before TEST_START go to `train_val_set`; rows dated on or after
# it go to `test_set`. The target column `top_20` is 1 when `position` is at
# most TOP_K and 0 otherwise.

# Ensure the 'date' column is in datetime format (a no-op if already converted)
races['date'] = pd.to_datetime(races['date'])

# Split boundary and placement threshold, named so they are tuned in one place
TEST_START = pd.Timestamp('2022-01-01')
TOP_K = 20

# Columns removed before training: the raw outcome and identifier columns
columns_to_drop = ['position', '_url', 'cyclist', 'cyclist_team']


def _label_and_prune(frame, drop_cols, top_k=TOP_K):
    """Return a new frame with the `top_20` target added and `drop_cols` removed.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a `position` column and every column named in `drop_cols`.
    drop_cols : list of str
        Columns to remove once the label has been computed.
    top_k : int, default TOP_K
        Largest `position` value that is still labelled 1.

    Returns
    -------
    pd.DataFrame
        A new frame; `frame` itself is not modified.
    """
    # NOTE(review): if `position` is 0-based, `<= top_k` labels top_k + 1 riders
    # per race as positives — confirm the indexing of `position`.
    labelled = frame.assign(top_20=(frame['position'] <= top_k).astype(int))
    return labelled.drop(columns=drop_cols)


# 'date' is dropped together with the other columns so the split boundary
# itself cannot be used as a feature.
# NOTE(review): 'month' and 'season' (presumably date-derived) are still kept,
# and 'delta' looks like a post-race quantity that may leak the outcome —
# confirm whether these should remain as features.
non_feature_columns = columns_to_drop + ['date']

# Explicit masks on both sides: a row with a missing date matches neither
before_cutoff = races['date'] < TEST_START
from_cutoff = races['date'] >= TEST_START

train_val_set = _label_and_prune(races[before_cutoff], non_feature_columns)
test_set = _label_and_prune(races[from_cutoff], non_feature_columns)

# Guard against rows silently disappearing from both splits
assert len(train_val_set) + len(test_set) == len(races), (
    "some rows have a missing 'date' and were assigned to neither split"
)

# Print the resulting datasets for verification
print("Training/Validation set:")
print(train_val_set.head())

Training/Validation set:
             name  points    length  climb_total  profile  startlist_quality  \
0  Tour de France     100  162000.0         1101        1               1241   
1  Tour de France     100  162000.0         1101        1               1241   
2  Tour de France     100  162000.0         1101        1               1241   
3  Tour de France     100  162000.0         1101        1               1241   
4  Tour de France     100  162000.0         1101        1               1241   

   cyclist_age  is_tarmac  delta  month  season  race_intensity  top_20  
0           22       True    0.0      7  summer        1.503613       1  
1           27       True    0.0      7  summer        1.503613       1  
2           24       True    0.0      7  summer        1.503613       1  
3           30       True    0.0      7  summer        1.503613       1  
4           27       True    0.0      7  summer        1.503613       1  


In [30]:
# Show a labelled preview of the first rows of the test set.
print("\nTest set:", test_set.head(), sep="\n")


Test set:
               name  points    length  climb_total  profile  \
545  Tour de France     100  192900.0         3743        3   
546  Tour de France     100  192900.0         3743        3   
547  Tour de France     100  192900.0         3743        3   
548  Tour de France     100  192900.0         3743        3   
549  Tour de France     100  192900.0         3743        3   

     startlist_quality  cyclist_age  is_tarmac  delta  month  season  \
545               1551           30       True    0.0      7  summer   
546               1551           35       True   22.0      7  summer   
547               1551           30       True   26.0      7  summer   
548               1551           32       True   40.0      7  summer   
549               1551           24       True   49.0      7  summer   

     race_intensity  top_20  
545        2.128635       1  
546        2.128635       1  
547        2.128635       1  
548        2.128635       1  
549        2.128635       1