In [1]:
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

from matplotlib.colors import ListedColormap
from pandas.plotting import scatter_matrix

from sklearn.compose import ColumnTransformer
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Default figure size (width, height) applied to every plot in this notebook.
plt.rc('figure', figsize=(16, 12))

This data came from [Kaggle](https://www.kaggle.com/datasets/pablomonleon/tour-de-france-historic-stages-data), but the times in the `stage_data.csv` file were all broken. I've gone back to the [original R package](https://github.com/alastairrushworth/tdf) and fixed things up.

In [2]:
# Per-rider stage results. A 'DNF' entry in the rank column is parsed as
# missing instead of being kept as text.
riderDF = pd.read_csv(
    'https://m2pi.syzygy.ca/data/tdf/stage_data_timed.csv', na_values={'rank': 'DNF'}
)

# Overall winners, with start_date parsed as a datetime column.
winnerDF = pd.read_csv(
    'https://m2pi.syzygy.ca/data/tdf/tdf_winners.csv', parse_dates=['start_date']
)

# Stage details. Header names are stripped of surrounding whitespace and
# lower-cased so that every column is addressed the same way.
stageDF = pd.read_csv(
    'https://m2pi.syzygy.ca/data/tdf/tdf_stages.csv', parse_dates=['Date']
).rename(columns=lambda name: name.strip().lower())

## What do we have?

Poke around with `.info`, `.describe`, `.columns`, `.head` to see what the data looks like. 

In [3]:
# Peek at the final five rows of the stage table.
stageDF.tail()

Unnamed: 0,stage,date,distance,origin,destination,type,winner,winner_country
2231,2,1903-07-05,374.0,Lyon,Marseille,Stage with mountain(s),Hippolyte Aucouturier,FRA
2232,3,1903-07-08,423.0,Marseille,Toulouse,Plain stage,Hippolyte Aucouturier,FRA
2233,4,1903-07-12,268.0,Toulouse,Bordeaux,Plain stage,Charles Laeser,SUI
2234,5,1903-07-13,425.0,Bordeaux,Nantes,Plain stage,Maurice Garin,FRA
2235,6,1903-07-18,471.0,Nantes,Paris,Plain stage,Maurice Garin,FRA


In [4]:
# Peek at the first five rows of the per-rider results.
riderDF.head(5)

Unnamed: 0,year,stage,rank,rider,age,team,points,elapsed
0,1903,stage-1,1,Garin Maurice,32.0,,100.0,63913.0
1,1903,stage-1,2,Pagie Émile,32.0,,70.0,63968.0
2,1903,stage-1,3,Georget Léon,23.0,,50.0,66012.0
3,1903,stage-1,4,Augereau Fernand,20.0,,40.0,67681.0
4,1903,stage-1,5,Fischer Jean,36.0,,32.0,67806.0


In [5]:
# Peek at the first three rows of the winners table.
winnerDF.head(3)

Unnamed: 0,edition,start_date,winner_name,winner_team,distance,time_overall,time_margin,stage_wins,stages_led,height,weight,age,born,died,full_name,nickname,birth_town,birth_country,nationality
0,1,1903-07-01,Maurice Garin,La Française,2428.0,94.553889,2.989167,3,6,1.62,60.0,32,1871-03-03,1957-02-19,,The Little Chimney-sweep,Arvier,Italy,France
1,2,1904-07-02,Henri Cornet,Conte,2428.0,96.098611,2.270556,1,3,,,19,1884-08-04,1941-03-18,,Le rigolo (The joker),Desvres,France,France
2,3,1905-07-09,Louis Trousselier,Peugeot–Wolber,2994.0,,,5,10,,,24,1881-06-29,1939-04-24,,Levaloy / Trou-trou,Paris,France,France


## Some Basic Questions

 * How has the distance of the race changed over the years?
 * How has the distance of the stages changed over the years?
 * How have the riders changed over the years (e.g. age)?
 * How have the winning margins changed over the years?

### Cleaning Up and Building Some Extra Features

In [15]:
# Average stage distance for each value of the `type` column.
stageDF.groupby('type')['distance'].mean()

type
Flat         221.916007
Hilly        196.328947
Mountain     215.687275
TimeTrial     57.679918
Name: distance, dtype: float64

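There's a bit too much variation in the raw `type` labels (e.g. `Plain stage`, `Flat stage` and `Flat Stage` all describe the same kind of stage), so we could try classifying these stages ourselves, e.g.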

In [10]:
# Coarse stage category -> the stage `type` labels grouped under it.
# Labels are matched verbatim, so capitalisation variants are listed separately.
stage_types = dict(
    Flat=[
        'Plain stage',
        'Flat stage',
        'Flat Stage',
        'Transition stage',
        'Intermediate stage',
        'Flat cobblestone stage',
        'Plain stage with cobblestones',
        'Half Stage',
    ],
    Mountain=[
        'High mountain stage',
        'Mountain stage',
        'Mountain Stage',
        'Medium mountain stage',
        'Stage with mountain',
        'Stage with mountain(s)',
    ],
    Hilly=['Hilly stage'],
    TimeTrial=[
        'Individual time trial',
        'Mountain time trial',
        'Team time trial',
    ],
)

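Comparing the `DataFrames`, there are some opportunities to join columns to provide richer information. There are also some manipulations we could try to build other features to explore. Try merging `stageDF` and `riderDF` on `year` and `stage` and add a column for speed. Note that `stageDF` has a `date` but no `year` column, and that the two `stage` columns are formatted differently (e.g. `stage-1` in `riderDF` versus `2` in `stageDF`), so both keys need some preparation before the merge. Check that the resulting speeds are realistic.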

## Regression

* Use regression to try to predict the elapsed time for riders


## Classification
  * Try to classify the stages by type