# Tour De France Exploration

In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/camminady/LeTourDataSet/master/data/TDF_Riders_History.csv")

In [3]:
df = df.drop(columns = ['Unnamed: 0'])

## Correct Known Issues

See issues on data set repository

In [4]:
df.loc[(df['Year'] == 1997) & (df['Rider'] == 'MARCO PANTANI'), 'Times'] = '''100h 44' 3"'''
df.loc[(df['Year'] == 1997) & (df['Rider'] == 'MARCO PANTANI'), 'Total Seconds'] = 360839

In [5]:
df.loc[(df['Year'] == 1981) & (df['Rider'] == 'BERNARD HINAULT'), 'Times'] = '''96h 19' 38"'''
df.loc[(df['Year'] == 1981) & (df['Rider'] == 'BERNARD HINAULT'), 'Total Seconds'] = 346778

In [6]:
df.loc[(df['Year'] == 1997) & (df['Rider'] == 'MARCO PANTANI')]

Unnamed: 0,Rank,Rider,Rider No.,Team,Times,Gap,B,P,Year,Distance (km),Number of stages,TotalSeconds,GapSeconds,ResultType,Total Seconds
5885,3,MARCO PANTANI,181,MERCATONE-UNO,"100h 44' 3""",+ 00h 14' 03'',,,1997,3950,22,362678,843,time,360839.0


## Winning Time

In [7]:
gaps_df = df[df['Rank'] == 2][['Year', 'GapSeconds']]

In [8]:
gaps_df['3 Year Rolling Average'] = gaps_df['GapSeconds'].rolling(window = 3).mean()
gaps_df['5 Year Rolling Average'] = gaps_df['GapSeconds'].rolling(window = 5).mean()

Add null values for years the tour wasn't held

In [9]:
ww2_gap = pd.DataFrame(data = {'Year': [i for i in range(1940, 1947)], 
                               'GapSeconds':[None, None, None, None, None, None, None]})

In [10]:
gaps_df = pd.concat([gaps_df, ww2_gap])

Melt for easy plotting

In [12]:
gaps_df = gaps_df.rename(columns = {'GapSeconds': 'Winning Time Gap'})
gaps_df = pd.melt(gaps_df, id_vars = ['Year'], 
                  value_vars = ['Winning Time Gap', '3 Year Rolling Average', '5 Year Rolling Average'])

In [13]:
gaps_df.columns = ['Year', 'Measure', 'Val']

In [14]:
gaps_rect = pd.DataFrame({'x1': [1940], 'x2': [1947], 'y1': [0], 'y2': [11000]})

In [15]:
ww2_gap = alt.Chart(gaps_rect).mark_rect(fill = 'lightgray', opacity = 0.5, width = 7).encode(
                x = alt.X('x1', scale = alt.Scale(domain = [1900, 2023])), 
                x2 = 'x2', 
                y = alt.Y('y1', scale = alt.Scale(domain = [0, 11000])), 
                y2 = 'y2'
            )

In [58]:
base = alt.Chart(gaps_df).mark_line(size = 2).encode(
            x = alt.X('Year:Q', 
                      axis = alt.Axis(title = 'Year', titleFontSize = 15, format = '4f',
                                      tickCount = round(gaps_df['Year'].nunique() / 4)), 
                     scale = alt.Scale(domain = [1900, 2025])), 
            y = alt.Y('Val', axis = alt.Axis(title = 'Time Gap', titleFontSize = 15, format = '1f')), 
            color = alt.Color('Measure', legend = alt.Legend(title = 'Metric')), 
            tooltip = [alt.Tooltip('Val'), alt.Tooltip('Measure')]
        )

In [62]:
tooltips = base.mark_line(strokeWidth=30, opacity=0.01)
c = alt.layer(tooltips, base)

In [63]:
alt.layer(c, ww2_gap).properties(
            height = 500, 
            width = 1000, 
            title = alt.Title('The Tour de France winning time gap has decreased over time', 
                              fontSize = 25, 
                              dx = 10)
        ).interactive()