# Tour De France Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/camminady/LeTourDataSet/master/data/TDF_Riders_History.csv")

In [3]:
df = df.drop(columns = ['Unnamed: 0'])

## Correct Known Issues

See issues on data set repository

In [4]:
#remove years that did not use time for GC win
df = df[~df['Year'].isin([1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912])]

In [5]:
df.loc[(df['Year'] == 1903) & (df['Rider'] == 'PIERRE DESVAGES'), 'GapSeconds'] = 212056
df.loc[(df['Year'] == 1903) & (df['Rider'] == 'ARSÈNE MILLOCHEAU'), 'GapSeconds'] = 219450

In [6]:
df.loc[(df['Year'] == 1904) & (df['Rider'] == 'ANTOINE DEFLOTRIERE'), 'GapSeconds'] = 361722

In [7]:
df.loc[(df['Year'] == 1981) & (df['Rider'] == 'BERNARD HINAULT'), 'Times'] = '''96h 19' 38"'''
df.loc[(df['Year'] == 1981) & (df['Rider'] == 'BERNARD HINAULT'), 'TotalSeconds'] = 346778

In [None]:
df.loc[(df['Year'] == 1987) & (df['Rider'] == 'ROBERT FOREST'), 'Times'] = ''''''
df.loc[(df['Year'] == 1987) & (df['Rider'] == 'ROBERT FOREST'), 'TotalSeconds'] = 421366

In [8]:
df.loc[(df['Year'] == 1997) & (df['Rider'] == 'MARCO PANTANI'), 'Times'] = '''100h 44' 3"'''
df.loc[(df['Year'] == 1997) & (df['Rider'] == 'MARCO PANTANI'), 'TotalSeconds'] = 360839

In [9]:
# save for re-use
df.to_csv('../data/tdf_cleaned.csv', index = False)

## Plot Winning Time Gaps

In [59]:
gaps_df = df.loc[df.groupby('Year')['TotalSeconds'].idxmin() + 1]

In [60]:
gaps_df[gaps_df['Year'] == 1987]

Unnamed: 0,Rank,Rider,Rider No.,Team,Times,Gap,B,P,Year,Distance (km),Number of stages,TotalSeconds,GapSeconds,ResultType
4567,39,DAG OTTO LAURITZEN,196,SEVEN ELEVEN-HOONVED,117h 03' 34'',+ 01h 35' 52'',,,1987,4231,26,421414,5752,time


In [70]:
df[(df['Year'] == 1987) & (df['Rank'] == 38)]

Unnamed: 0,Rank,Rider,Rider No.,Team,Times,Gap,B,P,Year,Distance (km),Number of stages,TotalSeconds,GapSeconds,ResultType
4566,38,ROBERT FOREST,126,FAGOR,115h 27' 41'',-,,,1987,4231,26,415661,0,time


In [65]:
df[df['Year'] == 1987]['TotalSeconds'].min()

#415662

415661

In [43]:
#save for re-use
gaps_df.to_csv('../data/winning_time_gaps.csv', index = False)

In [44]:
gaps_df['3 Year Rolling Average'] = gaps_df['GapSeconds'].rolling(window = 3).mean()
gaps_df['5 Year Rolling Average'] = gaps_df['GapSeconds'].rolling(window = 5).mean()

Add null values for years the tour wasn't held

In [45]:
ww2_gap = pd.DataFrame(data = {'Year': [i for i in range(1940, 1947)], 
                               'GapSeconds':[None, None, None, None, None, None, None]})

In [46]:
no_gc_gap = pd.DataFrame(data = {'Year': [i for i in range(1905, 1913)], 
                                 'GapSeconds': [None, None, None, None, None, None, None, None]})


In [47]:
gaps_df = pd.concat([gaps_df, ww2_gap, no_gc_gap])

Melt for easy plotting

In [48]:
gaps_df = gaps_df.rename(columns = {'GapSeconds': 'Winning Time Gap'})
gaps_df = pd.melt(gaps_df, id_vars = ['Year'], 
                  value_vars = ['Winning Time Gap', '3 Year Rolling Average', '5 Year Rolling Average'])

In [49]:
gaps_df.columns = ['Year', 'Measure', 'Val']

In [50]:
ww2_gaps_rect = pd.DataFrame({'x1': [1940], 'x2': [1947], 'y1': [0], 'y2': [11000]})
no_gc_gaps_rect = pd.DataFrame({'x1': [1905], 'x2': [1913], 'y1': [0], 'y2': [11000]})

In [51]:
ww2_gap = alt.Chart(ww2_gaps_rect).mark_rect(
    fill = 'lightgray', 
    opacity = 0.5, 
    width = 7
).encode(
    x = alt.X('x1', scale = alt.Scale(domain = [1900, 2023])), 
    x2 = 'x2', 
    y = alt.Y('y1', scale = alt.Scale(domain = [0, 11000])), 
    y2 = 'y2'
)


no_gc_gaps_rect = alt.Chart(no_gc_gaps_rect).mark_rect(
    fill = 'lightgray', 
    opacity = 0.5, 
    width = 7
).encode(
    x = alt.X('x1', scale = alt.Scale(domain = [1900, 2023])), 
    x2 = 'x2', 
    y = alt.Y('y1', scale = alt.Scale(domain = [0, 11000])), 
    y2 = 'y2'
)

In [52]:
base = alt.Chart(gaps_df).mark_line(
    size = 2
).encode(
    x = alt.X('Year:Q', 
              axis = alt.Axis(title = 'Year', titleFontSize = 15, format = '4f',
                              tickCount = round(gaps_df['Year'].nunique() / 4)), 
                              scale = alt.Scale(domain = [1900, 2025])), 
    y = alt.Y('Val', axis = alt.Axis(title = 'Time Gap', titleFontSize = 15, format = '1f')), 
    color = alt.Color('Measure', legend = alt.Legend(title = 'Metric')), 
    tooltip = [alt.Tooltip('Year'), alt.Tooltip('Measure', title = 'Measure'), alt.Tooltip('Val', title = 'Value')]
)

In [53]:
tooltips = alt.Chart(gaps_df).mark_line(
    strokeWidth = 30, 
    opacity = 0.01
).encode(
    x = alt.X('Year:Q'),
    y = alt.Y('Val'), 
    color = alt.Color('Measure', legend = None),
    tooltip = [alt.Tooltip('Year'), alt.Tooltip('Measure', title = 'Measure'), alt.Tooltip('Val', title = 'Value')]
)

In [54]:
#to do - note for 1989
c = alt.layer(base, tooltips)

alt.layer(c, ww2_gap, no_gc_gaps_rect).properties(
    height = 400, 
    width = 800, 
    title = alt.Title('The Tour de France winning time gap has decreased over time', 
                      fontSize = 25, dx = 10)
).interactive()

## Average Time Gap From Winner

In [24]:
dfs = []
for year in df['Year'].unique():
    subset = df[df['Year'] == year]
    winning_time = subset['TotalSeconds'].min()
    subset['Time Gap'] = subset['TotalSeconds'] - winning_time
    
    dfs.append(subset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Time Gap'] = subset['TotalSeconds'] - winning_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Time Gap'] = subset['TotalSeconds'] - winning_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Time Gap'] = subset['TotalSeconds'] - winning_time
A value is trying to be set 

In [27]:
time_gaps = pd.concat(dfs)