In [1]:
import  pandas

In [2]:
# Load some cycling data from Strava as an example
# NOTE(review): the start_time values shown below look as if day and month may have been swapped
# while parsing: rows 0-4 show 2021-02-01, 2021-04-01, 2021-05-01, 2021-07-01 and 2021-11-01,
# whereas the final rows are all in late December. Check the raw date format in the CSV and,
# if it is day-first, pass dayfirst=True (or an explicit date format) when reading it.
rides = pandas.read_csv('Strava2021.csv', parse_dates=['start_time'])
rides

Unnamed: 0,start_time,name,distance,moving_time,elevation,average_watts,max_speed
0,2021-02-01 19:35:00,Pedalheads 100 special,102.0,3.18,441.0,188.0,54.3
1,2021-04-01 19:10:00,Return to Hack's Samford Gentleman's ride,46.9,1.66,733.6,193.0,66.9
2,2021-05-01 19:30:00,Morning Ride,34.5,1.09,136.0,184.0,51.8
3,2021-07-01 19:21:00,Morning Ride,33.8,0.98,121.0,226.0,55.0
4,2021-11-01 19:11:00,Morning Ride,49.3,1.76,718.0,184.0,68.7
...,...,...,...,...,...,...,...
205,2021-12-26 22:30:00,Zwift - Dutch Diesel Cycling Starter Ride (C),60.3,1.52,89.0,177.0,46.1
206,2021-12-27 01:30:00,Zwift - OTR Tempo Group Ride,30.1,0.88,341.0,203.0,66.1
207,2021-12-27 19:00:00,Zwift - Rapha Festive 500 Group Ride Led By Et...,60.1,1.61,323.0,185.0,77.0
208,2021-12-28 19:00:00,Morning Ride,86.9,2.96,472.0,158.0,58.3


In [3]:
# Derive a new avg_speed column for every ride: its distance divided by its moving_time
rides['avg_speed'] = rides['distance'] / rides['moving_time']

In [4]:
# Display the data frame again to check that it now includes the new avg_speed column
rides

Unnamed: 0,start_time,name,distance,moving_time,elevation,average_watts,max_speed,avg_speed
0,2021-02-01 19:35:00,Pedalheads 100 special,102.0,3.18,441.0,188.0,54.3,32.075472
1,2021-04-01 19:10:00,Return to Hack's Samford Gentleman's ride,46.9,1.66,733.6,193.0,66.9,28.253012
2,2021-05-01 19:30:00,Morning Ride,34.5,1.09,136.0,184.0,51.8,31.651376
3,2021-07-01 19:21:00,Morning Ride,33.8,0.98,121.0,226.0,55.0,34.489796
4,2021-11-01 19:11:00,Morning Ride,49.3,1.76,718.0,184.0,68.7,28.011364
...,...,...,...,...,...,...,...,...
205,2021-12-26 22:30:00,Zwift - Dutch Diesel Cycling Starter Ride (C),60.3,1.52,89.0,177.0,46.1,39.671053
206,2021-12-27 01:30:00,Zwift - OTR Tempo Group Ride,30.1,0.88,341.0,203.0,66.1,34.204545
207,2021-12-27 19:00:00,Zwift - Rapha Festive 500 Group Ride Led By Et...,60.1,1.61,323.0,185.0,77.0,37.329193
208,2021-12-28 19:00:00,Morning Ride,86.9,2.96,472.0,158.0,58.3,29.358108


In [5]:
# We want to study the ride data in relation to the weather on that day, so we load a separate
# data set from the Bureau of Meteorology (BOM).
# Note that this data stores each date as separate year, month and day columns.
bom = pandas.read_csv('BrisbaneDailyWeather.csv')
bom

Unnamed: 0,year,month,day,MinTemp,MaxTemp,Rainfall
0,2022,2,13,18.6,29.3,7.2
1,2022,2,12,20.4,28.9,0.0
2,2022,2,11,19.1,31.3,0.0
3,2022,2,10,19.4,31.2,0.0
4,2022,2,9,18.6,30.0,0.0
...,...,...,...,...,...,...
8440,1999,1,5,,,
8441,1999,1,4,,,
8442,1999,1,3,,,
8443,1999,1,2,,,


In [6]:
# Before we can join with the cycling data, the BOM data needs a single date value per row.
# Here we assemble one from the year, month and day columns and use it as the index
# (it could equally be stored as an ordinary column).
bom.index = pandas.to_datetime(bom.loc[:, ['year', 'month', 'day']])
bom

Unnamed: 0,year,month,day,MinTemp,MaxTemp,Rainfall
2022-02-13,2022,2,13,18.6,29.3,7.2
2022-02-12,2022,2,12,20.4,28.9,0.0
2022-02-11,2022,2,11,19.1,31.3,0.0
2022-02-10,2022,2,10,19.4,31.2,0.0
2022-02-09,2022,2,9,18.6,30.0,0.0
...,...,...,...,...,...,...
1999-01-05,1999,1,5,,,
1999-01-04,1999,1,4,,,
1999-01-03,1999,1,3,,,
1999-01-02,1999,1,2,,,


In [7]:
# Select only the weather measurement columns we are interested in (all rows are kept)
bom.loc[:, ['MinTemp', 'MaxTemp', 'Rainfall']]

Unnamed: 0,MinTemp,MaxTemp,Rainfall
2022-02-13,18.6,29.3,7.2
2022-02-12,20.4,28.9,0.0
2022-02-11,19.1,31.3,0.0
2022-02-10,19.4,31.2,0.0
2022-02-09,18.6,30.0,0.0
...,...,...,...
1999-01-05,,,
1999-01-04,,,
1999-01-03,,,
1999-01-02,,,


In [8]:
# Replace the bom data frame with a version that holds only the columns we are interested in.
# The date index is carried over automatically.
bom = bom.loc[:, ['MinTemp', 'MaxTemp', 'Rainfall']]
bom

Unnamed: 0,MinTemp,MaxTemp,Rainfall
2022-02-13,18.6,29.3,7.2
2022-02-12,20.4,28.9,0.0
2022-02-11,19.1,31.3,0.0
2022-02-10,19.4,31.2,0.0
2022-02-09,18.6,30.0,0.0
...,...,...,...
1999-01-05,,,
1999-01-04,,,
1999-01-03,,,
1999-01-02,,,


In [9]:
# To link the ride data to the BOM data, we need a date column in the rides data frame.
# As a first attempt, take the date part of each start_time.
rides['start_date'] = rides['start_time'].dt.date
rides

Unnamed: 0,start_time,name,distance,moving_time,elevation,average_watts,max_speed,avg_speed,start_date
0,2021-02-01 19:35:00,Pedalheads 100 special,102.0,3.18,441.0,188.0,54.3,32.075472,2021-02-01
1,2021-04-01 19:10:00,Return to Hack's Samford Gentleman's ride,46.9,1.66,733.6,193.0,66.9,28.253012,2021-04-01
2,2021-05-01 19:30:00,Morning Ride,34.5,1.09,136.0,184.0,51.8,31.651376,2021-05-01
3,2021-07-01 19:21:00,Morning Ride,33.8,0.98,121.0,226.0,55.0,34.489796,2021-07-01
4,2021-11-01 19:11:00,Morning Ride,49.3,1.76,718.0,184.0,68.7,28.011364,2021-11-01
...,...,...,...,...,...,...,...,...,...
205,2021-12-26 22:30:00,Zwift - Dutch Diesel Cycling Starter Ride (C),60.3,1.52,89.0,177.0,46.1,39.671053,2021-12-26
206,2021-12-27 01:30:00,Zwift - OTR Tempo Group Ride,30.1,0.88,341.0,203.0,66.1,34.204545,2021-12-27
207,2021-12-27 19:00:00,Zwift - Rapha Festive 500 Group Ride Led By Et...,60.1,1.61,323.0,185.0,77.0,37.329193,2021-12-27
208,2021-12-28 19:00:00,Morning Ride,86.9,2.96,472.0,158.0,58.3,29.358108,2021-12-28


In [10]:
# This seems correct, but if we look more carefully we see that the start_date column has
# dtype object: its values are plain Python date objects, not pandas datetime64 values
rides['start_date']

0      2021-02-01
1      2021-04-01
2      2021-05-01
3      2021-07-01
4      2021-11-01
          ...    
205    2021-12-26
206    2021-12-27
207    2021-12-27
208    2021-12-28
209    2021-12-30
Name: start_date, Length: 210, dtype: object

In [11]:
# Compare this to the bom data, where the values in the index are of type datetime64
bom.index

DatetimeIndex(['2022-02-13', '2022-02-12', '2022-02-11', '2022-02-10',
               '2022-02-09', '2022-02-08', '2022-02-07', '2022-02-06',
               '2022-02-05', '2022-02-04',
               ...
               '1999-01-10', '1999-01-09', '1999-01-08', '1999-01-07',
               '1999-01-06', '1999-01-05', '1999-01-04', '1999-01-03',
               '1999-01-02', '1999-01-01'],
              dtype='datetime64[ns]', length=8445, freq=None)

In [12]:
# After a quick Google search, we discover that the normalize() method resets the time of day
# to midnight, leaving just the date while keeping the datetime64 data type
# NOTE(review): rides named "Morning Ride" start at about 19:00 in the data shown, so start_time
# may be in UTC rather than Brisbane local time. If so, convert to local time before
# normalising, otherwise rides are matched with the previous day's weather - confirm.
rides['start_date'] = rides['start_time'].dt.normalize()
rides

Unnamed: 0,start_time,name,distance,moving_time,elevation,average_watts,max_speed,avg_speed,start_date
0,2021-02-01 19:35:00,Pedalheads 100 special,102.0,3.18,441.0,188.0,54.3,32.075472,2021-02-01
1,2021-04-01 19:10:00,Return to Hack's Samford Gentleman's ride,46.9,1.66,733.6,193.0,66.9,28.253012,2021-04-01
2,2021-05-01 19:30:00,Morning Ride,34.5,1.09,136.0,184.0,51.8,31.651376,2021-05-01
3,2021-07-01 19:21:00,Morning Ride,33.8,0.98,121.0,226.0,55.0,34.489796,2021-07-01
4,2021-11-01 19:11:00,Morning Ride,49.3,1.76,718.0,184.0,68.7,28.011364,2021-11-01
...,...,...,...,...,...,...,...,...,...
205,2021-12-26 22:30:00,Zwift - Dutch Diesel Cycling Starter Ride (C),60.3,1.52,89.0,177.0,46.1,39.671053,2021-12-26
206,2021-12-27 01:30:00,Zwift - OTR Tempo Group Ride,30.1,0.88,341.0,203.0,66.1,34.204545,2021-12-27
207,2021-12-27 19:00:00,Zwift - Rapha Festive 500 Group Ride Led By Et...,60.1,1.61,323.0,185.0,77.0,37.329193,2021-12-27
208,2021-12-28 19:00:00,Morning Ride,86.9,2.96,472.0,158.0,58.3,29.358108,2021-12-28


In [13]:
# The start_date values are now of type datetime64, the same type as the index of the BOM data,
# so the two can be matched against each other
rides['start_date']

0     2021-02-01
1     2021-04-01
2     2021-05-01
3     2021-07-01
4     2021-11-01
         ...    
205   2021-12-26
206   2021-12-27
207   2021-12-27
208   2021-12-28
209   2021-12-30
Name: start_date, Length: 210, dtype: datetime64[ns]

In [14]:
# Finally we are able to join each row of the rides data with the corresponding row of the BOM data,
# i.e. the row whose index value equals the ride's start_date.
# join() performs a left join by default (spelled out here): every ride is kept.
rides.join(bom, on='start_date', how='left')

Unnamed: 0,start_time,name,distance,moving_time,elevation,average_watts,max_speed,avg_speed,start_date,MinTemp,MaxTemp,Rainfall
0,2021-02-01 19:35:00,Pedalheads 100 special,102.0,3.18,441.0,188.0,54.3,32.075472,2021-02-01,22.5,29.7,1.4
1,2021-04-01 19:10:00,Return to Hack's Samford Gentleman's ride,46.9,1.66,733.6,193.0,66.9,28.253012,2021-04-01,18.0,26.2,0.0
2,2021-05-01 19:30:00,Morning Ride,34.5,1.09,136.0,184.0,51.8,31.651376,2021-05-01,16.7,22.3,6.2
3,2021-07-01 19:21:00,Morning Ride,33.8,0.98,121.0,226.0,55.0,34.489796,2021-07-01,11.9,20.5,7.0
4,2021-11-01 19:11:00,Morning Ride,49.3,1.76,718.0,184.0,68.7,28.011364,2021-11-01,16.0,24.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
205,2021-12-26 22:30:00,Zwift - Dutch Diesel Cycling Starter Ride (C),60.3,1.52,89.0,177.0,46.1,39.671053,2021-12-26,20.8,28.4,7.6
206,2021-12-27 01:30:00,Zwift - OTR Tempo Group Ride,30.1,0.88,341.0,203.0,66.1,34.204545,2021-12-27,21.3,28.9,0.0
207,2021-12-27 19:00:00,Zwift - Rapha Festive 500 Group Ride Led By Et...,60.1,1.61,323.0,185.0,77.0,37.329193,2021-12-27,21.3,28.9,0.0
208,2021-12-28 19:00:00,Morning Ride,86.9,2.96,472.0,158.0,58.3,29.358108,2021-12-28,19.6,25.7,14.6


In [15]:
# If, instead of matching with the index of the bom, we wish to match on some other column, we can use
# the merge method to join an arbitrary column of the rides data with an arbitrary column of the bom data.
# Here we match the avg_speed of the ride with the MaxTemp of a day (which doesn't necessarily make a
# great deal of sense).
# Notice that rows from each data set may be included more than once in the merged data set, since it
# combines every row of the first data frame with every row of the second data frame that has a matching
# column value. merge() performs an inner join by default (spelled out here), so rows without a match
# are dropped.
rides.merge(bom, how='inner', left_on='avg_speed', right_on='MaxTemp')

Unnamed: 0,start_time,name,distance,moving_time,elevation,average_watts,max_speed,avg_speed,start_date,MinTemp,MaxTemp,Rainfall
0,2021-01-19 19:29:00,Morning Ride,30.6,1.00,139.0,176.0,48.9,30.6,2021-01-19,18.3,30.6,0.0
1,2021-01-19 19:29:00,Morning Ride,30.6,1.00,139.0,176.0,48.9,30.6,2021-01-19,20.8,30.6,0.0
2,2021-01-19 19:29:00,Morning Ride,30.6,1.00,139.0,176.0,48.9,30.6,2021-01-19,23.0,30.6,0.4
3,2021-01-19 19:29:00,Morning Ride,30.6,1.00,139.0,176.0,48.9,30.6,2021-01-19,19.4,30.6,0.0
4,2021-01-19 19:29:00,Morning Ride,30.6,1.00,139.0,176.0,48.9,30.6,2021-01-19,24.1,30.6,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
587,2021-02-10 19:33:00,Morning Ride,61.2,1.80,207.0,205.0,50.4,34.0,2021-02-10,21.0,34.0,0.0
588,2021-02-10 19:33:00,Morning Ride,61.2,1.80,207.0,205.0,50.4,34.0,2021-02-10,24.0,34.0,0.0
589,2021-02-10 19:33:00,Morning Ride,61.2,1.80,207.0,205.0,50.4,34.0,2021-02-10,25.0,34.0,0.0
590,2021-02-10 19:33:00,Morning Ride,61.2,1.80,207.0,205.0,50.4,34.0,2021-02-10,19.0,34.0,0.0


In [16]:
# Try creating some joining examples of your own ...