# Joggling Analysis
Scott Jenkins

- Create and test views / visualisations for joggling archive app
- Other analysis / questions


In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from itertools import combinations
pd.options.display.max_rows = None

from pyvis.network import Network

In [2]:
data = pd.read_csv('results.csv')
data.head()

Unnamed: 0,Date,Year,Distance,Standard Distance?,Event / Venue,Joggler,Gender,Nationality,Finish Time,Drops,Notes / Result Links,Unnamed: 11,Unnamed: 12
0,28/01/2024,2024,3b 6.7km,No,"The Snowy Soles Snowshoe Race, Ontario",Michal Kapral,M,CAN,00:49:24,?,https://www.instagram.com/p/C2pmYbarGbq/,,
1,27/01/2024,2024,3b 5km,Yes,Holyrood Parkrun,James McDiarmid,M,GBR,00:21:30,0,https://www.parkrun.org.uk/holyrood/results/73/,,
2,27/01/2024,2024,3b 5km,Yes,Młynówka Królewska Parkrun,Kacper Suchora,M,POL,00:23:45,?,https://www.parkrun.pl/mlynowkakrolewska/resul...,,
3,27/01/2024,2024,3b 5km,Yes,Holyrood Parkrun,Scott Jenkins,M,GBR,00:25:21,3,https://www.parkrun.org.uk/holyrood/results/73/,,
4,27/01/2024,2024,3b 5km,Yes,College of Micronesia-FSM Fun Run,Dana Lee Ling,M,FSM,00:38:11,?,https://www.strava.com/athletes/16093155?oq=dan,,


In [3]:
print(f'There are {len(data)} joggling results from {len(data['Joggler'].unique())} jogglers.')

countries = data[data['Nationality']!='0']['Nationality'].unique()
print(f'These Jogglers represent {len(countries)} countries: {countries}')

There are 1029 joggling results from 259 jogglers.
These Jogglers represent 23 countries: ['CAN' 'GBR' 'POL' 'FSM' 'USA' 'AUS' 'SWE' 'TUR' 'DEU' 'IRL' 'ISR' 'EST'
 'CZE' 'ITA' 'JPN' 'TWN' 'BEL' 'ZAF' 'AUT' 'IND' 'NLD' 'MEX' 'NOR']


## Joggler Graph

In [4]:
# Get list of jogglers
jogglers = list(data['Joggler'].unique())

In [5]:
# Group the data by date and event and list the jogglers participating
joggled_together = pd.DataFrame(data.groupby(['Date','Event / Venue'])['Joggler'].apply(lambda x: list(np.unique(x))).reset_index())
joggled_together = joggled_together[~joggled_together['Event / Venue'].isin(['Time Trial','IJA, Virtual'])].reset_index(drop=True)

# Count the number of jogglers at each event, and remove any events where the joggler has joggled alone
joggled_together['joggler_count'] = joggled_together['Joggler'].apply(lambda x: len(x))
joggled_together = joggled_together[joggled_together['joggler_count']>1].reset_index(drop=True)

joggled_together.head()# .sort_values('joggler_count',ascending=False)

Unnamed: 0,Date,Event / Venue,Joggler,joggler_count
0,02/07/2022,Endure24 Leeds,"[Dominic Lodge, Jake Lodge, Jordan Evans, Ralp...",6
1,02/07/2022,York Parkrun,"[Dominic Lodge, Jake Lodge, Jordan Evans, Scot...",4
2,02/10/2022,London Marathon,"[Barak Hirschowitz, Jack Hirschowitz]",2
3,03/07/2022,Endure24 Leeds,"[Dominic Lodge, Jake Lodge, Jordan Evans, Ralp...",6
4,03/11/2019,New York City Marathon,"[Barak Hirschowitz, Jack Hirschowitz]",2


In [11]:
net= Network()

# Create a node for each joggler
net.add_nodes(jogglers)
net.num_nodes()

# initialise empty array to store # occurance each pair of jogglers have joggled together
together_count_array = np.zeros(shape=(len(jogglers),len(jogglers)))
together_count_array

# For each pair of jogglers in the joggled_together dataframe, count the number of occurances they have joggled together.
# Create an edge in the graph, with weight equal to this number of occurance.

# Loop over events
for i in range(len(joggled_together)):
    # Retrieve the list of jogglers at the event
    event_jogglers = joggled_together['Joggler'][i]
    # Get all pairwise combinations of these jogglers
    pairs = list(combinations(event_jogglers, 2))
    # Iterate through each pair of jogglers
    for j in range(len(pairs)):
        # Add 1 to the numpy row,col corresponding to each pair
        together_count_array[jogglers.index(pairs[j][0]),jogglers.index(pairs[j][1])] += 1

# together_count_array

# Add edges to graph

for i in range(together_count_array.shape[0]):
    for j in range(together_count_array.shape[1]):
        if together_count_array[i,j] >= 1:
            net.add_edge(jogglers[i], jogglers[j], weight=together_count_array[i,j])
        else:
            pass

In [18]:
net.show(name='test.html')

test.html


AttributeError: 'NoneType' object has no attribute 'render'

### All-Time Lists

In [14]:
def all_time_list(distance):
    fastest_times = data[data['Distance']==distance][['Joggler','Finish Time']].groupby(['Joggler']).min().reset_index()
    fastest_times = fastest_times.merge(data,how='left',left_on=['Joggler','Finish Time'],right_on=['Joggler','Finish Time'])
    fastest_times['Ranking'] = pd.to_numeric(fastest_times['Finish Time'].rank(method="min")).astype(int)
    fastest_times['Nationality'] = fastest_times['Nationality'].replace({'0':'Unknown'})
    fastest_times = fastest_times[['Ranking','Joggler','Gender','Nationality','Date','Event / Venue','Finish Time']].sort_values('Ranking').reset_index(drop=True)
    return fastest_times

all_time_list('5b Mile')

Unnamed: 0,Ranking,Joggler,Gender,Nationality,Date,Event / Venue,Finish Time
0,1,Matthew Feldman,M,USA,27/07/2012,Time Trial,00:06:33
1,2,Billy Gillen,M,USA,1989,Time Trial,00:07:41
2,3,Ben Schoenberg,M,Unknown,1996,"IJA, Rapid City",00:10:36
3,4,Barry Goldmeier,M,USA,1996,"IJA, Rapid City",00:10:37
4,5,Scott Jenkins,M,GBR,16/04/2020,Time Trial,00:15:21


### Joggler Pivot

In [15]:
# Get the year of each result...

def record_year(sample_date):
    try:
        year = datetime.strptime(sample_date, '%d/%m/%Y').year     # If we have the full date, then extract the year
    except:
        year = int(sample_date)                                    # Else, we only have the year. Use this.
    return year


## Apply this function to all dates
data['Year'] = data.apply(lambda x: record_year(x['Date']),axis=1)

In [16]:
nationality_df = data[['Joggler','Nationality']].drop_duplicates().reset_index(drop=True).replace({'0':'Unknown'})
nationality_df.head()

Unnamed: 0,Joggler,Nationality
0,Michal Kapral,CAN
1,James McDiarmid,GBR
2,Kacper Suchora,POL
3,Scott Jenkins,GBR
4,Dana Lee Ling,FSM


In [17]:
nationality_df['Nationality'].value_counts()

Nationality
Unknown    115
USA         68
GBR         23
DEU         13
CAN         10
POL          4
ISR          4
TUR          3
IRL          3
JPN          2
BEL          1
MEX          1
NLD          1
IND          1
AUT          1
ZAF          1
CZE          1
TWN          1
ITA          1
EST          1
SWE          1
AUS          1
FSM          1
NOR          1
Name: count, dtype: int64

In [18]:
recency_df = data.groupby('Joggler')['Year'].max().reset_index().rename({'Year':'Year Most Recently Active'},axis=1)
recency_df.head()

Unnamed: 0,Joggler,Year Most Recently Active
0,Aaron Scott,2020
1,Adam Griffin,1996
2,Albert Lucas,1996
3,Andrea Goranson,1992
4,Andrew Head,1985


In [19]:
pivot_df = pd.pivot_table(data,
                          values='Finish Time',
                          index='Joggler', 
                          columns='Distance', 
                          aggfunc='min')
pivot_df = pivot_df[['3b Mile','3b 5km','3b 10km','3b Half Marathon','3b Marathon','5b Mile']].reset_index().fillna('-')
pivot_df.head()

Distance,Joggler,3b Mile,3b 5km,3b 10km,3b Half Marathon,3b Marathon,5b Mile
0,Aaron Scott,00:05:21,00:17:47,-,-,-,-
1,Adam Griffin,00:06:07,-,-,-,-,-
2,Albert Lucas,00:05:27,00:20:39,-,-,03:29:04,-
3,Andrea Goranson,00:10:56,-,-,-,-,-
4,Andrew Head,00:05:29,-,-,-,-,-


In [20]:
joggler_df = nationality_df.merge(recency_df,on='Joggler').merge(pivot_df,on='Joggler')
joggler_df.style.format({"Year Most Recently Active": lambda x : '{:.4f}'.format(x)})
joggler_df

Unnamed: 0,Joggler,Nationality,Year Most Recently Active,3b Mile,3b 5km,3b 10km,3b Half Marathon,3b Marathon,5b Mile
0,Michal Kapral,CAN,2024,-,00:17:05,00:36:27,01:20:40,02:50:12,-
1,James McDiarmid,GBR,2024,-,00:19:20,00:39:57,01:33:28,03:35:25,-
2,Kacper Suchora,POL,2024,-,00:22:10,00:52:35,-,-,-
3,Scott Jenkins,GBR,2024,00:06:58,00:19:31,00:40:28,01:27:48,04:23:12,00:15:21
4,Dana Lee Ling,FSM,2024,-,00:38:11,-,-,-,-
5,David Tate,USA,2024,-,00:25:37,00:55:24,-,-,-
6,Heather Thompson,USA,2024,-,00:35:22,-,-,-,-
7,Tim Butler,GBR,2024,-,-,00:47:01,-,04:07:30,-
8,Narelle Cabassi,AUS,2024,-,00:29:15,-,-,-,-
9,Jake Lodge,GBR,2023,00:05:55,00:20:11,-,01:45:28,-,-


In [21]:
nationality_df = data[['Joggler','Nationality']].drop_duplicates().reset_index(drop=True).replace({'0':'Unknown'})
country_df = nationality_df[nationality_df['Nationality']!='Unknown'].groupby('Nationality').count().reset_index().rename({'Joggler':'Joggler Count'},axis=1)
country_df

Unnamed: 0,Nationality,Joggler Count
0,AUS,1
1,AUT,1
2,BEL,1
3,CAN,10
4,CZE,1
5,DEU,13
6,EST,1
7,FSM,1
8,GBR,23
9,IND,1
