# Joggling Analysis
Scott Jenkins

- Create and test views / visualisations for joggling archive app
- Other analysis / questions


In [3]:
import pandas as pd
from datetime import datetime
pd.options.display.max_rows = None

In [4]:
# Load the results archive from the CSV next to the notebook and preview the first rows.
# Every later cell reads from this `data` frame.
data = pd.read_csv('results.csv')
data.head()

Unnamed: 0,Date,Year,Distance,Standard Distance?,Event / Venue,Joggler,Gender,Nationality,Finish Time,Drops,Notes / Result Links
0,22/10/2023,2023,3b Marathon,Yes,Abingdon Marathon,Scott Jenkins,M,GBR,04:23:12,1,https://www.strava.com/activities/10084587222
1,22/10/2023,2023,3b 10km,Yes,Mens 10km Edinburgh,James McDiarmid,M,GBR,00:39:57,0,https://www.myrunning.uk/mens-10k-edinburgh/re...
2,22/10/2023,2023,3b Marathon,Yes,Monster Mash Marathon,Tim Butler,M,GBR,04:39:22,?,https://www.strava.com/activities/10084951644
3,16/10/2023,2023,3b 10km,Yes,Bospho Run,Levent Denizci,M,TUR,00:52:10,?,https://www.instagram.com/reel/Cyc1DdXADLa/
4,15/10/2023,2023,3b Marathon,Yes,Toronto Waterfront Marathon,Michael-Lucien Bergeron,M,CAN,03:00:06,?,https://www.strava.com/activities/10044512008


In [5]:
# (number of distinct jogglers, number of result rows)
data['Joggler'].nunique(dropna=False), data.shape[0]

(251, 959)

### All-Time Lists

In [6]:
def all_time_list(distance, results=None):
    """Build the all-time ranking for one distance: one row per joggler, fastest first.

    Parameters
    ----------
    distance : str
        Value of the 'Distance' column to rank, e.g. '5b Mile'.
    results : pandas.DataFrame, optional
        Results table to rank. Defaults to the notebook-level `data` frame.
        Must contain the columns 'Distance', 'Joggler', 'Gender',
        'Nationality', 'Date', 'Event / Venue' and 'Finish Time'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Ranking', 'Joggler', 'Gender', 'Nationality', 'Date',
        'Event / Venue', 'Finish Time', sorted by ranking. Jogglers with the
        same best time share a ranking (method='min').

    Notes
    -----
    'Finish Time' values are compared as stored. NOTE(review): this orders
    correctly only if times are zero-padded 'HH:MM:SS' strings - confirm
    against the CSV.
    """
    if results is None:
        results = data  # fall back to the notebook-level results table

    # Restrict to the requested distance *before* picking personal bests, so a
    # joggler's identical time at another distance can never leak into the list.
    at_distance = results[results['Distance'] == distance].dropna(subset=['Finish Time'])

    # Fastest result per joggler. The stable sort keeps file order among equal
    # times, so a joggler who has equalled their best still gets exactly one row.
    personal_bests = (
        at_distance
        .sort_values('Finish Time', kind='stable')
        .drop_duplicates(subset='Joggler', keep='first')
    )

    personal_bests = personal_bests.assign(
        Ranking=personal_bests['Finish Time'].rank(method='min').astype(int),
        # '0' is the placeholder used in the data for a missing nationality.
        Nationality=personal_bests['Nationality'].replace({'0': 'Unknown'}),
    )

    leaderboard_columns = ['Ranking', 'Joggler', 'Gender', 'Nationality',
                           'Date', 'Event / Venue', 'Finish Time']
    # Secondary sort on name makes the order of tied jogglers deterministic.
    return (
        personal_bests[leaderboard_columns]
        .sort_values(['Ranking', 'Joggler'])
        .reset_index(drop=True)
    )

# Example: all-time list for the '5b Mile' distance
all_time_list('5b Mile')

Unnamed: 0,Ranking,Joggler,Gender,Nationality,Date,Event / Venue,Finish Time
0,1,Matthew Feldman,M,USA,27/07/2012,Time Trial,00:06:33
1,2,Billy Gillen,M,USA,1989,Time Trial,00:07:41
2,3,Ben Schoenberg,M,Unknown,1996,"IJA, Rapid City",00:10:36
3,4,Barry Goldmeier,M,USA,1996,"IJA, Rapid City",00:10:37


### Joggler Pivot

In [7]:
# Get the year of each result...

def record_year(sample_date):
    """Return the calendar year of a result date.

    Parameters
    ----------
    sample_date : str or int
        Either a full date in 'dd/mm/YYYY' form (e.g. '27/07/2012') or just a
        year (e.g. '1989' or 1989). Surrounding whitespace in strings is ignored.

    Returns
    -------
    int
        The year of the result.

    Raises
    ------
    ValueError
        If the value is neither a 'dd/mm/YYYY' date nor something `int()` accepts.
    """
    if isinstance(sample_date, str):
        sample_date = sample_date.strip()
    try:
        # Full date available: extract its year.
        return datetime.strptime(sample_date, '%d/%m/%Y').year
    except (ValueError, TypeError):
        # ValueError: a string that is not dd/mm/YYYY; TypeError: a non-string
        # such as an int year. Either way only the year was recorded, so use it.
        return int(sample_date)


## Apply this function to all dates
# Only the 'Date' column is needed, so map over that column instead of
# building every row with a row-wise DataFrame.apply.
data['Year'] = data['Date'].map(record_year)

In [9]:
# Distinct (Joggler, Nationality) pairs, with the '0' placeholder shown as 'Unknown'
nationality_df = (
    data.loc[:, ['Joggler', 'Nationality']]
    .drop_duplicates()
    .reset_index(drop=True)
    .replace({'0': 'Unknown'})
)
nationality_df.head()

Unnamed: 0,Joggler,Nationality
0,Scott Jenkins,GBR
1,James McDiarmid,GBR
2,Tim Butler,GBR
3,Levent Denizci,TUR
4,Michael-Lucien Bergeron,CAN


In [10]:
# Count the rows of nationality_df (distinct Joggler/Nationality pairs) per nationality
nationality_df['Nationality'].value_counts()

Nationality
Unknown    115
USA         63
GBR         22
DEU         13
CAN         10
POL          4
ISR          4
IRL          3
TUR          3
JPN          2
BEL          1
NLD          1
IND          1
AUT          1
ZAF          1
ITA          1
FSM          1
TWN          1
CZE          1
EST          1
SWE          1
NOR          1
Name: count, dtype: int64

In [11]:
# Latest year in which each joggler has a recorded result
recency_df = (
    data.groupby('Joggler')['Year']
    .max()
    .reset_index()
    .rename(columns={'Year': 'Year Most Recently Active'})
)
recency_df.head()

Unnamed: 0,Joggler,Year Most Recently Active
0,Aaron Scott,2020
1,Adam Griffin,1996
2,Albert Lucas,1996
3,Andrea Goranson,1992
4,Andrew Head,1985


In [12]:
# One row per joggler, one column per distance, holding the smallest 'Finish Time';
# distances with no result for a joggler are shown as '-'.
pivot_df = (
    data.pivot_table(index='Joggler',
                     columns='Distance',
                     values='Finish Time',
                     aggfunc='min')
    .loc[:, ['3b Mile', '3b 5km', '3b 10km', '3b Half Marathon', '3b Marathon', '5b Mile']]
    .reset_index()
    .fillna('-')
)
pivot_df.head()

Distance,Joggler,3b Mile,3b 5km,3b 10km,3b Half Marathon,3b Marathon,5b Mile
0,Aaron Scott,00:05:21,00:17:47,-,-,-,-
1,Adam Griffin,00:06:07,-,-,-,-,-
2,Albert Lucas,00:05:27,00:20:39,-,-,03:29:04,-
3,Andrea Goranson,00:10:56,-,-,-,-,-
4,Andrew Head,00:05:29,-,-,-,-,-


In [13]:
# Join nationality, most recent active year and per-distance best times on 'Joggler'.
# Both merges use the default inner join, so only jogglers present in all three
# frames are kept. 'Year Most Recently Active' is shown as stored; no Styler
# formatting is applied.
joggler_df = nationality_df.merge(recency_df, on='Joggler').merge(pivot_df, on='Joggler')
joggler_df

Unnamed: 0,Joggler,Nationality,Year Most Recently Active,3b Mile,3b 5km,3b 10km,3b Half Marathon,3b Marathon,5b Mile
0,Scott Jenkins,GBR,2023,-,00:19:31,00:40:28,01:27:48,04:23:12,-
1,James McDiarmid,GBR,2023,-,00:19:39,00:39:57,01:33:28,03:35:25,-
2,Tim Butler,GBR,2023,-,-,00:47:01,-,04:07:30,-
3,Levent Denizci,TUR,2023,-,-,00:50:38,01:56:38,04:28:48,-
4,Michael-Lucien Bergeron,CAN,2023,-,00:16:49,00:34:47,01:17:09,03:00:06,-
5,Chris Edwin,GBR,2023,00:05:29,00:18:21,00:36:02,01:20:27,03:00:06,-
6,Stefan Nygard,SWE,2023,-,-,-,-,-,-
7,Barry Goldmeier,USA,2023,-,-,-,-,05:53:46,00:10:37
8,Jake Lodge,GBR,2023,00:05:55,00:20:11,-,01:45:28,-,-
9,Kacper Suchora,POL,2023,-,00:31:36,-,-,-,-


In [14]:
# Rebuild the joggler/nationality lookup so this cell can run on its own
nationality_df = (
    data.loc[:, ['Joggler', 'Nationality']]
    .drop_duplicates()
    .reset_index(drop=True)
    .replace({'0': 'Unknown'})
)
# Number of jogglers per known nationality ('Unknown' rows are excluded)
country_df = (
    nationality_df.loc[nationality_df['Nationality'] != 'Unknown']
    .groupby('Nationality')
    .count()
    .reset_index()
    .rename(columns={'Joggler': 'Joggler Count'})
)
country_df

Unnamed: 0,Nationality,Joggler Count
0,AUT,1
1,BEL,1
2,CAN,10
3,CZE,1
4,DEU,13
5,EST,1
6,FSM,1
7,GBR,22
8,IND,1
9,IRL,3
