# Joggling Analysis and Testing for Streamlit
Scott Jenkins, May 2023

In [1]:
import pandas as pd
from datetime import datetime
# Display every row of a DataFrame instead of truncating long tables.
pd.set_option('display.max_rows', None)

In [2]:
# Read the results table from disk and preview its first rows.
RESULTS_CSV = 'test_results.csv'
data = pd.read_csv(RESULTS_CSV)
data.head()

Unnamed: 0,Date,Distance,Event / Venue,Joggler,Gender,Nationality,Finish Time,Drops,Notes / Result Links
0,23/05/2022,3b Mile,Time Trial,Henry Wellenstein,M,USA,00:04:39,0,
1,01/02/2003,3b Mile,"Atlanta, Georgia",Will Howard,M,USA,00:04:42,?,http://www.atlantajugglers.org/aja-media/artic...
2,08/05/2018,3b Mile,Time Trial,Zach Prescott,M,USA,00:04:43,?,https://www.runnersworld.com/runners-stories/a...
3,1986,3b Mile,"IJA, San Jose",Kirk Swenson,M,USA,00:04:43,1,http://www.juggling.org/orgs/ija/champs.txt ht...
4,1985,3b Mile,"IJA, Atlanta",Kirk Swenson,M,USA,00:04:47,?,http://www.juggling.org/orgs/ija/champs.txt


In [3]:
# (number of distinct jogglers, number of result rows)
data['Joggler'].nunique(dropna=False), data.shape[0]

(201, 645)

### All-Time Lists

In [4]:
def all_time_list(distance, results=None):
    """Build the all-time ranking table for one distance.

    Each joggler appears once, with the row of their fastest 'Finish Time'
    at ``distance``. Times are compared as strings, exactly as they are stored
    in the results table.

    Parameters
    ----------
    distance : str
        Value of the 'Distance' column to rank, e.g. ``'5b Mile'``.
    results : pandas.DataFrame, optional
        Results table with the columns 'Distance', 'Joggler', 'Gender',
        'Nationality', 'Date', 'Event / Venue' and 'Finish Time'.
        Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'Ranking', 'Joggler', 'Gender', 'Nationality', 'Date',
        'Event / Venue', 'Finish Time', ordered by ranking. Tied times share
        the same (minimum) rank and are listed alphabetically by joggler.
        The frame is empty when no row matches ``distance``.
    """
    if results is None:
        results = data

    # Restrict to the requested distance *before* picking personal bests, so a
    # joggler's identical time at another distance can never leak into the list.
    at_distance = results.loc[results['Distance'] == distance]

    personal_bests = (
        at_distance
        .dropna(subset=['Finish Time'])            # a missing time cannot be ranked
        .sort_values('Finish Time', kind='stable')  # stable: first-listed row wins among equal times
        .drop_duplicates(subset='Joggler', keep='first')
        .assign(
            Ranking=lambda df: df['Finish Time'].rank(method='min').astype(int),
            Nationality=lambda df: df['Nationality'].replace({'0': 'Unknown'}),
        )
    )

    output_columns = ['Ranking', 'Joggler', 'Gender', 'Nationality',
                      'Date', 'Event / Venue', 'Finish Time']
    return (
        personal_bests[output_columns]
        .sort_values(['Ranking', 'Joggler'])
        .reset_index(drop=True)
    )

# Example: all-time ranking for the '5b Mile' distance.
all_time_list(distance='5b Mile')

Unnamed: 0,Ranking,Joggler,Gender,Nationality,Date,Event / Venue,Finish Time
0,1,Matthew Feldman,M,USA,27/07/2012,Time Trial,00:06:33
1,2,Billy Gillen,M,USA,1989,Time Trial,00:07:41
2,3,Ben Schoenberg,M,Unknown,1996,"IJA, Rapid City",00:10:36
3,4,Barry Goldmeier,M,USA,1996,"IJA, Rapid City",00:10:37


### Joggler Pivot

In [5]:
# Get the year of each result...

def record_year(sample_date):
    """Return the year of a result date.

    Parameters
    ----------
    sample_date : str or int
        Either a full date in ``DD/MM/YYYY`` form, or a bare year
        (anything ``int()`` accepts, e.g. ``'1986'`` or ``1986``).

    Returns
    -------
    int
        The four-digit year of a full date, or the value converted with ``int``.

    Raises
    ------
    ValueError, TypeError
        If the value is neither a ``DD/MM/YYYY`` date nor convertible by ``int``.
    """
    try:
        # Full date available: take its year.
        return datetime.strptime(sample_date, '%d/%m/%Y').year
    except (ValueError, TypeError):
        # ValueError: the string is not DD/MM/YYYY; TypeError: the value is not a string.
        # In both cases only the year is recorded, so use it directly.
        return int(sample_date)


## Apply this function to all dates
# Only the 'Date' column is needed, so map over that Series instead of
# applying a lambda to every full row.
data['Year'] = data['Date'].map(record_year)

In [6]:
# Distinct (joggler, nationality) pairs; the placeholder value '0' is shown as 'Unknown'.
nationality_df = (
    data.loc[:, ['Joggler', 'Nationality']]
    .drop_duplicates()
    .reset_index(drop=True)
    .replace({'0': 'Unknown'})
)
nationality_df.head()

Unnamed: 0,Joggler,Nationality
0,Henry Wellenstein,USA
1,Will Howard,USA
2,Zach Prescott,USA
3,Kirk Swenson,USA
4,Banks Helfrich,Unknown


In [11]:
# Latest 'Year' with a recorded result, per joggler.
recency_df = (
    data.groupby('Joggler')['Year']
    .max()
    .reset_index()
    .rename(columns={'Year': 'Year Most Recently Active'})
)
recency_df.head()

Unnamed: 0,Joggler,Year Most Recently Active
0,Aaron Scott,2020
1,Adam Griffin,1996
2,Albert Lucas,1996
3,Andrea Goranson,1992
4,Andrew Head,1985


In [12]:
# One row per joggler, one column per distance, holding the smallest
# 'Finish Time' for that pair; distances with no result are shown as '-'.
pivot_df = (
    data.pivot_table(
        index='Joggler',
        columns='Distance',
        values='Finish Time',
        aggfunc='min',
    )
    .loc[:, ['3b Mile', '3b 5km', '3b 10km', '3b Half Marathon', '3b Marathon', '5b Mile']]
    .reset_index()
    .fillna('-')
)
pivot_df.head()

Distance,Joggler,3b Mile,3b 5km,3b 10km,3b Half Marathon,3b Marathon,5b Mile
0,Aaron Scott,00:05:21,00:17:47,-,-,-,-
1,Adam Griffin,00:06:07,-,-,-,-,-
2,Albert Lucas,00:05:27,00:20:39,-,-,03:29:04,-
3,Andrea Goranson,00:10:56,-,-,-,-,-
4,Andrew Head,00:05:29,-,-,-,-,-


In [14]:
# Join nationality, most-recent-year and per-distance best times on 'Joggler'.
# `DataFrame.style` returns a separate Styler and never modifies the frame,
# so no Styler is built here: the plain frame is what gets displayed.
joggler_df = (
    nationality_df
    .merge(recency_df, on='Joggler')
    .merge(pivot_df, on='Joggler')
)
joggler_df

Unnamed: 0,Joggler,Nationality,Year Most Recently Active,3b Mile,3b 5km,3b 10km,3b Half Marathon,3b Marathon,5b Mile
0,Henry Wellenstein,USA,2022,00:04:39,-,-,-,-,-
1,Will Howard,USA,2003,00:04:42,-,-,-,-,-
2,Zach Prescott,USA,2018,00:04:43,-,-,-,-,-
3,Kirk Swenson,USA,1986,00:04:43,00:16:58,-,-,-,-
4,Banks Helfrich,Unknown,1985,00:05:06,-,-,-,-,-
5,Jim Piaskowy,Unknown,1984,00:05:06,-,-,-,-,-
6,Reese Edwards,Unknown,1997,00:05:13,00:18:21,-,-,-,-
7,Sean McLoughlin,Unknown,1982,00:05:15,00:19:00,-,-,-,-
8,Mike Hebebrand,USA,2002,00:05:16,00:17:37,00:35:45,-,03:32:00,-
9,Aaron Scott,USA,2020,00:05:21,00:17:47,-,-,-,-


In [23]:
# Distinct (joggler, nationality) pairs, with the placeholder '0' shown as 'Unknown'.
nationality_df = (
    data.loc[:, ['Joggler', 'Nationality']]
    .drop_duplicates()
    .reset_index(drop=True)
    .replace({'0': 'Unknown'})
)

# Number of jogglers per known nationality ('Unknown' is excluded).
country_df = (
    nationality_df
    .loc[lambda pairs: pairs['Nationality'] != 'Unknown']
    .groupby('Nationality')
    .count()
    .reset_index()
    .rename(columns={'Joggler': 'Joggler Count'})
)
country_df

Unnamed: 0,Nationality,Joggler Count
0,AUT,1
1,CAN,8
2,GBR,15
3,GER,2
4,IRE,1
5,ISR,1
6,JPN,1
7,NED,1
8,NOR,1
9,POL,1
