### Import packages

In [None]:
import pandas  as pd
import numpy as np

### Create pandas dataframe by reading csv file

In [3]:
df = pd.read_csv('medals.csv')

### How does data look in general?

In [85]:
df.head()

Unnamed: 0,Games,Year,Sport,Discipline,Athlete,Team,Gender,Event,Medal,Gold,Silver,Bronze
0,Athens (1896),1896,Aquatics,Swimming,alfred hajos,HUN,Men,100m freestyle men,Gold,1,0,0
1,Athens (1896),1896,Aquatics,Swimming,Otto Herschmann,AUT,Men,100m freestyle men,Silver,0,1,0
2,Athens (1896),1896,Aquatics,Swimming,dimitrios drivas,GRE,Men,100m freestyle for sailors men,Bronze,0,0,1
3,Athens (1896),1896,Aquatics,Swimming,Ioannis Malokinis,GRE,Men,100m freestyle for sailors men,Gold,1,0,0
4,Athens (1896),1896,Aquatics,Swimming,spiridon chasapis,GRE,Men,100m freestyle for sailors men,Silver,0,1,0


### Is data missing?

In [86]:
df.isnull().sum()

Games         0
Year          0
Sport         0
Discipline    0
Athlete       0
Team          0
Gender        0
Event         0
Medal         0
Gold          0
Silver        0
Bronze        0
dtype: int64

$\rightarrow$ no entry is NaN

### Are Sport column and Discipline column the same for Athletics?

In [10]:
# compare all entries in dataframe of rows that have Sport==Athletics with rows that have Discipline==Athletics
comparison = (df[df.Sport=='Athletics'] == df[df.Discipline=='Athletics'])
#print(comparison)

In [11]:
# False equals a 0, True a 1
comparison.sum()

Games         3867
Year          3867
Sport         3867
Discipline    3867
Athlete       3867
Team          3867
Gender        3867
Event         3867
Medal         3867
Gold          3867
Silver        3867
Bronze        3867
dtype: int64

In [12]:
len(comparison)

3867

$\rightarrow$ Sum of each column equals the length of the dataframe, meaning all are identical! 
<br/>
We can assume Athletics to mean Sport==Athletics and/or Discipline==Athletics

### Is gold medal data 0/1 encoding consistent?

In [90]:
pd.unique(df.Gold)

array([ 1,  0, 10])

In [91]:
df[df.Gold==10]

Unnamed: 0,Games,Year,Sport,Discipline,Athlete,Team,Gender,Event,Medal,Gold,Silver,Bronze
16927,Los Angeles (1984),1984,Athletics,Athletics,Carl Lewis,USA,Men,long jump men,Gold,10,0,0


$\rightarrow$ no, fix entry with Gold=1 to not falsify aggregate 

In [92]:
df.at[16927, 'Gold']  = 1

In [93]:
df.iloc[16927]

Games         Los Angeles (1984)
Year                        1984
Sport                  Athletics
Discipline             Athletics
Athlete               Carl Lewis
Team                         USA
Gender                       Men
Event              long jump men
Medal                       Gold
Gold                           1
Silver                         0
Bronze                         0
Name: 16927, dtype: object

In [94]:
print(pd.unique(df.Medal))

['Gold' 'Silver' 'Bronze']


$\rightarrow$ consistent, can be used

is data correct for other medals?

In [95]:
print(pd.unique(df.Silver))
print(pd.unique(df.Bronze))

[0 1]
[0 1]


$\rightarrow$ consistent

### Question 1: Find the five athletes with the most gold medals in athletics

In [114]:
# select all rows of gold medalists in athletics, group them by athlete's name and sort descending by the sum of their gold medals
athletics_medalists_sorted = df[(df.Sport=='Athletics') & (df.Medal=='Gold')][['Athlete','Gold']].groupby(['Athlete']).sum().sort_values(by = ['Gold'],ascending=[False])

In [107]:
athletics_medalists_sorted[:5]

Unnamed: 0_level_0,Gold
Athlete,Unnamed: 1_level_1
Carl Lewis,20
Usain Bolt,9
Paavo Nurmi,9
Allyson Felix,6
Ville Ritola,5


### Question 2: Find the 5 athletes with gold medals in the largest number of (different) athletics events 

In [137]:
# select all rows of gold medalists, group them by athlete's name and sort descending by the sum of their gold medals
# reset indices to not use atheletes' names 
events_medalists_sorted = df[(df.Medal=='Gold') & (df.Discipline=='Athletics')][['Athlete','Event','Gold']].drop_duplicates(['Event']).groupby(['Athlete']).sum().sort_values(by = ['Gold'],ascending=[False]).reset_index()

In [138]:
events_medalists_sorted[:5]

Unnamed: 0,Athlete,Gold
0,Hannes Kolehmainen,3
1,Usain Bolt,3
2,Mohamed Farah,2
3,Mildred Didrikson,2
4,Ellery Clark,2
