# Basic Analysis

In [2]:
import pandas as pd

In [7]:
# Read the Olympic medal records into the DataFrame `oo`.
# skiprows=4 tells read_csv to ignore the first four lines of the file, so the
# column names are taken from the fifth line (presumably the file opens with a
# short non-tabular preamble -- confirm against the raw CSV).
oo = pd.read_csv(filepath_or_buffer='../../data/olympics.csv', skiprows=4)
# Preview the first five rows (5 is head()'s default, written out here).
oo.head(n=5)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


### value_counts()

In [8]:
# value_counts() is one of the most useful methods in pandas. It returns a
# Series of counts. Called on the whole DataFrame, as here, each count is the
# number of times a distinct row (one combination of all column values) occurs;
# called on a single column it counts that column's distinct values.
# Two things to keep in mind:
#   * Ordering - the most frequent entry comes first, then the second most
#     frequent, and so on. Pass ascending=True to reverse the order.
#   * Missing data - dropna is True by default, so NA (missing) values are left
#     out of the counts. If the data set has a significant number of NA values
#     this can be misleading; pass dropna=False to include them.
# The keyword arguments below are the defaults, written out for illustration.
oo.value_counts(sort=True, ascending=False, dropna=True)

City       Edition  Sport      Discipline       Athlete                     NOC  Gender  Event                                Event_gender  Medal 
Moscow     1980     Hockey     Hockey           SINGH, Singh                IND  Men     hockey                               M             Gold      2
Amsterdam  1928     Aquatics   Diving           BECKER-PINKSTON, Elizabeth  USA  Women   10m platform                         W             Gold      1
Moscow     1980     Rowing     Rowing           DREIFKE, Joachim            GDR  Men     double sculls (2x)                   M             Gold      1
                                                FRÖHLICH, Silvia            GDR  Women   four-oared shell with coxswain (4-)  W             Gold      1
                                                FROLOVA, Nina               URS  Women   eight with coxswain (8+)             W             Silver    1
                                                                                             

In [10]:
# Count the medals presented at each Olympic edition (year). Because
# value_counts() orders by frequency, the editions run from the Games where
# the most medals were presented down to the Games where the fewest were.
# Here 2008 comes first and, for whatever reason, more medals were presented
# at the 2000 Games than at the 2004 Games.
oo['Edition'].value_counts()

Edition
2008    2042
2000    2015
2004    1998
1996    1859
1992    1705
1988    1546
1984    1459
1980    1387
1976    1305
1920    1298
1972    1185
1968    1031
1964    1010
1952     889
1912     885
1956     885
1924     884
1960     882
1936     875
1948     814
1908     804
1928     710
1932     615
1900     512
1904     470
1896     151
Name: count, dtype: int64

In [12]:
# Over the history of the Olympics covered by this data set (1896 to 2008),
# how many medals were presented to men and how many to women?
oo['Gender'].value_counts()

Gender
Men      21721
Women     7495
Name: count, dtype: int64

In [13]:
# The same count of medals by gender, this time in ascending order: the least
# frequent value is listed first.
oo['Gender'].value_counts(ascending=True)

Gender
Women     7495
Men      21721
Name: count, dtype: int64

In [15]:
# Medals by gender in ascending order, with dropna written out explicitly.
# dropna=True is the default, so NA (missing) values are excluded from the
# counts; pass dropna=False instead to have them counted as well.
oo['Gender'].value_counts(ascending=True, dropna=True)

Gender
Women     7495
Men      21721
Name: count, dtype: int64

### sort_values()

In [17]:
# sort_values() sorts the values in a Series. Think of the Series as a single
# column (axis 0): its contents are sorted in ascending order by default, and
# NaNs (missing data) are placed right at the end by default.
# sort_values() is particularly useful on a DataFrame, where several columns
# can be sorted together, each in ascending or descending order.
oo['Athlete'].sort_values()

651                 AABYE, Edgar
2849       AALTONEN, Arvo Ossian
2852       AALTONEN, Arvo Ossian
7716    AALTONEN, Paavo Johannes
7730    AALTONEN, Paavo Johannes
                  ...           
603                   ÖSTMO, Ole
608                   ÖSTMO, Ole
621                   ÖSTMO, Ole
596                   ÖSTMO, Ole
8051           ÖSTRAND, Per-Olof
Name: Athlete, Length: 29216, dtype: object

In [18]:
# sort_values() returns the sorted Series rather than changing `oo`, so assign
# the result to a variable in order to keep it.
ath = oo['Athlete'].sort_values()
ath

651                 AABYE, Edgar
2849       AALTONEN, Arvo Ossian
2852       AALTONEN, Arvo Ossian
7716    AALTONEN, Paavo Johannes
7730    AALTONEN, Paavo Johannes
                  ...           
603                   ÖSTMO, Ole
608                   ÖSTMO, Ole
621                   ÖSTMO, Ole
596                   ÖSTMO, Ole
8051           ÖSTRAND, Per-Olof
Name: Athlete, Length: 29216, dtype: object

In [20]:
# Sort the DataFrame by Olympic edition and then by athlete name. Because
# several columns are used as sort keys, they are passed to `by` as a list.
# Rows are ordered by Edition first and, within each edition, by Athlete --
# which is why the last rows shown are 2008 athletes with names such as
# ZUBARI and ZUEVA.
# ascending=True is the default and is written out only for clarity.
oo.sort_values(
    by=['Edition', 'Athlete'],
    ascending=True,
)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
7,Athens,1896,Aquatics,Swimming,"ANDREOU, Joannis",GRE,Men,1200m freestyle,M,Silver
82,Athens,1896,Gymnastics,Artistic G.,"ANDRIAKOPOULOS, Nicolaos",GRE,Men,rope climbing,M,Gold
110,Athens,1896,Gymnastics,Artistic G.,"ANDRIAKOPOULOS, Nicolaos",GRE,Men,"team, parallel bars",M,Silver
111,Athens,1896,Gymnastics,Artistic G.,"ATHANASOPOULOS, Spyros",GRE,Men,"team, parallel bars",M,Silver
48,Athens,1896,Cycling,Cycling Road,"BATTEL, Edward",GBR,Men,individual road race,M,Bronze
...,...,...,...,...,...,...,...,...,...,...
28095,Beijing,2008,Equestrian,Dressage,"ZU-SAYN WITTGENSTEIN, Nathalie",DEN,Women,team,X,Bronze
28819,Beijing,2008,Sailing,Sailing,"ZUBARI, Shahar",ISR,Men,RS:X - Windsurfer,M,Bronze
28977,Beijing,2008,Taekwondo,Taekwondo,"ZUBCIC, Martina",CRO,Women,49 - 57 kg,W,Bronze
28387,Beijing,2008,Gymnastics,Rhythmic G.,"ZUEVA, Natalia",RUS,Women,group competition,W,Gold


### Boolean indexing

In [22]:
# Preview the first five rows again before filtering (5 is head()'s default).
oo.head(n=5)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [24]:
# Boolean vectors (conditions) can be used to filter data. A condition on a
# Series yields a Series of True/False values; passing that to a Series or
# DataFrame selects and displays the rows where the value is True.
# Python's `and`, `or` and `not` keywords cannot be used to combine such
# conditions; use the element-wise operators instead:
#   and -> &
#   or  -> |
#   not -> ~
# When combining multiple conditions, wrap each one in parentheses.
oo['Medal'] == 'Gold'

0         True
1        False
2        False
3         True
4        False
         ...  
29211    False
29212    False
29213    False
29214     True
29215    False
Name: Medal, Length: 29216, dtype: bool

In [25]:
# To get a DataFrame of every row where an athlete won a gold medal, put the
# condition inside the DataFrame's square brackets. The result keeps all
# columns and only the rows where Medal is 'Gold'.
oo[oo['Medal'] == 'Gold']

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
6,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,1200m freestyle,M,Gold
9,Athens,1896,Aquatics,Swimming,"NEUMANN, Paul",AUT,Men,400m freestyle,M,Gold
13,Athens,1896,Athletics,Athletics,"BURKE, Thomas",USA,Men,100m,M,Gold
...,...,...,...,...,...,...,...,...,...,...
29199,Beijing,2008,Wrestling,Wrestling Gre-R,"GUENOT, Steeve",FRA,Men,60 - 66kg,M,Gold
29203,Beijing,2008,Wrestling,Wrestling Gre-R,"KVIRKELIA, Manuchar",GEO,Men,66 - 74kg,M,Gold
29206,Beijing,2008,Wrestling,Wrestling Gre-R,"MINGUZZI, Andrea",ITA,Men,74 - 84kg,M,Gold
29210,Beijing,2008,Wrestling,Wrestling Gre-R,"KHUSHTOV, Aslanbek",RUS,Men,84 - 96kg,M,Gold


In [27]:
# Multiple conditions: women athletes who have won a gold medal.
# Each condition sits in its own parentheses and the two are joined with &.
oo[(oo['Gender'] == 'Women') & (oo['Medal'] == 'Gold')]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
417,Paris,1900,Golf,Golf,"ABBOTT, Margaret Ives",USA,Women,individual,W,Gold
641,Paris,1900,Tennis,Tennis,"COOPER, Charlotte",GBR,Women,mixed doubles,X,Gold
649,Paris,1900,Tennis,Tennis,"COOPER, Charlotte",GBR,Women,singles,W,Gold
710,St Louis,1904,Archery,Archery,"HOWELL, Matilda Scott",USA,Women,double columbia round (50y - 40y - 30y),W,Gold
713,St Louis,1904,Archery,Archery,"HOWELL, Matilda Scott",USA,Women,double national round (60y - 50y),W,Gold
...,...,...,...,...,...,...,...,...,...,...
29134,Beijing,2008,Weightlifting,Weightlifting,"CAO, Lei",CHN,Women,75kg,W,Gold
29147,Beijing,2008,Wrestling,Wrestling Free.,"HUYNH, Carol",CAN,Women,- 48kg,W,Gold
29155,Beijing,2008,Wrestling,Wrestling Free.,"YOSHIDA, Saori",JPN,Women,48 - 55kg,W,Gold
29163,Beijing,2008,Wrestling,Wrestling Free.,"ICHO, Kaori",JPN,Women,55 - 63kg,W,Gold


### String handling

In [28]:
# The pandas string-handling methods generally share their names with the
# equivalent scalar string methods built into Python. They are reached through
# the Series' `str` attribute, which gives access to several common string
# methods such as contains, startswith and isnumeric.

# Looking for the athlete "Flo Jo": flag every name that contains 'Florence'.
oo['Athlete'].str.contains('Florence')

0        False
1        False
2        False
3        False
4        False
         ...  
29211    False
29212    False
29213    False
29214    False
29215    False
Name: Athlete, Length: 29216, dtype: bool

In [32]:
# Use the mask to filter the DataFrame so the matching athlete names (and all
# other columns) are visible.
# na=False makes a missing Athlete value count as "no match". Without it, a
# missing name can leave NaN in the mask, and boolean indexing then raises
# "Cannot mask with non-boolean array containing NA / NaN values".
oo[oo['Athlete'].str.contains('Florence', na=False)]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
1843,London,1908,Skating,Figure skating,"SYERS, Florence",GBR,Women,individual,W,Gold
1848,London,1908,Skating,Figure skating,"SYERS, Florence",GBR,Women,pairs,X,Bronze
4173,Paris,1924,Aquatics,Swimming,"BARKER, Florence",GBR,Women,4x100m freestyle relay,W,Silver
8162,Helsinki,1952,Athletics,Athletics,"FOULDS-PAUL, June Florence",GBR,Women,4x100m relay,W,Bronze
9060,Melbourne / Stockholm,1956,Athletics,Athletics,"FOULDS-PAUL, June Florence",GBR,Women,4x100m relay,W,Silver
10849,Tokyo,1964,Athletics,Athletics,"AMOORE-POLLOCK, Judith Florence",AUS,Women,400m,W,Bronze
16817,Los Angeles,1984,Athletics,Athletics,"GRIFFITH-JOYNER, Florence",USA,Women,200m,W,Silver
18287,Seoul,1988,Athletics,Athletics,"GRIFFITH-JOYNER, Florence",USA,Women,100m,W,Gold
18305,Seoul,1988,Athletics,Athletics,"GRIFFITH-JOYNER, Florence",USA,Women,200m,W,Gold
18347,Seoul,1988,Athletics,Athletics,"GRIFFITH-JOYNER, Florence",USA,Women,4x100m relay,W,Gold
