In [1]:
import pandas as pd 
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot 

In [2]:
# Load Clean data 
clean = pd.read_csv('all_data/clean_data.csv')

In [3]:
from IPython.display import HTML

HTML('''<script>
code_show = true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Mini Literature Review 

The Olympics data set has already been widely studied. Therefore, instead of repeating an analysis already available online, this section is aimed at summarising the available key findings about the data set. Doing so will crystallise what questions still remain unanswered and help come up with a novel analysis topic. To aid this summary, plots highlighting some of the key findings are generated in the notebook. Parts of this summary have also been extended and include my own analysis. 

### General
__Key findings :__

- The number of athletes, events, and nations has grown dramatically since 1896. For the Summer Games growth levelled off around 2000 for and may have reached a saturation point, with around 300 events and 10,000 athletes. While, the number of athletes in Winter Olympics keeps growing [1][3].
- Since 1896, there have been 51 Summer and Winter Olympic Games [7]. 
- The first modern era Winter Olympics took place in 1924, 28 years after the Summer Games begun [7].
- Up until the Barcelona 1992 Olympics, Summer and Winter Games were hosted during the same year, in four year intervals. After 1992 the year when the Summer and Winter Olympics were hosted was split up so that the Winter Games occur cyclical two year after the Summer Games [2]. This progression can be seen on the figure below. 
- 1916, 1940 & 1944  Olympic games didn't happen due to the World Wars [1][3]. 

In [4]:
## 'Boring' graph
# clean.groupby(['Year', 'Season'])['Season'].count().unstack().plot(kind='bar',figsize=(14,6))
# plt.title('Number of Athletes at Olympic Games')
# plt.ylabel('Number of Athletes')
# plt.show()

In [5]:
sth = clean.groupby(['Year', 'City'])['Season'].value_counts()
df4plot = pd.DataFrame(data={'Num_Athletes': sth.values}, index = sth.index).reset_index()
df_Summer = df4plot[df4plot['Season']=='Summer'] 
df_Winter = df4plot[df4plot['Season']=='Winter']

In [6]:
init_notebook_mode(connected=True)

traceS = go.Bar(x = df_Summer['Year'],y = df_Summer['Num_Athletes'],name="Summer Games",
                marker=dict(color='rgb(270,180,0)',opacity=1,))

traceW = go.Bar(x = df_Winter['Year'],y = df_Winter['Num_Athletes'],name="Winter Games",
                marker=dict(color='rgb(20,200,255)'))

layout = dict(title = 'Number of Athletes in Olympic Games',
          xaxis = dict(title = 'Year', showticklabels=True), 
          yaxis = dict(title = 'Number of athlets'),
          barmode='stack')

fig = dict(data= [traceS, traceW], layout=layout)
iplot(fig, filename='Number_athletes_olympiads')

## Sports and Events

__Key findings :__
- The composition of sporting events and types changed thought history. Some sport disciplines like Lacrosse (1904) and the Basque Pelota (1900) were only ever present at the Games once, whereas other disciplines such as Gymnastics and Swimming were a present at all the Olympiads [3]. 
- Today Olympics are associated exclusively with sport competitions, but between 1916 and 1948, the Games also included Art Competitions, where artists from different countries competed on events such as Painting, Sculpting and Architecture. The Art Competitions were dominated by Germany, France, and Italy. The number of Art Competitions events peaked to 19 when the Games were held in Nazi Germany. That year Germany won 13 medals for this games, beating all other countries and using this fact for political propaganda [1]. 
- Most popular sports throughout time: Athletics, Gymnastics, Swimming, Shooting [2]
- Since year 2000, the number of sport disciplines at the Summer Olympics has peaked at 34, this is the highest in modern history [3]. As far as the Winter Olympics the maximum number of different sport categories was 15, reached in  2014.

Print-out and plots below illustrate the above findings.

In [7]:
Winter = clean[clean['Season']=='Winter']
print('Max number of sports at Winter Olympics:')
Winter.groupby('Year')['Sport'].nunique().reset_index().max()

Max number of sports at Winter Olympics:


Year     2014
Sport      15
dtype: int64

In [8]:
Summer = clean[clean['Season']=='Summer'].pivot_table(clean, index=['Year','Sport'], 
                                        aggfunc=lambda x: len(x.unique())).reset_index()[['Year','Sport','Event']]
Summer = Summer.pivot("Sport", "Year", "Event")
Summer.fillna(0,inplace=True)

Winter = clean[clean['Season']=='Winter'].pivot_table(clean, index=['Year','Sport'], 
                                        aggfunc=lambda x: len(x.unique())).reset_index()[['Year','Sport','Event']]
Winter = Winter.pivot("Sport", "Year", "Event")
Winter.fillna(0,inplace=True)

In [11]:
from plotly import tools

trace_Summer = go.Heatmap(z = Summer.values, y = list(Summer.index) , x = list(Summer.columns), name = 'Summer Games',
                   colorscale = 'YlOrRd', reversescale = True, colorbar = dict(title = '#Events',x = 0.43))

trace_Winter = go.Heatmap(z = Winter.values, y = list(Winter.index) , x = list(Winter.columns), name = 'Winter Games',
                   colorscale = 'Blues', reversescale = True, colorbar = dict(title = '#Events'))

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=('Summer Games', 'Winter Games'))

fig.append_trace(trace_Summer, 1, 1)
fig.append_trace(trace_Winter, 1, 2)

fig['layout']['xaxis1'].update(title='Year',  domain=[0.025, 0.425])
fig['layout']['xaxis2'].update(title='Year', domain=[0.585, 1])

fig['layout']['yaxis1'].update(title='Sport', tickangle = 345, tickfont=dict(size=8))
fig['layout']['yaxis2'].update(title='Sport',tickangle = 345, tickfont=dict(size=8))
                          
fig['layout'].update(height=800, width=1400, 
                     title='Evolution of Sport disciplines in the Olympics with time')

iplot(fig, filename='heatmap')

### Aside: on *exotic* Olympic Sports and the overall Olympic Sport mix.
The choice of Olympic Sport disciplines throughout history raises some eyebrows, or at least it raised mine: What is Aeronautics, Skeleton, Basque Pelota, Jeu De Paume etc.?  What are the 'old' discontinued sports and the 'newcomers'? 

Looking at the __Summer Olympics__, some 'old' disciplines which have only featured in the Olympics once:
- Aeronautics (Berlin, 1936): Refers to flying planes, or more specifically gliding; whereby aircrafts were catapulted into the air and judged for most spectacular aerobatics. This were mainly demonstration events, with the hopes of becoming fully events in the future [8].
- Motor-boating (London, 1908): Was a motorboat race! 3 events were hosted and due to adverse either conditions only one boat finished each of these. One gold medal went to France and two to the UK [12].
- Basque Pelota (Paris, 1900): *Is the name for a variety of court sports played with a ball using one's hand, a racket, a wooden bat or a basket, against a wall or with two teams face to face separated by a line on the ground or a net. The roots of this class of games can be traced to the Greek and other ancient cultures*. At the 1900 Summer Olympics, a Basque pelota tournament was contested. Only two teams competed, so only one match was played. Spain won Gold and France Silver. This was the only Olympiad where pelota was an official sport, being revived at the 1924, 1968 and 1992 Games as a demonstration sport [9].
- Jeu de paume (London, 1908): Is a ball-and-court game that originated in France and is somewhat related to Basque Pelota. It was an indoor precursor of tennis played without racquets. This game is believed to have been the originator of many related sports, such as: Fist-ball, Fives, Squash, Badminton and others [10]. 
- Racquets  (London, 1908): Is another sport similar to the ones above, but originating in the UK, Ireland, USA and Canada. Racquets was only present in the 1908 Olympics. 
    
The above gives a surprising insight into the evolution of racquet sports in general. These were clearly not as well defined and established as today. One could say that the disciplines of Racquets, Basque Pelota and Jeu de paume are represented at the Olympics today, through their modern day analogies, such as: Tennis, Table Tennis and Badminton, but not Squash. Albeit bering much resemblance to say Basque Pelota, squash has never qualified as an Olympic discipline. So far it was only accepted as a demonstration sport for the 2018 Summer Youth Olympics [11].

Some noteworthy 'newcomers' to the summer olympics are sports like Golf,  Water Polo, Rugby and Taekwondo. Martial arts have a long standing presence at the olympics; wrestling being among the oldest olympic disciplines and Judo present since 1964. Which brings the question; why not include other martial arts? The IOC has not reached agreement on either Sumo or Karate, although an exception will be made for Sumo in the 2020 Tokyo Games. Another 'new' Summer Olympic discipline is Trampolining, where athletes perform acrobatics while bouncing on a trampoline. Scoring is based on jump difficulty, time in the air and horizontal displacement from the centre of the trampoline bed... [13]. In my view, the inclusion of this discipline is questionable; ordinary acrobatics already includes aspects of trampolining while other popular sports such as squash or karate are being rejected from the olympics, presumably also due to lack of space.

Looking at the __Winter Olympics:__
- Skeleton and Luge are sports, which I have never heard of before. Skeleton racing involves plummeting head-first down a steep and treacherous ice track on a tiny sled. Fun fact is that skeleton is considered the world's first sliding sport [16]. Luge is also a sliding sport but here athletes go down facing up and feet first on a slightly different kind of sledge.  
- Alpinism is not a conventional sport competition rather medals were awarded for outstanding mountaineering feats. In 1924 medals were awarded to the participants of the 1922 Mount Everest expedition. The prize included posthumous medals for seven Sherpas who died in an avalanche. In 1932 German brothers Franz and Tony Schmidt received the Olympic Alpine Prize for their "first ascent of the North Face of the Matterhorn". In 1936 the prize was awarded to Gunter and Mrs. Dyrenfurth from Switzerland for their Himalayan exploration. In September 1946, the IOC agreed to drop the Alpinism Prize. However, Reinhold Messner and Jerzy Kukuczka were awarded Silver Medals at the 1988 Calgary Winter Games for successfully summiting each of the 14 8,000-meter peaks[15].
- Snowbording is the 'newcomer' sport at the Winter Olympics.

<br>
Who decides which sports are 'Olympic'? <br>
This deacon is taken by the IOC (International Olympic Committee). which is a Swiss private non-governmental organisation and is the the authority responsible for organising the modern Olympic Games.
    

## Geography 

__Key findings :__
- Geographic representation in the Games has grown since 1896, although Africa, Southeast Asia, the Middle East, and South America are still very under-represented [1]. The plot below visualises country participation in olympics. 

In [12]:
sth = clean.groupby(['Country'])['Year'].nunique()
df4plot = pd.DataFrame(data={'Participation': sth.values}, index = sth.index).reset_index()

trace = go.Choropleth(locations = df4plot['Country'],
                      locationmode='country names', 
                      z = df4plot['Participation'],
                      colorscale = 'YlOrRd',                      
                      reversescale = True,
                      marker = dict( line = dict(color = 'gray',width = 0.5)),
                      colorbar = dict(title = '#Olympics attended'))

layout = go.Layout(title = 'Olympic Partricipation',  width=1000, height=600,
                   geo = dict(projection = dict( type = 'equirectangular')))

fig = dict(data = [trace], layout = layout)
iplot(fig,filename='world_Choropleth')

### Country Specific Studies
- Some country-specific analysis have also been undertaken. Most of these focus on the USA as this is the country which holds the most medals overall. Other analysis focused on India [5] and Italy [4] as this were the authors home countries.
- USA has won the most medals overall, the country is strongest at Swimming, Athletics, Basketball, Rowing, Shooting etc. Notably, the American swimmer Michael Phelps holds the world record for the highest amount of gold medals won in history, 23 medals in total, won over 5 years [2].
- India won most of its gold medals in Hockey before the 1990s. Hockey used to be a National sport but interest in it amongst India’s youth has gradually diminished and so did the medal count. Cricket is a very popular sport in India, however since the early 1900s it is not an Olympic sport [5]. Indian woman's participation in the Olympics has followed the global trends and gradually increased with time [5].
- In Italy women participation also followed the global trends and gradually increased with time. Curiously, men participation started decreasing from 2008 onwards.

## Gender
__Key findings :__
- Only the first edition of Olympics didn't have any female athletes [3].
- Overall, sex ratio of participants: 27.5% Female and 72.5% Male [2]. However, Female participation increased dramatically; in 1976, not a single Olympic team was comprised of 50% women and in the 2014/2016 Olympics 15 teams had at least 50% females, lead by China (64%), Romania (58%), and the Ukraine (57%). A few countries won 100% of their medals in women’s events: Taiwan (5 medals in weightlifting and archery), India (2 medals in wrestling and badminton), Bulgaria (7 medals in the high jump, rhythmic gymnastics, and wrestling), and Portugal (1 medal in judo) [1].
- Nazi women dominated the medals in 1936, East German and Soviet women dominated in 1976, and American women dominated in 2016 [1].

In [13]:
fig = {
  "data": [
    {
      "values": clean[clean.Year<=1970]['Sex'].value_counts(),
      "labels": ["Male","Female"],
      "domain": {"x": [0, .325555]},
      "name": "before 1970",
      "hoverinfo":"label+percent+name",
      "hole": .6,
      "type": "pie"
    },
    {
      "values": clean[(clean.Year > 1970) & (clean.Year < 2000)]['Sex'].value_counts(),
      "labels": ["Male","Female"],      
      "textposition":"inside",
      "domain": {"x": [.33, .6555]},
      "name": "1970-2000",
      "hoverinfo":"label+percent+name",
      "hole": .6,
      "type": "pie"
    },
    {
      "values": clean[clean.Year >= 2000]['Sex'].value_counts(),
      "labels": ["Male","Female"],      
      "textposition":"inside",
      "domain": {"x": [.66, 1]},
      "name": "after 2000",
      "hoverinfo":"label+percent+name",
      "hole": .6,
      "type": "pie"
    }],
  "layout": {
        "title":"Evolution of Woman Participation in the Olympics ",
        "annotations": [
            {
                "font": {
                    "size": 14
                },
                "showarrow": False,
                "text": "before 1970",
                "x": 0.11,
                "y": 0.5
            },
            {
                "font": {
                    "size": 14
                },
                "showarrow": False,
                "text": "1970-2000",
                "x": 0.49,
                "y": 0.5
            },
            {
                "font": {
                    "size": 14
                },
                "showarrow": False,
                "text": "after 2000",
                "x": 0.88,
                "y": 0.5
            }
        ]
    }
}
iplot(fig, filename='sex')

## Age and Body Metrics 

__Key findings :__

- The size of Olympians have become more extreme over time. In most sports this means taller and heavier, but in a few sports such as gymnastics, athletes have become smaller [1]. Form 1936 to 2016, the weight for female Gymnasts has gone down from 60 to 50 kilograms on average [4]. Additionally, [4] Studied the body metric development of Gymnasts over time in depth. He concluded that: 
    - The weight for men has been more or less stable since 1964;
    - The height was approximately stable for both men and women.

- In Winter Olympic games, average age of medal winners is mostly higher than the non-medal winners [3].

- Gold medalists aged 50 and more won in disciplines such: shooting, archery, sailing and above all horse riding. This is intuitive, as this disciplines are less physically exacting, while older athletes benefit form experience [4].

- In Winter Olympic games, the average age of medal winners is mostly higher than the non-medal winners [3].


Plots and charts illustrating the findings described in this section have already been presented in the 1st notebook. 

## Medals 

__Key findings :__
- Top 5 Countries with most gold medals in descending order: USA, Soviet Union, Germany, Italy, Great Britain [2].
- Revisiting the map graph in the Geography section above, a correlation between a country's participation in the Olympics and the total medals accumulated can be observed. Unsurprisingly, the countries with the most medals are also amongst those with the highest participation rates. However, USA is well ahead Australia, Canada, France and UK, which all had near maximum participation rates. This brings to mind the idea that population size may have something to do with the final medal count. Thus, it could be interesting to explore the relationship between population size, economic wealth, olympic participation and the number of medals won. 

In [14]:
# create df with a medal count by type of medal and country
medals = pd.DataFrame(clean.groupby('Medal')['Country'].value_counts())\
                .rename(columns={'Country':'Medal_Count'}).reset_index() 
                # Out: 3 columns: Medal/Country/Medal count (by type of medal)

# make pivot table with country as rows and medal type as collumn, fill missing values with zero (no medals)
medals_piv = medals.pivot(index='Country',columns='Medal').fillna(0)

# add total medals column and total overall (number of athletes participating)
medals_piv['Total_Medals'] = medals_piv.Medal_Count.Bronze + medals_piv.Medal_Count.Silver \
                            + medals_piv.Medal_Count.Gold
medals_piv['Total_Athletes'] = medals_piv.Medal_Count.sum(axis=1)

# add column for total medals won as a % of total athelet participants 
medals_piv['%medalists'] = round(100* medals_piv['Total_Medals']/medals_piv['Total_Athletes'],1)

# sort data frame in descending order using the total medals column
medals_piv = medals_piv.sort_values(by='Total_Medals',ascending=False)

In [15]:
print('Top 15 countries by total medal count:') 
medals_piv.head(15)

Top 15 countries by total medal count:


Unnamed: 0_level_0,Medal_Count,Medal_Count,Medal_Count,Medal_Count,Total_Medals,Total_Athletes,%medalists
Medal,Bronze,Gold,No_Medal,Silver,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
USA,1358.0,2638.0,12967.0,1641.0,5637.0,18604.0,30.3
Russia,1178.0,1599.0,7745.0,1170.0,3947.0,11692.0,33.8
Germany,1260.0,1301.0,12031.0,1195.0,3756.0,15787.0,23.8
UK,651.0,677.0,10048.0,739.0,2067.0,12115.0,17.1
France,666.0,499.0,10784.0,602.0,1767.0,12551.0,14.1
Italy,531.0,575.0,9031.0,531.0,1637.0,10668.0,15.3
Sweden,535.0,479.0,6755.0,522.0,1536.0,8291.0,18.5
Canada,451.0,463.0,8329.0,438.0,1352.0,9681.0,14.0
Australia,522.0,368.0,6374.0,459.0,1349.0,7723.0,17.5
Hungary,371.0,432.0,5417.0,332.0,1135.0,6552.0,17.3


__Remarks:__ The countries with the maximum number of medals were identified by most of the cited sources. However, I have not come across an analysis taking into account the number of medals won as a fraction of the total number of Athletes sent to compete in the Olympics. Naturally, countries with more resources could send many more athletes to compete in the Olympics, consequently hoping to win more medals. Although I am sure some rules must exist to limit this type of behaviour and all athletes have to qualify. To investigate this idea further, in the table above, the column '%medalists' gives the average percentage of athletes holing a medal out of all the athletes sent to the Olympics. Even after sorting the table according to the '%medalists' column, USA and Russia hold dominance, but curiously Russia emerges as the leader. Throughout history, almost 34% of all the athletes Russia sends to the Olympics come back home with a medal. In the US this proportion is about 30%, followed by Germany with 24%, Pakistan with 22%, Norway and Sweden with 19% and Jamaica in 7th place with a surprising 18% rate. I am curious to see the evolution of this % figures with time and this is an area I will explore further in the section below. My hypothesis is that there will likely be trends for the different countries or at least peaks with a interesting historical explanations.  

In [24]:
traceG = go.Bar(x = medals_piv.index, y = medals_piv.Medal_Count.Gold.head(15).values, name="Gold",
               marker=dict( color='rgb(212,175,55)', opacity=0.7, reversescale = True))

traceS = go.Bar(x = medals_piv.index, y = medals_piv.Medal_Count.Silver.head(15).values, name="Silver",
               marker=dict( color='rgb(192,192,192)', opacity=1, reversescale = True))

traceB = go.Bar(x = medals_piv.index, y = medals_piv.Medal_Count.Bronze.head(15).values, name="Bronze",
               marker=dict( color='rgb(128,0,0)', opacity=0.5, reversescale = True))

trace_perc = go.Scatter(x= medals_piv.index , y = medals_piv['%medalists'].head(15).values,name="%_Medalists",  
                        yaxis='y2',marker=dict(color="Blue"),mode = "markers")

layout = go.Layout(title='Top Countries by Medals Won; in absolute and % terms', 
                   yaxis = dict(title = 'Number of Medals'),
                   yaxis2 = dict(title='% Medals won out of all attempts', anchor='x',overlaying='y',side='right'),
                   legend=dict( x=0.82,y=1, traceorder='normal',font=dict(color='#000'), bgcolor='#F2E2E2',
                               bordercolor='#FFFFFF',borderwidth=2), 
                   barmode='group')

fig = go.Figure(data = [traceG,traceS,traceB,trace_perc], layout = layout)
iplot(fig, filename = "medal")  

### Further explore top countries by absolute number of medals won and % medals won
__Food for thought:__ How should we think of 'Olympic excellence'? Is it in the absolute number of medals won or is it in the % of all athletes that came back home with a medal? Going by absolute numbers favours big, populous and/or relatively wealthy countries. <br>
Note: On the plots below the 'To 10' countries are selected based max %medals won. Then the absolute and % terms are plotted separately to see differences. 

In [17]:
# top 10 countries list:
top = ['Russia', 'USA', 'Germany', 'Pakistan', 'Norway', 'Sweden', 'Jamaica',
        'Netherlands', 'Australia', 'Hungary']

In [18]:
medals = pd.DataFrame(clean.groupby(['Year','Medal'])['Country'].value_counts())\
                .rename(columns={'Country':'Medal_Count'}).reset_index() 

In [19]:
# group medals DF by country
g = medals.groupby('Country')
# init empty dictionary of data frames to store values for top 15 countries
d = {country: pd.DataFrame() for country in top}

# loop over top 15 counries and calculate %medlas each year
for country in top: 
    d[country] = pd.DataFrame(g.get_group(country).drop('Country',axis=1).pivot(index='Year',columns='Medal').fillna(0))
    d[country] ['Total_Medals'] = d[country].Medal_Count.Bronze + d[country].Medal_Count.Silver \
                            + d[country].Medal_Count.Gold
    d[country]['Total_Athletes'] = d[country].Medal_Count.sum(axis=1)
    # add column for total medals won as a % of total athelet participants 
    d[country]['%medalists'] = round(100* d[country]['Total_Medals']/d[country]['Total_Athletes'],1)
# OUT: dictionary of data frames

In [20]:
init_notebook_mode(connected=True)
trace_Ru = go.Scatter(x = d['Russia'].index ,y = d['Russia']['Total_Medals'],name="Russia",
                      marker=dict(color="Red"),  opacity=0.5, mode = "markers+lines")

trace_USA = go.Scatter(x = d['USA'].index ,y = d['USA']['Total_Medals'],name="USA",
                    marker=dict(color="Blue"),  opacity=0.5, mode = "markers+lines")

trace_Ger = go.Scatter(x = d['Germany'].index ,y = d['Germany']['Total_Medals'],name="Germany",
                    marker=dict(color="black"),  opacity=0.5, mode = "markers+lines")

trace_Pak = go.Scatter(x = d['Pakistan'].index ,y = d['Pakistan']['Total_Medals'],name="Pakistan",
                    marker=dict(color="Orange"),  opacity=0.6, mode = "markers+lines")

trace_Nor = go.Scatter(x = d['Norway'].index ,y = d['Norway']['Total_Medals'],name="Norway",
                    marker=dict(color="cyan"),  opacity=0.5, mode = "markers+lines")

trace_Swe = go.Scatter(x = d['Sweden'].index ,y = d['Sweden']['Total_Medals'],name="Sweden",
                      marker=dict(color="darkred"),  opacity=0.5, mode = "markers+lines")

trace_jam = go.Scatter(x = d['Jamaica'].index ,y = d['Jamaica']['Total_Medals'],name="Jamaica",
                    marker=dict(color="purple"),  opacity=0.5, mode = "markers+lines")

trace_ned = go.Scatter(x = d['Netherlands'].index ,y = d['Netherlands']['Total_Medals'],name="Netherlands",
                    marker=dict(color="deeppink"),  opacity=0.5, mode = "markers+lines")

trace_aud = go.Scatter(x = d['Australia'].index ,y = d['Australia']['Total_Medals'],name="Australia",
                    marker=dict(color="lime"),  opacity=0.5, mode = "markers+lines")

trace_h = go.Scatter(x = d['Hungary'].index ,y = d['Hungary']['Total_Medals'],name="Hungary",
                    marker=dict(color="green"),  opacity=0.5, mode = "markers+lines")



fig = tools.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('World', '~Europe'))

fig.append_trace(trace_Ru, 1, 1)
fig.append_trace(trace_jam, 1, 1)
fig.append_trace(trace_USA, 1, 1)
fig.append_trace(trace_Pak, 1, 1)
fig.append_trace(trace_aud, 1, 1)

fig.append_trace(trace_Ger, 2, 1)
fig.append_trace(trace_Nor, 2, 1)
fig.append_trace(trace_Swe, 2, 1)
fig.append_trace(trace_ned, 2, 1)
fig.append_trace(trace_h, 2, 1)

fig['layout']['xaxis'].update(title='Year')
fig['layout']['xaxis2'].update(title='Year')

fig['layout']['yaxis'].update(title='Number of Medals won',tickfont=dict(size=8))
fig['layout']['yaxis2'].update(title='Number of Medals won',tickfont=dict(size=8))


#fig['layout']['yaxis1'].update(title='Sport', tickangle = 345, tickfont=dict(size=8))

fig['layout'].update(height=600, width=1000, title = ' Top 10 countries; Total Medals won by time')


iplot(fig, filename='Number of Medals won')

In [21]:
init_notebook_mode(connected=True)
trace_Ru = go.Scatter(x = d['Russia'].index ,y = d['Russia']['%medalists'],name="Russia",
                      marker=dict(color="Red"),  opacity=0.5, mode = "markers+lines")

trace_USA = go.Scatter(x = d['USA'].index ,y = d['USA']['%medalists'],name="USA",
                    marker=dict(color="Blue"),  opacity=0.5, mode = "markers+lines")

trace_Ger = go.Scatter(x = d['Germany'].index ,y = d['Germany']['%medalists'],name="Germany",
                    marker=dict(color="black"),  opacity=0.5, mode = "markers+lines")

trace_Pak = go.Scatter(x = d['Pakistan'].index ,y = d['Pakistan']['%medalists'],name="Pakistan",
                    marker=dict(color="Orange"),  opacity=0.6, mode = "markers+lines")

trace_Nor = go.Scatter(x = d['Norway'].index ,y = d['Norway']['%medalists'],name="Norway",
                    marker=dict(color="cyan"),  opacity=0.5, mode = "markers+lines")

trace_Swe = go.Scatter(x = d['Sweden'].index ,y = d['Sweden']['%medalists'],name="Sweden",
                      marker=dict(color="darkred"),  opacity=0.5, mode = "markers+lines")

trace_jam = go.Scatter(x = d['Jamaica'].index ,y = d['Jamaica']['%medalists'],name="Jamaica",
                    marker=dict(color="purple"),  opacity=0.5, mode = "markers+lines")

trace_ned = go.Scatter(x = d['Netherlands'].index ,y = d['Netherlands']['%medalists'],name="Netherlands",
                    marker=dict(color="deeppink"),  opacity=0.5, mode = "markers+lines")

trace_aud = go.Scatter(x = d['Australia'].index ,y = d['Australia']['%medalists'],name="Australia",
                    marker=dict(color="lime"),  opacity=0.5, mode = "markers+lines")

trace_h = go.Scatter(x = d['Hungary'].index ,y = d['Hungary']['%medalists'],name="Hungary",
                    marker=dict(color="green"),  opacity=0.5, mode = "markers+lines")



fig = tools.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('World', '~Europe'))

fig.append_trace(trace_Ru, 1, 1)
fig.append_trace(trace_jam, 1, 1)
fig.append_trace(trace_USA, 1, 1)
fig.append_trace(trace_Pak, 1, 1)
fig.append_trace(trace_aud, 1, 1)

fig.append_trace(trace_Ger, 2, 1)
fig.append_trace(trace_Nor, 2, 1)
fig.append_trace(trace_Swe, 2, 1)
fig.append_trace(trace_ned, 2, 1)
fig.append_trace(trace_h, 2, 1)

fig['layout']['xaxis'].update(title='Year')
fig['layout']['xaxis2'].update(title='Year')

fig['layout']['yaxis'].update(title='% Medals won out of all attempts',tickfont=dict(size=8))
fig['layout']['yaxis2'].update(title='% Medals won out of all attempts',tickfont=dict(size=8))


#fig['layout']['yaxis1'].update(title='Sport', tickangle = 345, tickfont=dict(size=8))

fig['layout'].update(height=600, width=1000, title = ' Top 10 countries; % Medals won by time')


iplot(fig, filename='% Medals won out of all attempts')

   
### Who is a true winner? and a jumble of Olympic history

__Observe:__
- Since approx. 1994 there is a 'zig-zag' pattern in the data corresponding to the winter and summer Olympics being out of phase (by 2 years). From this pattern, it can be seen that some countries excel at the Summer Games (e.g. Jamaica, Pakistan), but have a very limited presence at the Winter Games and vice versa. 
- Using the %medalists metric Jamaica, Pakistan and Australia emerge as 'alternative' winners; in absolute terms they bring back home few medals, but the proportion of all athletes that come back winners is noteworthy.
- Contrary to expectation, there are no clear trends in the '%medalists' data, however some interesting spikes occur, which help "bump up" the countries overall average. Most of the time, many of the 'top 10' countries have %medals values lower than the overall average. For instance, Australia had a very good year in 1900 when all its athletes won a medal (100%), ever since the %medals ranged between 26 and 3 %. The Netherlands, Norway and Sweden all followed a similar trajectory with spikes and stronger results in the remote past. Although the Netherlands and Norway had a strong come back in the summer Olympics of 2000, and Sweden in the winter Olympics of 2006. 
-  For Germany and Hungary the %medalist rate remains high throughout time with a few spikes scattered across history. In absolute terms this countries also rank highly. Germany's total medal count rise coincides with a period of economic recovery after WWII. Although this is rivalled by a peak in 1936 when the then Natzi Germany hosted the Berlin Olympics. Hungary is a much smaller country, therefore its absolute medal count is not as large as Germany's. However, along many stretches of history it outperforms, both in absolute and % terms, more prosperous European countries such as Norway, the Netherlands and Sweden. 
- Remarkably, the USA % of athletes which come back home with a medal remains steady thought history. While the absolute medal count is more jittery. The drop in USA's medal count in 1980 was due to the country's boycott of those Olympic Games (hosted in Russia). The peak it reached in 1984 is (only rivalled by the one in 1904), generally sets the tone for USA's consistent performance in the years after. 
- Jamaica had an excellent year in 1952, when 50% of its athletes came back home with a medal. At that time the country was still a British Colony. Since Jamaica gained independence in 1962, its absolute medal count and %medalists rate remained low until the late 1990s when its score begun to rise again. 
- Other counties also have peaks in times which coincide with more authoritarian/ propaganda oriented political rule: 
    - Russias grandeur in absolute medal count and %medalists falls in the Cold War period and coincides with the existence of a communistic USSR. Its results peak in 1980 mainly due to USA's boycott of the Olympics event held in Moscow and consequentially reduced competition.  
    - Pakistan was never strong in absolute terms, but used to be very strong in the %medalists view; with high values between 1960 and 1988, reaching a peak of almost 87% in 1968. In the 1992 Olympics the country had 57% medalists amongst all its athletes, ever since it did not win any medals despite having participated in most of the Games. It is interesting to note that Pakistans rise in %Olympian excellence coincides with the rise of Ayub Khan, who became Pakistans president through a coup (1958-1969) and who's rule rises controversy until today. Additionally, between 1965 and 1971 Pakistan and India are at war over Kashmir [6].  
    - While Germany remains a strong contestant regardless of the political backdrop, this not the case with Hungary. It should be noted that Hungary's major peaks in both absolute an relative terms also fall in times when the country was politically aligned with Natzi Germany, and later under a communist regime. Sadly, the democratic Hungary of today is not as strong a contestant as it was in the past. 


## Conclusion

As can be seen form the above summary, many interesting questions about the Olympic dataset have already been asked and answered. It is my impression that topics such as: Olympic History, Woman at the Olympics, Olympic Disciplines and their relationship with body metrics, Geographic Participation as well as Top Countries (by medals) have been well analysed. I have also encountered some country-specific analysis about USA, India, Italy. While conducting this 'ground work' I became interested in finding out more about how Europe faired in the Olympics; which countries excelled at which sports? and in which years? This questions and more are tackled in the following notebooks. Additionally, as far as Medals go, I realise it is natural to champion and study the winners. That probably why I have not come across any mention of the loosing countries. How many countries have never own an Olympic medal? Which countries are those? This questions are briefly explored in the section below.  



### Aside: not so Olympic Winners

In [22]:
print('!!! Out of {} participating countries, {}  never won an Olympic Medal !!!'.\
      format(clean.Country.nunique(), medals_piv.Total_Medals.value_counts()[0]))

!!! Out of 209 participating countries, 72  never won an Olympic Medal !!!


Yes, approx. 35% of all participating Countries never won a medal! The map below shows this countries as well as the number of times they have competed. Unsurprisingly, these are some of the poorest and/ or smallest countries in the world. Most are in Africa, some in South-East Asia and some in the Caribbean, these island-countries are so small they cannot be seen on the map below. One explanation for this state of affairs is that countries with little resources cannot train their athletes to a sufficient standard, while small countries may simply lack the population volume to produce athletic talent. Albeit, the example of Jamaica goes against this generalisation.  

In [23]:
NoMedal = medals_piv.tail(72)
NoMedal.reset_index(inplace=True)

trace = go.Choropleth(locations = NoMedal.Country,
                      locationmode='country names', 
                      z = NoMedal.Total_Athletes,
                      colorscale = 'Blues',                      
                      reversescale = True,
                      marker = dict( line = dict(color = 'gray',width = 0.5)),
                      colorbar = dict(title = '#Attempts'))

layout = go.Layout(title = 'Countries without an Olympic Medal',  width=1000, height=600,
                   geo = dict(projection = dict( type = 'equirectangular')),
                   hovermode='closest')

fig = dict(data = [trace], layout = layout)
iplot(fig,filename='noMedal_Choropleth')

## Sources : 

- [1] Randi H Griffin *Olympic history data: thorough analysis*:https://www.kaggle.com/heesoo37/olympic-history-data-a-thorough-analysis/report
- [2] Debadri Dutta *Analysing the Olympics (for last 120 yrs.)*: https://www.kaggle.com/duttadebadri/analysing-the-olympics-for-last-120-yrs
- [3] Arunsankar Kumarakurubaran *Key Insights from Olympic history data*: https://www.kaggle.com/arunsankar/key-insights-from-olympic-history-data
- [4] Marco Giuseppe de Pinto *Let's discover more about the Olympic Games!*: https://www.kaggle.com/marcogdepinto/let-s-discover-more-about-the-olympic-games
- [5] Dheerendra Singh Tomar *Progress of India in the history of Olympic games*: https://www.kaggle.com/tomardheerendra/progress-of-india-in-the-history-of-olympic-games

- [6] *Pakistan Timeline and History Overview*: https://www.ducksters.com/geography/country/pakistan_history_timeline.php

- [7] Lucky source 7 is me, I have calculated/ observed some facts to supplement the summary. 

- [8] Elizabeth Borja *The Year Aeronautics Was an Olympic Event*: https://airandspace.si.edu/stories/editorial/year-aeronautics-was-olympic-event

- [9] Wikipedia *Basque pelota*: https://en.wikipedia.org/wiki/Basque_pelota_at_the_1900_Summer_Olympics

- [10] Wikipedia *Jeu de paume*: https://en.wikipedia.org/wiki/Jeu_de_paume_at_the_1908_Summer_Olympics

- [11] Wikipedia *Squash*:https://en.wikipedia.org/wiki/Squash_(sport)

- [12] Wikipedia *Water motorsports at the 1908 Summer Olympics*: https://en.wikipedia.org/wiki/Water_motorsports_at_the_1908_Summer_Olympics

- [13] Wikipedia *Trampolining*: https://en.wikipedia.org/wiki/Trampolining
- [14] Olympics official website: https://www.olympic.org/skeleton
- [15] https://www.topendsports.com/events/discontinued/alpinism.htm