In [1]:
import pandas as pd
import requests
import time
import re

## Get information on years, seasons and host cities/countries


In [17]:
# Download the page that lists every Olympic Games.
# The timeout stops a stalled connection from hanging the kernel, and
# raise_for_status() aborts the cell on an HTTP error instead of letting
# read_html try to parse an error page.
response = requests.get(
    "https://blog.ticketcity.com/olympic-games-full-list-by-city-year/",
    timeout=30,
)
response.raise_for_status()
tables = pd.read_html(response.content)

# sort by year (column 0 of the first table found on the page)
date_and_location = tables[0].sort_values(0)

# remove 1906 olympics which are not officially recognized
date_and_location = date_and_location[date_and_location[1] != 'Olympic Games']

# drop future olympics (the last six rows)
# NOTE(review): this slice also drops the first six rows; the displayed result
# starts at 1924, so presumably those are the earlier games -- confirm that the
# hard-coded offsets still match the source table.
date_and_location = date_and_location[6:-6]

In [18]:
# replace the positional column labels with descriptive names
column_names = ['year', 'season', 'host_city', 'host_country']
date_and_location.columns = column_names

In [19]:
# Throw away the row labels left over from sorting/filtering and renumber from 0;
# a later cell addresses a row by its integer label.
date_and_location = date_and_location.reset_index(
    drop=True,
    inplace=False,
)

In [20]:
# The games postponed to 2021 because of COVID are recorded as 2020;
# overwriting the cell also gets rid of the asterisk.
# NOTE(review): row label 45 is hard-coded -- verify it still points at those
# games whenever the source table changes.
date_and_location.at[45, 'year'] = 2020

date_and_location

Unnamed: 0,year,season,host_city,host_country
0,1924,Winter Olympics,Chamonix,France
1,1924,Summer Olympics,Paris,France
2,1928,Winter Olympics,St. Moritz,Switzerland
3,1928,Summer Olympics,Amsterdam,Netherlands
4,1932,Winter Olympics,Lake Placid,USA
5,1932,Summer Olympics,Los Angeles,USA
6,1936,Winter Olympics,Garmisch-Partenkirchen,Germany
7,1936,Summer Olympics,Berlin,Germany
8,1948,Summer Olympics,London,England
9,1948,Winter Olympics,St. Moritz,Switzerland


In [21]:
# cast both columns in one pass: season becomes str, year becomes int
date_and_location = date_and_location.astype({'season': str, 'year': int})

In [22]:
# keep only the first word of the season label (e.g. 'Winter Olympics' -> 'Winter')
date_and_location['season'] = date_and_location['season'].map(
    lambda season_label: season_label.split()[0]
)

## Define a function that gets an Olympics medal table from Wikipedia and adds the year, season and host country

In [25]:
def oly_table_getter(input_url: str, current_year: int, current_season: str, host_country: str) -> pd.DataFrame:
    """
    Scrape the medal table for one Olympics and tag it with that games' metadata.

    Args:
        input_url: address of the page holding the medal table
        current_year: year of the games
        current_season: season of the games (Summer/Winter)
        host_country: country hosting the games

    Returns:
        The first table on the page that has at least 10 rows, with 'year',
        'season' and 'host_country' columns appended and the trailing totals
        row removed by remove_totals.

    Raises:
        IndexError: if no table on the page has at least 10 rows.
    """
    # small tables on the page are skipped; the first table with 10+ rows is used
    candidate_tables = [table for table in pd.read_html(input_url) if table.shape[0] >= 10]
    if not candidate_tables:
        # same exception type as indexing an empty list, but with a message
        # that says which page was the problem
        raise IndexError(f"no table with at least 10 rows found at {input_url}")

    current_data = candidate_tables[0]
    current_data['year'] = current_year
    current_data['season'] = current_season
    current_data['host_country'] = host_country

    # pause after each fetch, presumably to throttle successive requests
    time.sleep(1)

    return remove_totals(current_data)

## Helper function to remove the totals row; we can aggregate it ourselves later if needed

In [26]:
def remove_totals(oly_df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the trailing totals row from a medal table.

    Args:
        oly_df: medal table whose last row is expected to be the totals row,
            i.e. the first cell of that row contains the text 'Total'.

    Returns:
        An independent copy of oly_df without its last row.

    Raises:
        AssertionError: if oly_df is empty, or if the first cell of its last
            row does not contain 'Total'.
    """
    # Raised explicitly (instead of via `assert`) so the check is not stripped
    # under `python -O`, while keeping the exception type unchanged.
    if oly_df.empty:
        raise AssertionError("cannot remove the totals row from an empty table")

    # check whether last row is the totals information; str() keeps a
    # non-string cell (e.g. a numeric rank) from raising TypeError on `in`
    last_row_label = str(oly_df.iloc[-1, 0])
    if 'Total' not in last_row_label:
        raise AssertionError(
            f"expected the last row to be the totals row, found {last_row_label!r}"
        )

    # drop that row, saving all rows but the last; copy so that callers can
    # modify the result without writing through to (or warning about) oly_df
    return oly_df.iloc[:-1, :].copy()

In [27]:
# holds one medal-table DataFrame per Olympics once populated
full_olympics: list = []

In [28]:
# Scrape the Wikipedia medal table of every Olympics listed in date_and_location.
# The list is built in a single expression so that re-running this cell replaces
# full_olympics instead of appending a second copy of every table.
full_olympics = [
    oly_table_getter(
        f"https://en.wikipedia.org/wiki/{year}_{season}_Olympics_medal_table",
        year,
        season,
        host,
    )
    for year, season, host in zip(
        date_and_location['year'],
        date_and_location['season'],
        date_and_location['host_country'],
    )
]


In [29]:
#remove 2022
# Filter on the 'year' column that oly_table_getter adds to every table instead
# of slicing off the last element: the slice removed one more Olympics each
# time this cell was re-run.
full_olympics = [
    games for games in full_olympics
    if not games['year'].eq(2022).any()
]

In [30]:
# stack the per-Olympics tables on top of each other (row-wise);
# each table keeps its own row labels
olympic_dataframe = pd.concat(full_olympics, axis=0)

In [31]:
# Take a real copy: a plain assignment only adds a second name for the same
# object, so the in-place edits made to unedited_df in later cells would also
# change olympic_dataframe.
unedited_df = olympic_dataframe.copy()

## Fill missing values in the Nation column

In [32]:
# Where 'Nation' is missing, take the value from the 'NOC' column.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on unedited_df['Nation'] acts on an intermediate Series, which pandas
# deprecates and which leaves the frame untouched under Copy-on-Write.
unedited_df['Nation'] = unedited_df['Nation'].fillna(unedited_df['NOC'])

In [33]:
# Anything still missing in 'Nation' is taken from the last column of the frame.
# Assigned back explicitly because an in-place fillna on unedited_df['Nation']
# acts on an intermediate Series and does not update the frame under
# pandas Copy-on-Write.
unedited_df['Nation'] = unedited_df['Nation'].fillna(unedited_df.iloc[:, -1])

## Lots of data cleaning, mostly for the nations column with different conventions (country codes, etc)

In [34]:
#remove columns created by naming/html errors (the last two columns)
# .copy() makes edited_df an independent frame, so the assignments in later
# cells do not operate on a slice of unedited_df.
edited_df = unedited_df.iloc[:, :-2].copy()

# Every remaining column must be fully populated. Checked per column with
# .any(): the expression `isna().sum().all() == 0` is satisfied as soon as a
# single column has no missing values, so it cannot catch leftover NaNs.
columns_with_nan = edited_df.columns[edited_df.isna().any().to_numpy()].tolist()
assert not columns_with_nan, f"missing values remain in columns: {columns_with_nan}"

In [35]:
# Strip annotations from the Nation column: parenthesised text, bracketed text
# and asterisks.
# The bracket pattern excludes brackets (the previous class excluded
# parentheses), so one match can no longer run from the first '[' to the last
# ']' and swallow the text in between.
edited_df.loc[:, 'Nation'] = (
    edited_df['Nation']
    .str.replace(r'\([^()]*\)', '', regex=True)
    .str.replace(r'\[[^\[\]]*\]', '', regex=True)
    .str.replace('*', '', regex=False)
)

In [36]:
# trim surrounding whitespace, then lower-case and title-case each nation name
edited_df.loc[:, 'Nation'] = (
    edited_df['Nation']
    .str.strip()
    .str.lower()
    .str.title()
)

In [37]:
# renumber the rows 0..n-1, discarding the per-table labels carried over from concat
edited_df = edited_df.reset_index(drop=True)

In [38]:
# Manual correction of a single nation name.
# NOTE(review): row label 1559 is hard-coded and depends on the scrape order --
# verify it still points at the intended row before relying on it.
edited_df.at[1559, "Nation"] = 'Russia'

## Save to csv

In [None]:
# write the cleaned table next to the notebook (the row index is written as the first column)
output_csv = "olympics_cleaned.csv"
edited_df.to_csv(output_csv)