First, Python packages need to be imported for this project.

In [1]:
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline
import pandas as pd

Next, within the dataset chronicling all medals from all Olympic Games, those pertaining to 1960 through 2022 need to be selected.



In [2]:
# Initial data dump
col_list = ['discipline_title', 'slug_game', 'event_title', 
            'medal_type', 'country_name', 'country_3_letter_code']
medal_table = pd.read_csv('archive/olympic_medals.csv', usecols = col_list)

# Extract columns for hosting city and year from "slug_game" column
medal_table['City'] = medal_table['slug_game'].str[:-4].str.replace('-', ' ').str.strip().str.title()
medal_table['Year'] = medal_table['slug_game'].str[-4:]

# Format event titles and medal types
medal_table['event_title'] = medal_table['event_title'].str.title()
medal_table['medal_type'] = medal_table['medal_type'].str.lower().str.title()

# Clean medal table with organized data
new_col_list = ['Year', 'City', 'discipline_title', 'event_title', 
                'medal_type', 'country_name', 'country_3_letter_code']
new_medal_table = medal_table[new_col_list]

# Rename cleaned medal columns
new_medal_table = new_medal_table.rename(columns = {
    'discipline_title': 'Sport',
    'event_title': 'Event',
    'medal_type': 'Medal',
    'country_name': 'Country Name',
    'country_3_letter_code': 'Country Code'
})

In [3]:
new_medal_table

Unnamed: 0,Year,City,Sport,Event,Medal,Country Name,Country Code
0,2022,Beijing,Curling,Mixed Doubles,Gold,Italy,ITA
1,2022,Beijing,Curling,Mixed Doubles,Gold,Italy,ITA
2,2022,Beijing,Curling,Mixed Doubles,Silver,Norway,NOR
3,2022,Beijing,Curling,Mixed Doubles,Silver,Norway,NOR
4,2022,Beijing,Curling,Mixed Doubles,Bronze,Sweden,SWE
...,...,...,...,...,...,...,...
21692,1896,Athens,Weightlifting,Heavyweight - One Hand Lift Men,Silver,Denmark,DEN
21693,1896,Athens,Weightlifting,Heavyweight - One Hand Lift Men,Bronze,Greece,GRE
21694,1896,Athens,Weightlifting,Heavyweight - Two Hand Lift Men,Gold,Denmark,DEN
21695,1896,Athens,Weightlifting,Heavyweight - Two Hand Lift Men,Silver,Great Britain,GBR


The table is now compact and readable, but now needs only the rows that pertain to the Winter Olympics games from 1960 to 2022.

1964 &emsp; Innsbruck, Austria

1968 &emsp; Grenoble, France

1972 &emsp; Sapporo, Japan

1976 &emsp; Innsbruck, Austria

1980 &emsp; Lake Placid, New York, USA

1984 &emsp; Sarajevo, Bosnia and Herzegovina (Yugoslavia)

1988 &emsp; Calgary, Alberta, Canada

1992 &emsp; Albertville, France

1994 &emsp; Lillehamer, Norway

1998 &emsp; Nagano, Japan

2002 &emsp;Salt Lake City, Utah, USA

2006 &emsp; Turin, Italy

2010 &emsp; Vancouver, British Columbia, Canada

2014 &emsp; Sochi, Russia

2018 &emsp; Pyeongchang, South Korea

2022 &emsp; Beijing, China

In [4]:
# List the years and cities for the Winter Olympic games of interest
winter_years = ['1964', '1968', '1972', '1976','1980', '1984', '1988', '1992', 
                '1994', '1998', '2002', '2006', '2010', '2014', '2018', '2022']
                
winter_cities = ['Innsbruck', 'Grenoble', 'Sapporo', 'Innsbruck', 'Lake Placid',  'Sarajevo', 
                 'Calgary', 'Albertille', 'Lillehamer', 'Nagano',  'Salt Lake City', 'Turin', 
                 'Vancouver', 'Sochi', 'Pyeongchang', 'Beijing']

# Select the rows based on the above set of games
winter_medal_table = new_medal_table.loc[new_medal_table['Year'].isin(winter_years)]
winter_medal_table = winter_medal_table.loc[winter_medal_table['City'].isin(winter_cities)]

In [5]:
winter_medal_table

Unnamed: 0,Year,City,Sport,Event,Medal,Country Name,Country Code
0,2022,Beijing,Curling,Mixed Doubles,Gold,Italy,ITA
1,2022,Beijing,Curling,Mixed Doubles,Gold,Italy,ITA
2,2022,Beijing,Curling,Mixed Doubles,Silver,Norway,NOR
3,2022,Beijing,Curling,Mixed Doubles,Silver,Norway,NOR
4,2022,Beijing,Curling,Mixed Doubles,Bronze,Sweden,SWE
...,...,...,...,...,...,...,...
16012,1964,Innsbruck,Bobsleigh,Two-Man Men,Bronze,Italy,ITA
16013,1964,Innsbruck,Bobsleigh,Two-Man Men,Bronze,Italy,ITA
16014,1964,Innsbruck,Bobsleigh,Four-Man Men,Gold,Canada,CAN
16015,1964,Innsbruck,Bobsleigh,Four-Man Men,Silver,Austria,AUT


Now with the medal count data aggregated, it shall be important to clean the list of countries below in order to simplify it, since it consists of countries no longer in existence (such as the Soviet Union or Yugoslavia) and those that have for not very long (such as Belarus and Ukraine).

In [6]:
winter_medal_table['Country Name'].value_counts()

Norway                                   274
United States of America                 266
Germany                                  242
Canada                                   200
Austria                                  184
Soviet Union                             180
Switzerland                              135
Netherlands                              134
Sweden                                   129
German Democratic Republic (Germany)     129
Russian Federation                       114
France                                   114
Finland                                  112
Italy                                    112
People's Republic of China                78
Republic of Korea                         69
Japan                                     63
Federal Republic of Germany               48
ROC                                       38
Czech Republic                            33
Great Britain                             23
Slovenia                                  21
Poland    

That is why the three-letter country code is critical for comparing data in the Winter Olympic medal count (1960 - 2022) versus those which are currently evaluated by the World Bank for historic GDP.

In [7]:
# Initial data dump (note: data only extends to 2020 and will have to be extrapolated)
WB_country_list = ['Country Name', 'Country Code']
WB_year_list = ['1964', '1968', '1972', '1976','1980', '1984', '1988', '1992', 
                '1994', '1998', '2002', '2006', '2010', '2014', '2018', '2020']
WB_GDP = pd.read_csv(
    'API_NY.GDP.MKTP.CD_DS2_en_csv_v2_3930485/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_3930485.csv', 
    skiprows = 4,
    usecols = (WB_country_list + WB_year_list))

However, first the countries that are listed in the medal table need to have their codes verified in order to ensure that they are properly listed in the World Bank's economic data table.

In [8]:
# Create a list of the countries in the medal table to compare against the World Bank table
countries = winter_medal_table['Country Name'].value_counts().index[::].tolist()
codes = winter_medal_table['Country Code'].value_counts().index[::].tolist()
wrong = []

# Check which codes do not match
for index, code in enumerate(codes, 0):
    if WB_GDP['Country Code'].str.contains(code).any():
        pass
    else:
        wrong.append([countries[index], codes[index]])

In [9]:
wrong

[['Germany', 'GER'],
 ['Soviet Union', 'URS'],
 ['Switzerland', 'SUI'],
 ['Netherlands', 'NED'],
 ['German Democratic Republic (Germany)', 'GDR'],
 ['Federal Republic of Germany', 'FRG'],
 ['ROC', 'ROC'],
 ['Slovenia', 'SLO'],
 ['Czechoslovakia', 'TCH'],
 ['Olympic Athletes from Russia', 'OAR'],
 ['Latvia', 'LAT'],
 ['Croatia', 'CRO'],
 ['Bulgaria', 'BUL'],
 ['Yugoslavia', 'YUG'],
 ['Denmark', 'DEN']]

This issue needs to be nipped in the bud. Therefore, the countries listed above need their codes changed in order to have their GNI per capita data pulled. It is easier to apply these changes to the World Bank table, since each country has its own row.

Also, due to doping scandals among Russian athletes, the Russian Federation (RUS) has been banned from having an official delegation and hence had to compete as "Olympic Athletes from Russia" (OAR) in Pyeongchang 2018 and the "Russian Olympic Committee" in Beijing 2022. Despite this, it is just easier to merge these delegations into that of the Russian Federation.

In [10]:
# Change the country codes in the World Bank table to match that of the Winter Olympic table
for country in wrong:
    WB_GDP.loc[
        WB_GDP['Country Name'] == country[0], 'Country Code'] = country[1]

# Consolidate the entries in the Winter Olympic medal table for the Russian Federation
winter_medal_table['Country Name'].replace({'ROC': 'Russian Federation',
                                        'Olympic Athletes from Russia': 'Russian Federation'},
                                          inplace = True)
winter_medal_table['Country Code'].replace({'ROC': 'RUS', 'OAR': 'RUS'}, inplace = True)

# Change Liechtenstein to Austria to make things simpler
winter_medal_table['Country Name'].replace({'Liechtenstein': 'Austria'}, inplace = True)
winter_medal_table['Country Code'].replace({'LIE': 'AUT'}, inplace = True)

In [11]:
# Limit list only to countries that have scored medals in the Winter Olympics
WB_GDP = WB_GDP.loc[WB_GDP['Country Code'] \
                                          .isin(winter_medal_table['Country Code'])] \
                                          .sort_values(by='Country Code', ascending=True)

In [12]:
WB_GDP

Unnamed: 0,Country Name,Country Code,1964,1968,1972,1976,1980,1984,1988,1992,1994,1998,2002,2006,2010,2014,2018,2020
13,Australia,AUS,23801100000.0,32716990000.0,52051400000.0,105101500000.0,150032300000.0,193593600000.0,236065900000.0,325480300000.0,322807300000.0,399404500000.0,395342700000.0,747556200000.0,1147589000000.0,1467505000000.0,1428530000000.0,1327836000000.0
14,Austria,AUT,9169984000.0,12440630000.0,22059610000.0,42959980000.0,82058910000.0,67985340000.0,133339400000.0,195078100000.0,203535200000.0,218259900000.0,213377800000.0,335998600000.0,391892700000.0,441996100000.0,454945900000.0,433258500000.0
17,Belgium,BEL,15960110000.0,21376350000.0,37209420000.0,71113880000.0,126829300000.0,83349530000.0,162299100000.0,234781700000.0,244884100000.0,258528300000.0,257157800000.0,407918100000.0,480951600000.0,534678100000.0,543008500000.0,521861300000.0
25,Belarus,BLR,,,,,,,,,17793000000.0,15264370000.0,14594250000.0,36954310000.0,57222490000.0,78813840000.0,60031260000.0,60258240000.0
21,Bulgaria,BUL,,,,,19839230000.0,17594940000.0,22555940000.0,10350520000.0,9697417000.0,15030700000.0,16402850000.0,34379810000.0,50682060000.0,57082010000.0,66363420000.0,69889350000.0
35,Canada,CAN,49377520000.0,71829810000.0,113082800000.0,206575600000.0,273853800000.0,355372600000.0,507354400000.0,592387700000.0,578139300000.0,634000000000.0,760649300000.0,1319265000000.0,1617343000000.0,1805750000000.0,1725329000000.0,1645423000000.0
40,China,CHN,59708340000.0,70846540000.0,113687600000.0,153940500000.0,191149200000.0,259946500000.0,312353600000.0,426915700000.0,564324700000.0,1029043000000.0,1470550000000.0,2752132000000.0,6087164000000.0,10475680000000.0,13894820000000.0,14722730000000.0
99,Croatia,CRO,,,,,,,,,,25792880000.0,27074550000.0,50860790000.0,60426020000.0,58330290000.0,62247870000.0,57203780000.0
54,Czech Republic,CZE,,,,,,,,34805010000.0,47850200000.0,66807430000.0,82196000000.0,156264100000.0,209069900000.0,209358800000.0,248950100000.0,245339300000.0
58,Denmark,DEN,,13505570000.0,23232380000.0,44575890000.0,71127530000.0,59105240000.0,115552800000.0,152915700000.0,156162400000.0,176991900000.0,178635200000.0,282884900000.0,321995300000.0,352993600000.0,356841200000.0,356084900000.0


Obviously, due to the birth of new nations as well as the emergence of formerly communist ones (such as Poland and Romania) in the world scene result in a lack of GDP data for up until the early 1990s. Even data for Winter Olympic powerhouses such as Slovenia is missing up until the late 1900s, as well as some countries have missing data altogether (such North Korea and Liechtenstein at the present).

In order to conduct a more focused analysis of more recent Winter Olympic trends, it would be far simpler and more accurate to set the beginning of the study to 1998 in order to accommodate Slovenia as well as eliminate Liechtenstein (which is culturally similar to Austria and Switzerland) and North Korea (which is just a poor version of South Korea) entirely. Also, 1998 was the first year snowboarding was introduced into the Winter Olympics as an official sport with medals awarded.

In [13]:
# Trimmed down year list
year_list = ['1998', '2002', '2006', '2010', '2014', '2018', '2020']

# Limit list only to countries with reliable data as explained above
GDP = WB_GDP[WB_country_list + year_list].loc[
    (WB_GDP['Country Code'] != 'LIE') &
    (WB_GDP['Country Code'] != 'PRK')]

In [14]:
GDP

Unnamed: 0,Country Name,Country Code,1998,2002,2006,2010,2014,2018,2020
13,Australia,AUS,399404500000.0,395342700000.0,747556200000.0,1147589000000.0,1467505000000.0,1428530000000.0,1327836000000.0
14,Austria,AUT,218259900000.0,213377800000.0,335998600000.0,391892700000.0,441996100000.0,454945900000.0,433258500000.0
17,Belgium,BEL,258528300000.0,257157800000.0,407918100000.0,480951600000.0,534678100000.0,543008500000.0,521861300000.0
25,Belarus,BLR,15264370000.0,14594250000.0,36954310000.0,57222490000.0,78813840000.0,60031260000.0,60258240000.0
21,Bulgaria,BUL,15030700000.0,16402850000.0,34379810000.0,50682060000.0,57082010000.0,66363420000.0,69889350000.0
35,Canada,CAN,634000000000.0,760649300000.0,1319265000000.0,1617343000000.0,1805750000000.0,1725329000000.0,1645423000000.0
40,China,CHN,1029043000000.0,1470550000000.0,2752132000000.0,6087164000000.0,10475680000000.0,13894820000000.0,14722730000000.0
99,Croatia,CRO,25792880000.0,27074550000.0,50860790000.0,60426020000.0,58330290000.0,62247870000.0,57203780000.0
54,Czech Republic,CZE,66807430000.0,82196000000.0,156264100000.0,209069900000.0,209358800000.0,248950100000.0,245339300000.0
58,Denmark,DEN,176991900000.0,178635200000.0,282884900000.0,321995300000.0,352993600000.0,356841200000.0,356084900000.0


Now given this streamlined table, the final step shall be to extrapolate the GDP of the respective countries from 1998 to 2022 using the InterpolatedUnivariateSpline feature of the SciPy package. Use a linear curve to account for the fact that these mostly developed countries have a slower economic growth rate compared to that of developing countries.

In [15]:
# Initialize a new column as a list
new_col = []

# Loop through each country to extrapolate to the year 2022
for index, country in enumerate(GDP['Country Code'], 0):
    # Given values
    xi = np.array([int(x) for x in year_list])
    yi = np.array(GDP.loc[GDP['Country Code'] == country][year_list])
    # Positions to inter/extrapolate
    x = np.linspace(1998, 2022)
    # spline order: 1 linear, 2 quadratic, 3 cubic ... 
    order = 1
    # do inter/extrapolation
    s = InterpolatedUnivariateSpline(xi, yi, k = order)
    y = s(x)
    new_col.append(y[(len(y)) - 1])

# Extract columns for hosting city and year from "slug_game" column
GDP['2022'] = new_col

# Finally, drop the column for the year 2020
GDP = GDP.drop(columns = '2020', axis = 1)

In [16]:
GDP

Unnamed: 0,Country Name,Country Code,1998,2002,2006,2010,2014,2018,2022
13,Australia,AUS,399404500000.0,395342700000.0,747556200000.0,1147589000000.0,1467505000000.0,1428530000000.0,1227143000000.0
14,Austria,AUT,218259900000.0,213377800000.0,335998600000.0,391892700000.0,441996100000.0,454945900000.0,411571100000.0
17,Belgium,BEL,258528300000.0,257157800000.0,407918100000.0,480951600000.0,534678100000.0,543008500000.0,500714100000.0
25,Belarus,BLR,15264370000.0,14594250000.0,36954310000.0,57222490000.0,78813840000.0,60031260000.0,60485220000.0
21,Bulgaria,BUL,15030700000.0,16402850000.0,34379810000.0,50682060000.0,57082010000.0,66363420000.0,73415270000.0
35,Canada,CAN,634000000000.0,760649300000.0,1319265000000.0,1617343000000.0,1805750000000.0,1725329000000.0,1565518000000.0
40,China,CHN,1029043000000.0,1470550000000.0,2752132000000.0,6087164000000.0,10475680000000.0,13894820000000.0,15550640000000.0
99,Croatia,CRO,25792880000.0,27074550000.0,50860790000.0,60426020000.0,58330290000.0,62247870000.0,52159690000.0
54,Czech Republic,CZE,66807430000.0,82196000000.0,156264100000.0,209069900000.0,209358800000.0,248950100000.0,241728500000.0
58,Denmark,DEN,176991900000.0,178635200000.0,282884900000.0,321995300000.0,352993600000.0,356841200000.0,355328500000.0


Correspondingly, trim down the table of Winter Olympic Medals to that of the smaller range of years being studied (basically, the 21st century).

In [17]:
# Trimmed down year list now with 2022 properly at the end
new_year_list = ['1998', '2002', '2006', '2010', '2014', '2018', '2022']

# Select the rows based on the above set of games
winter_medal_table = winter_medal_table.loc[winter_medal_table['Year'].isin(new_year_list)]

In [18]:
winter_medal_table

Unnamed: 0,Year,City,Sport,Event,Medal,Country Name,Country Code
0,2022,Beijing,Curling,Mixed Doubles,Gold,Italy,ITA
1,2022,Beijing,Curling,Mixed Doubles,Gold,Italy,ITA
2,2022,Beijing,Curling,Mixed Doubles,Silver,Norway,NOR
3,2022,Beijing,Curling,Mixed Doubles,Silver,Norway,NOR
4,2022,Beijing,Curling,Mixed Doubles,Bronze,Sweden,SWE
...,...,...,...,...,...,...,...
8400,1998,Nagano,Short Track Speed Skating,500M Women,Silver,People's Republic of China,CHN
8401,1998,Nagano,Short Track Speed Skating,500M Women,Bronze,Republic of Korea,KOR
8402,1998,Nagano,Short Track Speed Skating,1000M Women,Gold,Republic of Korea,KOR
8403,1998,Nagano,Short Track Speed Skating,1000M Women,Silver,People's Republic of China,CHN


As a result, the medal count for each country has been greatly trimmed down to eliminate defunct and redundant countries.

In [19]:
winter_medal_table['Country Name'].value_counts()

Germany                       232
United States of America      204
Norway                        202
Canada                        177
Russian Federation            170
Austria                       128
Netherlands                    97
Sweden                         90
France                         88
Switzerland                    86
Italy                          80
People's Republic of China     78
Republic of Korea              69
Japan                          57
Finland                        56
Czech Republic                 33
Slovenia                       21
Poland                         19
Belarus                        18
Australia                      18
Great Britain                  17
Latvia                         14
Croatia                        11
Slovakia                       10
Estonia                         8
Ukraine                         8
Kazakhstan                      5
New Zealand                     5
Bulgaria                        5
Belgium       

The final level of cleaning needs to be performed in order to pick which names are simpler to present between the two tables

In [20]:
# Recreating the list of the countries from the trimmed Winter Olympics medal table
countries = winter_medal_table['Country Name'].value_counts().index[::].tolist()
codes = winter_medal_table['Country Code'].value_counts().index[::].tolist()
mismatch = []

# Check which codes do not match
for index, code in enumerate(codes, 0):
    if (GDP.loc[GDP['Country Code'] == code, 'Country Name']
        == countries[index]).any():
        pass
    else:
        mismatch.append([countries[index], 
                         GDP.loc[GDP['Country Code'] == code]
                                                ['Country Name']])

In [21]:
mismatch

[['United States of America',
  251    United States
  Name: Country Name, dtype: object],
 ["People's Republic of China",
  40    China
  Name: Country Name, dtype: object],
 ['Republic of Korea',
  126    Korea, Rep.
  Name: Country Name, dtype: object],
 ['Great Britain',
  81    United Kingdom
  Name: Country Name, dtype: object],
 ['Slovakia',
  221    Slovak Republic
  Name: Country Name, dtype: object]]

The simplest name is the best. Therefore, the following shall be implemented on both tables.

In [22]:
# Change country names in the Winter Olympic medal table
winter_medal_table['Country Name'].replace({'United States of America': 'United States'},
                                          inplace = True)
winter_medal_table['Country Name'].replace({"People's Republic of China": 'China'},
                                          inplace = True)
winter_medal_table['Country Name'].replace({'Republic of Korea': 'South Korea'},
                                          inplace = True)
winter_medal_table['Country Name'].replace({'Great Britain': 'United Kingdom'},
                                          inplace = True)


# Change country names in the World Bank table
GDP.loc[
    GDP['Country Name'] == 'Korea, Rep.', 'Country Name'] = 'South Korea'
GDP.loc[
    GDP['Country Name'] == 'Slovak Republic', 'Country Name'] = 'Slovakia'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [23]:
winter_medal_table['Country Name'].value_counts()

Germany               232
United States         204
Norway                202
Canada                177
Russian Federation    170
Austria               128
Netherlands            97
Sweden                 90
France                 88
Switzerland            86
Italy                  80
China                  78
South Korea            69
Japan                  57
Finland                56
Czech Republic         33
Slovenia               21
Poland                 19
Belarus                18
Australia              18
United Kingdom         17
Latvia                 14
Croatia                11
Slovakia               10
Estonia                 8
Ukraine                 8
Kazakhstan              5
New Zealand             5
Bulgaria                5
Belgium                 4
Hungary                 4
Spain                   3
Denmark                 1
Name: Country Name, dtype: int64

Finally, trim down the list of countries that have won medals during the years 2002 to 2022.

In [24]:
# Limit list only to countries that have scored medals in the Winter Olympics
GDP = GDP.loc[GDP['Country Code'] \
              .isin(winter_medal_table['Country Code'])] \
              .sort_values(by='Country Code', ascending = True)

# Set index to country name
GDP.index = GDP['Country Name']
GDP = GDP.drop(columns = ['Country Name', 'Country Code'])

In [25]:
GDP

Unnamed: 0_level_0,1998,2002,2006,2010,2014,2018,2022
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Australia,399404500000.0,395342700000.0,747556200000.0,1147589000000.0,1467505000000.0,1428530000000.0,1227143000000.0
Austria,218259900000.0,213377800000.0,335998600000.0,391892700000.0,441996100000.0,454945900000.0,411571100000.0
Belgium,258528300000.0,257157800000.0,407918100000.0,480951600000.0,534678100000.0,543008500000.0,500714100000.0
Belarus,15264370000.0,14594250000.0,36954310000.0,57222490000.0,78813840000.0,60031260000.0,60485220000.0
Bulgaria,15030700000.0,16402850000.0,34379810000.0,50682060000.0,57082010000.0,66363420000.0,73415270000.0
Canada,634000000000.0,760649300000.0,1319265000000.0,1617343000000.0,1805750000000.0,1725329000000.0,1565518000000.0
China,1029043000000.0,1470550000000.0,2752132000000.0,6087164000000.0,10475680000000.0,13894820000000.0,15550640000000.0
Croatia,25792880000.0,27074550000.0,50860790000.0,60426020000.0,58330290000.0,62247870000.0,52159690000.0
Czech Republic,66807430000.0,82196000000.0,156264100000.0,209069900000.0,209358800000.0,248950100000.0,241728500000.0
Denmark,176991900000.0,178635200000.0,282884900000.0,321995300000.0,352993600000.0,356841200000.0,355328500000.0


Since the time frame is rather short and the countries listed are relatively political stable (ignoring the present situation in Eastern Europe), there is no need to factor in inflation for the GDP figures.

In [26]:
# Initial data dump (note: data only extends to 2020 and will have to be extrapolated)
pop_country_list = ['Country Name', 'Country Code']
pop_year_list = ['1998', '2002', '2006', '2010', '2014', '2018', '2020']
population = pd.read_csv(
    'API_SP.POP.TOTL_DS2_en_csv_v2_3918567/API_SP.POP.TOTL_DS2_en_csv_v2_3918567.csv', 
    skiprows = 4,
    usecols = (pop_country_list + pop_year_list))

# Change country names in the World Bank table
population.loc[
    population['Country Name'] == 'Korea, Rep.', 'Country Name'] = 'South Korea'
population.loc[
    population['Country Name'] == 'Slovak Republic', 'Country Name'] = 'Slovakia'

# Change the country codes in the World Bank table to match that of the Winter Olympic table
for country in wrong:
    population.loc[
        population['Country Name'] == country[0], 'Country Code'] = country[1]

# Limit list only to countries that have scored medals in the Winter Olympics
population = population.loc[population['Country Code'] \
                                          .isin(winter_medal_table['Country Code'])] \
                                          .sort_values(by='Country Code', ascending = True)

# Initialize a new column as a list
new_col = []

# Loop through each country to extrapolate to the year 2022
for index, country in enumerate(population['Country Code'], 0):
    # Given values
    xi = np.array([int(x) for x in year_list])
    yi = np.array(population.loc[population['Country Code'] == country][year_list])
    # Positions to inter/extrapolate
    x = np.linspace(1998, 2022)
    # spline order: 1 linear, 2 quadratic, 3 cubic ... 
    order = 1
    # do inter/extrapolation
    s = InterpolatedUnivariateSpline(xi, yi, k = order)
    y = s(x)
    new_col.append(y[(len(y)) - 1])

# Extract columns for hosting city and year from "slug_game" column
population['2022'] = new_col

# Finally, drop the column for the year 2020
population = population.drop(columns = '2020', axis = 1)

# Set index to country name
population.index = population['Country Name']
population = population.drop(columns = ['Country Name', 'Country Code'])

In [27]:
population.style.format('{:,.0f}')

Unnamed: 0_level_0,1998,2002,2006,2010,2014,2018,2022
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Australia,18711000,19651400,20697900,22031750,23475686,24982688,26391394
Austria,7976789,8081957,8268641,8363404,8546356,8840521,8993889
Belgium,10203008,10332785,10547958,10895586,11209057,11427054,11684940
Belarus,10071963,9865548,9604924,9483836,9448515,9438785,9321119
Bulgaria,8256786,7837161,7601022,7395599,7223938,7025037,6842993
Canada,30155173,31360079,32571174,34004889,35437435,37065178,38945298
China,1241935000,1280400000,1311020000,1337705000,1371860000,1402760000,1419098724
Croatia,4532135,4302174,4311159,4295427,4238389,4087843,4006557
Czech Republic,10294373,10196916,10238905,10474410,10525347,10629928,10767864
Denmark,5304219,5375931,5437272,5547683,5643475,5793636,5869172


After the GDP and population have been tabulated, the next step is to calculate the GDP per capita.

In [28]:
GDP_per_capita = (GDP / population).round(decimals = 2)

In [29]:
GDP_per_capita.style.format("{:,.0f}")

Unnamed: 0_level_0,1998,2002,2006,2010,2014,2018,2022
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Australia,21346,20118,36117,52088,62512,57181,46498
Austria,27362,26402,40635,46858,51718,51461,45761
Belgium,25338,24888,38673,44142,47701,47520,42851
Belarus,1516,1479,3847,6034,8341,6360,6489
Bulgaria,1820,2093,4523,6853,7902,9447,10729
Canada,21025,24255,40504,47562,50956,46549,40198
China,829,1149,2099,4550,7636,9905,10958
Croatia,5691,6293,11797,14068,13762,15228,13019
Czech Republic,6490,8061,15262,19960,19891,23420,22449
Denmark,33368,33229,52027,58041,62549,61592,60542


First, general medal count data shall be tallied. The actual weighting of the medal does not matter, but only the overall quantity of them regardless of whether they were gold, silver, or bronze.

In [30]:
medal_count = population.copy()

In [31]:
for col in medal_count.columns:
    medal_count[col].values[:] = 0

In [33]:
olympiads = new_year_list

for index, year in enumerate(olympiads, 0):
    # Find medals in each year
    medals = winter_medal_table[winter_medal_table['Year'].isin([year])]
    # Count values
    medals = medals['Country Name'].value_counts()
    # Create dataframe
    medals = medals.to_frame()
    # Extract values and index
    medals_values = medals['Country Name']
    medals_index = medals.index
    # Dump values into the table
    medal_count.loc[medals_index, year] = medals_values
    # Convert to int
    medal_count[year] = medal_count[year].astype('int')

In [34]:
medal_count

Unnamed: 0_level_0,1998,2002,2006,2010,2014,2018,2022
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Australia,1,2,2,3,3,3,4
Austria,17,17,24,17,18,16,19
Belgium,1,0,0,0,0,1,2
Belarus,2,1,1,3,6,3,2
Bulgaria,1,3,1,0,0,0,0
Canada,16,18,26,29,27,34,27
China,8,9,13,13,9,10,16
Croatia,0,4,3,3,1,0,0
Czech Republic,3,3,4,6,8,7,2
Denmark,1,0,0,0,0,0,0


Next, the medals shall be differentiated by event.

In [None]:
winter_medal_table['Sport'].value_counts()

Clean the data to merge "Short Track" with "Short Track Speed Skating".

In [None]:
# Change sport names in the Winter Olympic medal table
winter_medal_table['Sport'].replace({'Short Track': 'Short Track Speed Skating'},
                                          inplace = True)
winter_medal_table['Sport'].replace({'Speed skating': 'Long Track Speed Skating'},
                                          inplace = True)

In [None]:
winter_medal_table['Sport'].value_counts()

In [None]:
# Select the rows based on the above set of games
winter_medal_table.loc[winter_medal_table['Sport'].isin(['Cross Country Skiing'])
                      & winter_medal_table['Year'].isin(['2002'])]['Country Name'].value_counts()