In [605]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [606]:
gdp_df = pd.read_csv('GDP_new.csv')
results_df = pd.read_csv('Results.csv')
hosts_df = pd.read_csv('Hosts.csv')

In [607]:
gdp_df.head()

Unnamed: 0,Country,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,,,,,,,,,...,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3276188000.0,3395794000.0,2610039000.0,3126019000.0,
1,Africa Eastern and Southern,AFE,21125020000.0,21616230000.0,23506280000.0,28048360000.0,25920670000.0,29472100000.0,32014370000.0,33269510000.0,...,985987100000.0,1006526000000.0,927348500000.0,885176400000.0,1021043000000.0,1007196000000.0,1000834000000.0,927593300000.0,1081998000000.0,1169484000000.0
2,Afghanistan,AFG,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,1400000000.0,1673333000.0,...,20564490000.0,20550580000.0,19998140000.0,18019550000.0,18896350000.0,18418860000.0,18904500000.0,20143450000.0,14583140000.0,
3,Africa Western and Central,AFW,10447640000.0,11173210000.0,11990530000.0,12727690000.0,13898110000.0,14929790000.0,15910840000.0,14510580000.0,...,833948100000.0,894322500000.0,768644700000.0,691363400000.0,684898800000.0,767025700000.0,822538400000.0,786460000000.0,844459700000.0,877863300000.0
4,Angola,AGO,,,,,,,,,...,133401600000.0,137244400000.0,87219300000.0,49840490000.0,68972770000.0,77792940000.0,69309110000.0,50241370000.0,65685440000.0,106713600000.0


In [608]:
results_df.head()

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,athlete_url,athlete_full_name,value_unit,value_type
0,Curling,Mixed Doubles,beijing-2022,GameTeam,GOLD,"[('Stefania CONSTANTINI', 'https://olympics.co...",False,1,Italy,IT,ITA,,,,
1,Curling,Mixed Doubles,beijing-2022,GameTeam,SILVER,"[('Kristin SKASLIEN', 'https://olympics.com/en...",False,2,Norway,NO,NOR,,,,
2,Curling,Mixed Doubles,beijing-2022,GameTeam,BRONZE,"[('Almida DE VAL', 'https://olympics.com/en/at...",False,3,Sweden,SE,SWE,,,,
3,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jennifer DODDS', 'https://olympics.com/en/a...",False,4,Great Britain,GB,GBR,,,,
4,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Rachel HOMAN', 'https://olympics.com/en/ath...",False,5,Canada,CA,CAN,,,,


In [609]:
hosts_df.head()

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
0,beijing-2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25T08:00:00Z,2018-02-08T23:00:00Z,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21T21:00:00Z,2016-08-05T12:00:00Z,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23T16:00:00Z,2014-02-07T04:00:00Z,Russian Federation,Sochi 2014,Winter,2014


In [610]:
# prep GDP df
olympic_years = [1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1994, 1996, 1998, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020]
gdp_olympic_years_df = gdp_df[['Country', 'Country Code'] + [str(year) for year in olympic_years]]
gdp_olympic_years_df.head()

gdp_clean = pd.melt(gdp_olympic_years_df, id_vars=['Country', 'Country Code'], 
                  var_name='Year', value_name='GDP').dropna(subset=['GDP'])

gdp_clean['Year'] = gdp_clean['Year'].astype(int)
gdp_clean

Unnamed: 0,Country,Country Code,Year,GDP
1,Africa Eastern and Southern,AFE,1960,2.112502e+10
2,Afghanistan,AFG,1960,5.377778e+08
3,Africa Western and Central,AFW,1960,1.044764e+10
13,Australia,AUS,1960,1.860656e+10
14,Austria,AUT,1960,6.592694e+09
...,...,...,...,...
6112,Samoa,WSM,2020,8.688984e+08
6113,Kosovo,XKX,2020,7.717143e+09
6115,South Africa,ZAF,2020,3.376196e+11
6116,Zambia,ZMB,2020,1.811064e+10


In [611]:
# prep hosts df
hosts_clean = hosts_df.drop(columns=['game_end_date', 'game_start_date', 'game_name'])
hosts_clean.head()

Unnamed: 0,game_slug,game_location,game_season,game_year
0,beijing-2022,China,Winter,2022
1,tokyo-2020,Japan,Summer,2020
2,pyeongchang-2018,Republic of Korea,Winter,2018
3,rio-2016,Brazil,Summer,2016
4,sochi-2014,Russian Federation,Winter,2014


In [612]:
# prep results df
# Extract the year from the 'slug_game' column in results_df
results_df['year'] = results_df['slug_game'].str.extract(r'(\d{4})').astype(int)

# Filter the results DataFrame for only the Olympic years
results_olympic_df = results_df[results_df['year'].isin(olympic_years)]

results_olympic_df = results_olympic_df.drop(columns=['discipline_title', 'event_title', 'participant_type', 'athletes', 'rank_equal', 'rank_position', 'country_code', 'athlete_url', 'athlete_full_name', 'value_unit', 'value_type'])
results_clean = results_olympic_df.dropna(subset=['medal_type']).groupby(
    ['slug_game', 'country_name', 'country_3_letter_code', 'year']).agg(
    medal_count=('medal_type', 'size'),
    gold_count=('medal_type', lambda x: (x == 'GOLD').sum()),
    silver_count=('medal_type', lambda x: (x == 'SILVER').sum()),
    bronze_count=('medal_type', lambda x: (x == 'BRONZE').sum())).reset_index()

results_clean

Unnamed: 0,slug_game,country_name,country_3_letter_code,year,medal_count,gold_count,silver_count,bronze_count
0,albertville-1992,Austria,AUT,1992,21,6,7,8
1,albertville-1992,Canada,CAN,1992,7,2,3,2
2,albertville-1992,Czechoslovakia,TCH,1992,3,0,0,3
3,albertville-1992,Democratic People's Republic of Korea,PRK,1992,1,0,0,1
4,albertville-1992,Finland,FIN,1992,7,3,1,3
...,...,...,...,...,...,...,...,...
1323,vancouver-2010,Slovakia,SVK,2010,3,1,1,1
1324,vancouver-2010,Slovenia,SLO,2010,3,0,2,1
1325,vancouver-2010,Sweden,SWE,2010,11,5,2,4
1326,vancouver-2010,Switzerland,SUI,2010,9,6,0,3


In [613]:
# merge results and hosts

merged_games = results_clean.merge(hosts_clean, left_on=['slug_game'], right_on=['game_slug']).drop(columns=['game_slug', 'game_year'])

In [614]:
# Check for mismatches between 'country_name' and 'Country'
merged_games_countries = set(merged_games['country_name'].drop_duplicates())
gdp_clean_countries = set(gdp_clean['Country'].drop_duplicates())

# Find codes that are in merged_games but not in gdp_clean
missing_in_gdp_clean = merged_games_countries - gdp_clean_countries
print("Countries in merged_games but not in gdp_clean:", missing_in_gdp_clean)

# Find codes that are in gdp_clean but not in merged_games
missing_in_merged_games = gdp_clean_countries - merged_games_countries
print("Countires in gdp_clean but not in merged_games:", missing_in_merged_games)

len(missing_in_gdp_clean)

Countries in merged_games but not in gdp_clean: {'Soviet Union', 'Serbia and Montenegro', 'Czechoslovakia', 'Netherlands Antilles', 'Ivory Coast', 'Chinese Taipei', 'Hong Kong, China', 'The Former Yugoslav Republic of Macedonia', "Democratic People's Republic of Korea", 'Venezuela', 'Unified Team', 'Islamic Republic of Iran', 'United States of America', 'Turkey', 'United Arab Republic', 'Bahamas', 'Egypt', 'ROC', 'Slovakia', 'Republic of Korea', 'US Virgin Islands', 'West Indies Federation', 'United Republic of Tanzania', 'Federal Republic of Germany', 'Olympic Athletes from Russia', "People's Republic of China", 'Republic of Moldova', 'German Democratic Republic (Germany)', 'Independent Olympic Athletes', 'Czech Republic', "Côte d'Ivoire", 'Yugoslavia', 'Great Britain', 'Kyrgyzstan'}
Countires in gdp_clean but not in merged_games: {'Fragile and conflict affected situations', 'Equatorial Guinea', 'Aruba', 'Guinea', 'North America', 'Bolivia', 'Sint Maarten (Dutch part)', 'Lower middle 

34

In [615]:
country_name_mapping = {
    'Soviet Union': 'Russian Federation', 
    'Serbia and Montenegro': 'Serbia', 
    'Czechoslovakia': 'Czechia', 
    'Netherlands Antilles': 'Netherlands', 
    'Ivory Coast': "Cote d'Ivoire", 
    'Hong Kong, China': 'Hong Kong SAR, China',
    'The Former Yugoslav Republic of Macedonia': 'North Macedonia', 
    'Venezuela': 'Venezuela, RB', 
    'Unified Team': 'Russian Federation', 
    'Islamic Republic of Iran': 'Iran, Islamic Rep.', 
    'United States of America': 'United States', 
    'Turkey': 'Turkiye', 
    'United Arab Republic': 'Egypt, Arab Rep.', 
    'Bahamas': 'Bahamas, The', 
    'Egypt': 'Egypt, Arab Rep.',
    'Slovakia': 'Slovak Republic',
    'Republic of Korea': 'Korea, Rep.',
    'US Virgin Islands': 'Virgin Islands (U.S.)', 
    'United Republic of Tanzania': 'Tanzania', 
    'Federal Republic of Germany': 'Germany',  # Historical reference; now Germany
    'Olympic Athletes from Russia': 'Russian Federation',
    "People's Republic of China": 'China',
    'Republic of Moldova': 'Moldova',
    'German Democratic Republic (Germany)': 'Germany',  # Historical reference; now Germany
    'Czech Republic': 'Czechia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Yugoslavia': 'Serbia',  # Or other successor states like Croatia, Slovenia, etc.
    'Great Britain': 'United Kingdom',
    'Kyrgyzstan': 'Kyrgyz Republic'
}

# Update merged_games and gdp_clean datasets with standardized country names
merged_games['country_name'] = merged_games['country_name'].replace(country_name_mapping)
gdp_clean['Country'] = gdp_clean['Country'].replace(country_name_mapping)

# Re-check for discrepancies after standardization
merged_games_countries = set(merged_games['country_name'].drop_duplicates())
gdp_clean_countries = set(gdp_clean['Country'].drop_duplicates())

missing_in_gdp_clean = merged_games_countries - gdp_clean_countries
missing_in_merged_games = gdp_clean_countries - merged_games_countries

print("Countries in merged_games but not in gdp_clean:", missing_in_gdp_clean)
print("Countries in gdp_clean but not in merged_games:", missing_in_merged_games)

Countries in merged_games but not in gdp_clean: {"Democratic People's Republic of Korea", 'Independent Olympic Athletes', 'ROC', 'Chinese Taipei', 'West Indies Federation'}
Countries in gdp_clean but not in merged_games: {'Fragile and conflict affected situations', 'Equatorial Guinea', 'Aruba', 'Guinea', 'North America', 'Bolivia', 'Sint Maarten (Dutch part)', 'Lower middle income', 'Sierra Leone', 'IDA & IBRD total', 'Haiti', 'Gambia, The', 'OECD members', 'Myanmar', 'IDA total', 'Bosnia and Herzegovina', 'Bhutan', 'Northern Mariana Islands', 'Comoros', 'Faroe Islands', 'Dominica', 'Nicaragua', 'East Asia & Pacific (excluding high income)', 'Central Europe and the Baltics', 'High income', 'Seychelles', 'Africa Western and Central', 'East Asia & Pacific', 'Latin America & Caribbean (excluding high income)', 'Middle East & North Africa (excluding high income)', 'Papua New Guinea', 'Sub-Saharan Africa (IDA & IBRD countries)', 'Latin America & Caribbean', 'Eswatini', 'Angola', 'South Asia

In [637]:
final_merged = merged_games.merge(gdp_clean, 
                                  left_on=['country_name', 'year'], 
                                  right_on=['Country', 'Year']).drop(columns=
                                                                          ['Country', 'Country Code', 'Year', 'slug_game'])

final_merged['country_name'].nunique() #132

final_merged

Unnamed: 0,country_name,country_3_letter_code,year,medal_count,gold_count,silver_count,bronze_count,game_location,game_season,GDP
0,Austria,AUT,1992,21,6,7,8,France,Winter,1.950781e+11
1,Canada,CAN,1992,7,2,3,2,France,Winter,5.943761e+11
2,Czechia,TCH,1992,3,0,0,3,France,Winter,3.480501e+10
3,Finland,FIN,1992,7,3,1,3,France,Winter,1.125325e+11
4,France,FRA,1992,9,3,5,1,France,Winter,1.401466e+12
...,...,...,...,...,...,...,...,...,...,...
1186,Slovak Republic,SVK,2010,3,1,1,1,Canada,Winter,9.116284e+10
1187,Slovenia,SLO,2010,3,0,2,1,Canada,Winter,4.820824e+10
1188,Sweden,SWE,2010,11,5,2,4,Canada,Winter,4.958126e+11
1189,Switzerland,SUI,2010,9,6,0,3,Canada,Winter,5.988510e+11


In [639]:
final_merged.to_csv('olympics.csv', index=False)