In [64]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [65]:
# Load the two raw CSV inputs used by the rest of the notebook

# Olympic athlete/event results
# source: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/data#
# Data starts in 1896
events_file = 'Resources/athlete_events.csv'

# World Bank population and GDP series
# source: World Bank (https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#)
# Data starts in 1960
pop_gdp_file = 'Resources/population_gdp.csv'

# Keep the frames exactly as read; later cells derive cleaned versions from them
df_events_orig = pd.read_csv(events_file)
df_pop_gdp_orig = pd.read_csv(pop_gdp_file)

In [66]:
# Clean the World Bank data: tidy the headers, keep only Olympic years,
# then split the frame into one table per series.

# Drop the series code and give the identifier columns snake_case names
df_pop_gdp = (
    df_pop_gdp_orig
    .drop(columns=['Series Code'])
    .rename(columns={
        "Series Name": "series_name",
        "Country Name": "country_name",
        "Country Code": "country_code",
    })
)

# Keep only the first space-delimited token of every header
# (presumably turns year headers such as "1960 [YR1960]" into "1960" -- confirm against the CSV)
df_pop_gdp.columns = df_pop_gdp.columns.str.split(' ').str[0].tolist()

# Columns to retain: the identifiers plus every year that appears in the Olympic data
event_years = df_events_orig['Year'].unique().astype(str)
keep_columns = ['series_name', 'country_name', 'country_code'] + list(event_years)

# intersection() skips years that have no column in the World Bank frame
df_pop_gdp = df_pop_gdp[df_pop_gdp.columns.intersection(keep_columns)]

# One frame per World Bank series
is_gdp_row = df_pop_gdp["series_name"] == "GDP (current US$)"
is_population_row = df_pop_gdp["series_name"] == "Population, total"
df_gdp = df_pop_gdp.loc[is_gdp_row]
df_pop = df_pop_gdp.loc[is_population_row]

In [None]:
# Clean Olympic NOC and World Bank country_code so they match each other.
#
# Step 1: rewrite the NOC codes listed below to the World Bank country_code.
# Step 2: join on a normalised country/team name and, wherever a name matches,
#         take the World Bank country_code as the NOC.

# Countries that are in both databases but whose codes didn't match up
# (Olympic NOC -> World Bank country_code).
noc_to_country_code = {
    "IRI": "IRN",
    "GER": "DEU",
    "BAH": "BHS",
    "SUI": "CHE",
    "ISV": "VIR",
    "GRE": "GRC",
    "DEN": "DNK",
    "NED": "NLD",
    "CGO": "COG",
    "LAT": "LVA",
    "INA": "IDN",
    "GAM": "GMB",
    "GBS": "GNB",
    "MAS": "MYS",
    "NGR": "NGA",
    "VIN": "VCT",
    "BRU": "BRN",
    "SLO": "SVN",
    "MRI": "MUS",
    "BUL": "BGR",
    "PUR": "PRI",
    "MON": "MCO",
    "SKN": "KNA",
}

# Extract World Bank country_code and country_name, plus a sanitized name
# (spaces removed, lower-cased) used as the join key.
# assign() returns a new frame, so nothing is written into a slice of df_pop.
df_worldbanknames = df_pop[["country_code", "country_name"]].assign(
    Wclean=lambda names: names["country_name"].str.replace(" ", "").str.lower()
)

# Work on a copy so df_events_orig stays exactly as it was loaded
df_events = df_events_orig.copy()

# Sanitized Team name in the Olympic data (same normalisation as Wclean)
df_events["Oclean"] = df_events["Team"].str.replace(" ", "").str.lower()

# Whole-value replacement: only a NOC that equals a key is rewritten
df_events["NOC"] = df_events["NOC"].replace(noc_to_country_code)

# Merge by sanitized names and make sure the Olympic NOC matches the World Bank country_code.
# NOTE(review): assumes Wclean is unique in df_worldbanknames; a duplicated name
# would duplicate athlete rows in this left merge -- confirm.
df_events = pd.merge(df_events, df_worldbanknames, how="left", left_on="Oclean", right_on="Wclean")

# Prefer the World Bank code when the name matched; otherwise keep the (mapped) NOC
df_events["NOC"] = df_events["country_code"].fillna(df_events["NOC"])

# Drop unnecessary helper columns
df_events = df_events.drop(columns=["Oclean", "country_code", "country_name", "Wclean"])

# df_events

In [None]:
### DEBUG TO DISCUSS WITH TEAM ###

# Check for items that are not merged: rows of df_events whose NOC has no
# equal country_code in the World Bank name table.
checkmerge = pd.merge(df_events, df_worldbanknames, how="left", left_on='NOC', right_on='country_code')
unmatched = checkmerge[checkmerge['country_code'].isnull()]
# mergesuccess = checkmerge[checkmerge['country_code'].isnull() == False]

# Most recent appearances first.
# assign() builds a new frame instead of writing into the filtered slice `unmatched`.
checking = (
    unmatched
    .assign(Year=unmatched['Year'].astype('int'))
    .sort_values("Year", ascending=False)
)

# We need to decide how to handle the following cases that are not in the World Bank Database.
# We can just keep them in the Olympics Database and note that they don't have GDP information.
# Codes are grouped by the latest Games in which they appear among the unmatched rows.
nocs_without_worldbank_data = [
    # latest 2016 participants
    'TPE',  # Taipei not recognized by WB
    'IOA',  # Individual Olympics
    'PLE',  # Palestine not in world bank
    'ROT',  # Refugee
    'COK',  # Cook Islands

    # latest 2008 participants
    'AHO',  # Netherlands Antilles
    # NOTE(review): original note says "Not sure if this is Portugal". If it is,
    # a code mapping in the cleaning cell may be the better fix -- confirm with team.
    'POR',

    # latest 2006 participants
    'SCG',  # Serbia and Montenegro

    # latest 1992 participants
    # teams of states that don't exist anymore
    'TCH',
    'EUN',
    'YUG',

    # latest 1988 participants
    # West and East Germany, Soviet Union
    'FRG',
    'GDR',
    'URS',
    # the two Yemens
    'YMD',
    'YAR',

    # latest 1964
    'CAM',
    'BER',
    'ZIM',
    'PHI',
]

# Exact-code exclusion in a single pass. Rows with a missing NOC are kept so
# they stay visible in the unmatched report.
clearingbydate = checking[~checking['NOC'].isin(nocs_without_worldbank_data)]

# clearingbydate

In [None]:
# Year-sorted view of the cleaned events for manual inspection.
# assign() returns a new frame, so this debugging cell does not modify df_events.
checking = (
    df_events
    .assign(Year=df_events['Year'].astype('int'))
    .sort_values("Year")
)
# checking[checking['Team'] == "Bohemia"]
# checking[checking['NOC'] == "URS"]
# checking

In [None]:
# unmatched, but before 1960
# Count the rows per NOC left in clearingbydate and turn the counts into a
# two-column frame: 'unique_values' (the NOC) and 'counts'.
noc_frequencies = clearingbydate['NOC'].value_counts()
noc_frequencies = noc_frequencies.rename_axis('unique_values')
unmatched_valuecounts = noc_frequencies.reset_index(name='counts')
# unmatched_valuecounts 

In [63]:
# df_events.head()

In [8]:
# df_gdp.head()

In [9]:
# df_pop.head()