In [1]:
 import pandas as pd

# Load the raw input table; every later cleaning cell works on `df`.
raw_csv_path = "OG_Data.csv"
df = pd.read_csv(raw_csv_path)

In [2]:
"""------------
Cleaning Step 1
------------"""

# Drop duplicates so that team events are not counted multiple times:
# only one row is kept per (NOC, Year, Season, Event, Medal) combination.
dedupe_columns = ['NOC', 'Year', 'Season', 'Event', 'Medal']
df = df.drop_duplicates(subset=dedupe_columns)
# reset_index() without drop=True renumbers the rows and keeps the previous
# row labels as a new "index" column.
# NOTE(review): switch to drop=True if nothing downstream reads that column -- confirm.
df = df.reset_index()

# New column "Total_Medals": 1 where Medal is Bronze, Silver or Gold, otherwise 0
# (rows with a missing Medal therefore get 0).
won_medal = df["Medal"].isin(["Bronze", "Silver", "Gold"])
df["Total_Medals"] = won_medal.astype("int64")

# Key made of the year text directly followed by the season
# (only needed for the host-effect analysis).
# It has to be built before "Year" becomes a timestamp below, otherwise the
# string form would be a full date instead of the plain year.
year_as_text = df["Year"].astype(str)
df["key"] = year_as_text + df["Season"]

# Convert "Year" to a timestamp (parsed with the "%Y" format).
df["Year"] = pd.to_datetime(df["Year"], format="%Y")

In [3]:
"""------------
Cleaning Step 2
------------"""

# Some NOCs have to be changed --> manually selected in DataCleaning.xlsx
# Maps a value of the "Team" column to the NOC code those rows should carry.
# Entries are grouped by target NOC for readability; order has no effect.
change_NOC_dict = {
    # -> RUS
    "Almaz": "RUS",
    "Burevestnik": "RUS",
    "Druzhba": "RUS",
    "Kon-Tiki": "RUS",
    "Korshun": "RUS",
    "Neptun II": "RUS",
    "Nokaut II": "RUS",
    "Olen": "RUS",
    "Persey": "RUS",
    "Soviet Union": "RUS",
    "Soviet Union-1": "RUS",
    "Soviet Union-2": "RUS",
    "Soviet Union-3": "RUS",
    "Taifun": "RUS",
    "Tulilind": "RUS",
    "Uragan": "RUS",
    "Viktoriya": "RUS",
    # -> GER
    "West Germany": "GER",
    "West Germany-1": "GER",
    "West Germany-2": "GER",
    "West Germany-3": "GER",
    "East Germany": "GER",
    "East Germany-1": "GER",
    "East Germany-2": "GER",
    "East Germany-3": "GER",
    # -> CZE
    "Bohemia": "CZE",
    "Bohemia-1": "CZE",
    "Bohemia-2": "CZE",
    "Bohemia-3": "CZE",
    # -> LBN
    "Maid of Lebanon": "LBN",
    "Lebanon": "LBN",
    # -> YEM
    "North Yemen": "YEM",
    "South Yemen": "YEM",
    # single-team overrides
    "Malaya": "MAS",
    "South Vietnam": "VIE",
    "Netherlands Antilles": "NED",
    "Peri": "ZAM",
    "Crete": "GRE",
    "Sydney Rowing Club": "AUS",
    "South Sudan": "SUD",
    "Newfoundland": "CAN",
}

# Changing the NOCs in a single vectorised pass instead of one full-column
# comparison per dictionary entry. Only rows whose "Team" is a key of the
# mapping are written; every other row keeps its existing NOC.
rows_to_recode = df["Team"].isin(list(change_NOC_dict))
df.loc[rows_to_recode, "NOC"] = df.loc[rows_to_recode, "Team"].map(change_NOC_dict)

In [4]:
"""------------
Cleaning Step 3
------------"""

# Teams that cannot be mapped to a country --> manually selected in DataCleaning.xlsx
# Every row whose "Team" value appears in this list is dropped below.
teams_to_remove = [
    "Australasia",
    "Bohemia/Great Britain",
    "Cha-Cha III",
    "Circus",
    "Czechoslovakia",
    "Czechoslovakia-1",
    "Czechoslovakia-2",
    "Czechoslovakia-3",
    "Individual Olympic Athletes",
    "Konstanz",
    "North Borneo",
    "Primorka",
    "Refugee Olympic Athletes",
    "Rhodesia",
    "Saar",
    "Serbia and Montenegro",
    "Unified Team",
    "Unified Team-1",
    "Unified Team-2",
    "Unified Team-3",
    "United Arab Republic",
    "Unknown",
    "West Indies Federation",
    "Yugoslavia",
    "Yugoslavia-1",
    "Yugoslavia-2",
]

# Keep only the rows whose team is not on the removal list.
is_unmappable_team = df["Team"].isin(teams_to_remove)
df = df[~is_unmappable_team]

In [5]:
# Write the cleaned frame to disk. With to_csv's defaults the row index is
# written as well, as the first (unnamed) column of the file.
cleaned_csv_path = "Olympic_data_final.csv"
df.to_csv(cleaned_csv_path)