In [None]:
import pandas as pd
import numpy as np
import re
# Load the athlete events CSV (expected in the working directory) into a DataFrame.
olympic_data = pd.read_csv("athlete_events.csv")

In [None]:
# The number of participating countries and athletes is inconsistent before 1996,
# so keep only the Games from 1996 onward (1996 itself is included).
condition_year_1996 = olympic_data["Year"].ge(1996)
olympic_data = olympic_data.loc[condition_year_1996]

# Keep only the Summer Games. The mask is built from the already year-filtered
# frame, so it is aligned with the rows that remain.
condition_summer = olympic_data["Season"].eq("Summer")
olympic_data = olympic_data.loc[condition_summer]

In [None]:
# Host-city spellings to normalise: local-language name -> English name.
# Keys that do not occur in the City column are simply left unmatched by replace().
corrections = {
    "Athina": "Athens",
    "Roma": "Rome",
    "Moskva": "Moscow",
    "Sankt Moritz": "St. Moritz",
}

# Apply the mapping to whole cell values of the City column.
olympic_data["City"] = olympic_data["City"].replace(to_replace=corrections)

In [None]:
# Investigating the quality of the ID column:
# - Duplicate IDs exist, but this is expected: one athlete can compete in more than one event or Olympic Games.
# - ID numbering is consistent across years, i.e. an athlete keeps the same ID in different Olympic Games.
# - IDs are assigned in alphabetical order of the athletes' last names.
# - There are no missing values in the ID column.
# Team column: team names are inconsistent (a mix of country names and sports-team names), so we use the NOC column instead for our analysis.

In [None]:
# A missing Medal value means the athlete did not win a medal in that event,
# so replace nulls with the explicit label "No Medal".
# fillna is the vectorised equivalent of the element-wise isna check.
olympic_data["Medal"] = olympic_data["Medal"].fillna("No Medal")

In [None]:
# Count rows per medal category. The counts of the medal types can differ from each
# other because team events award one medal per team member and because of ties.
medal_counts = olympic_data["Medal"].value_counts()
medal_counts

In [None]:
# Remove rows that are exact duplicates across all columns.
# Reassign instead of using inplace=True so the cell does not mutate a shared
# object and stays safe to re-run.
olympic_data = olympic_data.drop_duplicates()

# Sanity check: the number of fully duplicated rows left should be 0.
print(olympic_data.duplicated().sum())

In [None]:
# De-duplicate medals from team sports: every member of a medal-winning team has
# their own row, so a team medal would otherwise be counted once per athlete.
# "Concat" is the de-duplication key: Games + Event + Medal + NOC.
olympic_data["Concat"] = (
    olympic_data["Games"]
    + olympic_data["Event"]
    + olympic_data["Medal"]
    + olympic_data["NOC"]
)

# Podium results only (Gold, Silver and Bronze), one row per key.
# drop_duplicates is chained (not inplace) because the filtered frame is a slice
# of olympic_data; an in-place drop on it triggers SettingWithCopyWarning.
olympic_data_podium = (
    olympic_data[olympic_data["Medal"] != "No Medal"]
    .drop_duplicates(subset=["Concat"])
)

# Sanity check: number of duplicated keys left in the podium frame (expected 0).
olympic_data_podium.Concat.duplicated().sum()

In [None]:
# Build the team-deduplicated dataset: one row per Games + Event + Medal + NOC key.
# Derive a new frame rather than aliasing `olympic_data`, so the de-duplication
# does not also drop rows from `olympic_data` itself.
# NOTE(review): rows labelled "No Medal" share a key as well, so non-medal athletes
# are collapsed to one row per Games/Event/NOC too (including individual events) —
# confirm this is intended.
olympic_data_not_duplicated = olympic_data.assign(
    Concat=(
        olympic_data["Games"]
        + olympic_data["Event"]
        + olympic_data["Medal"]
        + olympic_data["NOC"]
    )
).drop_duplicates(subset=["Concat"])

# Sanity check: every key must now be unique.
assert not olympic_data_not_duplicated["Concat"].duplicated().any(), \
    "Concat keys should be unique after drop_duplicates"

# NOC column - attach the `region` column from the NOC regions dataset to the
# Olympics dataset by matching on the NOC code.
noc_regions = pd.read_csv('noc_regions.csv')

# Left merge keeps every Olympics row; NOC codes that are absent from
# noc_regions get a missing `region`.
olympic_data_not_duplicated = pd.merge(
    olympic_data_not_duplicated,
    noc_regions[['NOC', 'region']],
    on='NOC',
    how='left')

In [None]:
# NOC column - attach the `region` column from the NOC regions dataset to the
# podium results by matching on the NOC code.
noc_regions = pd.read_csv('noc_regions.csv')

# Left merge on 'NOC': every podium row is kept, and rows whose NOC code is not
# present in noc_regions get a missing `region`.
region_lookup = noc_regions[['NOC', 'region']]
olympics_updated = olympic_data_podium.merge(region_lookup, on='NOC', how='left')

In [None]:
# Write the team-deduplicated DataFrame to a new CSV file, without the index column.
output_csv = 'teams_not_duplicated_summer_olympics_1996-2016_deduplicate_team_medals.csv'
olympic_data_not_duplicated.to_csv(output_csv, index=False)