In [None]:
import pandas as pd
import numpy as np
import re
# Load the raw athlete-level results (path is relative to the working directory).
olympic_data = pd.read_csv(filepath_or_buffer='athlete_events.csv')

In [None]:
# The number of participating countries and athletes is inconsistent before 1996,
# so keep only Summer Games from 1996 onwards.
# Both masks are built on the unfiltered frame and applied in one step. The
# result is materialised with .copy() because later cells add and overwrite
# columns on it; doing that on a boolean-indexed slice raises
# SettingWithCopyWarning.
condition_year_1996 = olympic_data["Year"] >= 1996
condition_summer = olympic_data["Season"] == "Summer"
olympic_data = olympic_data[condition_year_1996 & condition_summer].copy()

In [None]:
# Host-city spellings found in the raw data, paired with the spelling to use instead.
corrections = dict([
    ("Athina", "Athens"),
    ("Roma", "Rome"),
    ("Moskva", "Moscow"),
    ("Sankt Moritz", "St. Moritz"),
])

# Replace every listed spelling in the City column; all other values stay as they are.
olympic_data['City'] = olympic_data['City'].replace(to_replace=corrections)

In [None]:
#Investigating quality of ID column
#Duplicates exist but this is intentional, as one athlete can participate in more than one event/olympics.
#ID numbering is consistent throughout years - e.g. same ID for athlete in different olympic games.
#ID is based on alphabetical order of last name
#no missing values in the ID column
#Team column comments: Team names are inconsistent - mixed with countries and sports team names. We will use NOC column instead for our analysis

In [None]:
# A missing Medal means the athlete did not win one, so label those rows "No Medal".
# fillna is the vectorised built-in for this; no per-row Python lambda is needed.
olympic_data["Medal"] = olympic_data["Medal"].fillna("No Medal")

In [None]:
# The counts per medal type can differ because team sports have different
# numbers of members and because of ties.
olympic_data.loc[:, "Medal"].value_counts()

In [None]:
# Remove rows that are identical across every column.
# Rebinding the result (instead of inplace=True) avoids modifying, in place, a
# frame that was produced by boolean filtering (SettingWithCopyWarning).
olympic_data = olympic_data.drop_duplicates()
# Sanity check: prints the number of fully duplicated rows still present (expected 0).
print(olympic_data.duplicated().sum())

In [None]:
# De-duplicate medals from team sports.
# "Concat" is a key built from Games + Event + Medal + NOC; rows sharing a key
# are treated as the same podium result.
olympic_data["Concat"] = (
    olympic_data["Games"] + olympic_data["Event"] + olympic_data["Medal"] + olympic_data["NOC"]
)
# Keep medal-winning rows only, one row per key. drop_duplicates() returns a new
# frame here; the original called it with inplace=True on a boolean-indexed
# slice of olympic_data, which raises SettingWithCopyWarning.
olympic_data_podium = olympic_data[olympic_data["Medal"] != "No Medal"].drop_duplicates(subset=["Concat"])
# Sanity check: number of repeated keys left (expected 0).
olympic_data_podium.Concat.duplicated().sum()

In [None]:
# Build a second frame with one row per Games + Event + Medal + NOC key,
# covering all rows (including "No Medal").
# assign() and drop_duplicates() both return new frames, so olympic_data is
# left untouched. The original bound a second name to the same object and then
# dropped rows in place, which silently removed those rows from olympic_data too.
olympic_data_not_duplicated = olympic_data.assign(
    Concat=olympic_data["Games"] + olympic_data["Event"] + olympic_data["Medal"] + olympic_data["NOC"]
).drop_duplicates(subset=["Concat"])
# Sanity check: number of repeated keys left (expected 0).
olympic_data_not_duplicated.Concat.duplicated().sum()

# NOC column - attach the region column from the NOC regions dataset
noc_regions = pd.read_csv('noc_regions.csv')

# Left merge on NOC: every row is kept, even if its NOC has no entry in noc_regions
olympic_data_not_duplicated = pd.merge(
    olympic_data_not_duplicated,
    noc_regions[['NOC', 'region']],
    on='NOC',
    how='left')

In [None]:
# Save the updated DataFrame to a new CSV file
#olympic_data_not_duplicated.to_csv('teams_not_duplicated_summer_olympics_1996-2016_deduplicate_team_medals.csv', index=False)

In [None]:
# Attach the region name for each NOC code to the podium rows.
noc_regions = pd.read_csv('noc_regions.csv')

# Left join on 'NOC': every podium row is kept; codes absent from noc_regions get a null region.
olympics_updated = olympic_data_podium.merge(
    noc_regions[['NOC', 'region']],
    how='left',
    on='NOC',
)

In [None]:
# Write the podium table (with regions) to CSV, leaving out the index column.
olympics_updated.to_csv(
    'summer_olympics_1996-2016_deduplicate_team_medals.csv',
    index=False,
)

In [None]:
import pandas as pd
import numpy as np
import re
# Load the podium table used for the time-zone analysis.
olympic_podium = pd.read_csv('podium_1996-2016.csv')
# Every row with a missing region is labelled "Singapore".
# NOTE(review): presumably Singapore is the only region missing from the file — confirm.
# The result is assigned back to the column: the original
# `olympic_podium.region.fillna(..., inplace=True)` acts on an intermediate
# Series, which emits a chained-assignment FutureWarning in pandas 2.x and has
# no effect on the frame under Copy-on-Write.
olympic_podium["region"] = olympic_podium["region"].fillna("Singapore")
# After the fill no region may be missing.
assert olympic_podium["region"].notna().all()
# Kosovo rows are excluded.
olympic_podium = olympic_podium[olympic_podium["region"] != "Kosovo"]
olympic_podium

In [None]:
# Time-zone reference table: semicolon-separated, read as Latin-1.
time_zones = pd.read_csv(
    "time_zones2.csv",
    sep=";",
    encoding="latin-1",
)
time_zones

In [None]:
# Work on an independent copy. A plain assignment only creates a second name
# for the same DataFrame, so the columns added and renamed on
# time_zones_athlete in later cells would also change time_zones.
time_zones_athlete = time_zones.copy()

In [None]:
# Split "GMT offset" on ":" into an hours part and a minutes part.
time_zones_athlete[["Hours", "Minutes"]] = time_zones_athlete["GMT offset"].str.split(":", expand=True)
# Offsets without ":" have no minutes part; treat that as "00".
# The result is assigned back to the column: the original
# `time_zones_athlete.Minutes.fillna("00", inplace=True)` acts on an
# intermediate Series (chained-assignment FutureWarning in pandas 2.x, no
# effect on the frame under Copy-on-Write).
time_zones_athlete["Minutes"] = time_zones_athlete["Minutes"].fillna("00")

# Minutes as a fraction of an hour. Only "30", "00" and "0" are listed; any
# other minutes value becomes NaN.
time_zones_athlete["Minutes"] = time_zones_athlete["Minutes"].map({
    "30": 0.5,
    "00": 0,
    "0": 0
})
# NOTE(review): abs() discards the sign of the offset, so e.g. -5 and +5 produce
# the same "GMT Athlete Country" value — confirm this is intended.
time_zones_athlete["Hours"] = time_zones_athlete["Hours"].astype(float).abs()
time_zones_athlete["GMT Athlete Country"] = time_zones_athlete["Hours"] + time_zones_athlete["Minutes"]

time_zones_athlete

In [None]:
# Rename the "Country" column to "region" (modifies time_zones_athlete in place).
time_zones_athlete.rename(columns=dict(Country='region'), inplace=True)
time_zones_athlete

In [None]:
# Add the home GMT offset and capital city to each podium row, matched on region.
# Left join: rows whose region is absent from the time-zone table keep nulls.
# NOTE(review): if a region appears more than once in the time-zone table, its podium rows are repeated here — verify.
olympic_podium = olympic_podium.merge(
    time_zones_athlete[["region", "GMT Athlete Country", "Capital city"]],
    how="left",
    on="region",
)

olympic_podium

In [None]:
# Number of missing values in the join key and in the two merged-in columns.
for column_name in ("region", "GMT Athlete Country", "Capital city"):
    print(olympic_podium[column_name].isnull().sum())

In [None]:
# First rows that have no home GMT offset.
olympic_podium.loc[olympic_podium["GMT Athlete Country"].isna()].head()

In [None]:
# Discard rows without a home GMT offset, then print the missing-value counts again.
olympic_podium.dropna(inplace=True, subset=["GMT Athlete Country"])
for column_name in ("region", "GMT Athlete Country", "Capital city"):
    print(olympic_podium[column_name].isnull().sum())

In [None]:
# Host-city view of the time-zone table: "Capital city" becomes "City" and
# "GMT Athlete Country" becomes "GMT Host City".
# rename() without inplace returns a new frame. The original bound a second
# name to time_zones_athlete and renamed in place, which also renamed the
# columns of time_zones_athlete, so the cell that selects
# "GMT Athlete Country" / "Capital city" from it raised KeyError when re-run.
# NOTE(review): host cities are matched against what was the "Capital city"
# column; hosts not listed there get a null "GMT Host City" in the merge — confirm coverage.
time_zones_host = time_zones_athlete.rename(
    columns={"Capital city": "City", "GMT Athlete Country": "GMT Host City"}
)
time_zones_host

In [None]:
# Add the host city's GMT offset to each podium row, matched on City.
# Left join: rows whose City is absent from time_zones_host keep a null offset.
olympic_podium = olympic_podium.merge(
    time_zones_host[["City", "GMT Host City"]],
    how="left",
    on="City",
)

olympic_podium

In [None]:
# Per column: does it contain at least one missing value?
olympic_podium.isna().any()

In [None]:
# Print a summary of the frame's columns (dtypes and non-null counts).
olympic_podium.info()

In [None]:
# Absolute gap between the host city's offset and the athlete's home offset.
olympic_podium["Time difference"] = (
    olympic_podium["GMT Host City"]
    .sub(olympic_podium["GMT Athlete Country"])
    .abs()
)
olympic_podium

In [None]:
# Number of non-null Medal entries for each distinct time difference.
olympic_podium.groupby(by="Time difference")["Medal"].count()


In [None]:
def create_range_time(time):
    """Return the label of the two-hour bucket a time difference falls into.

    Parameters
    ----------
    time : float
        Time difference to classify.

    Returns
    -------
    str or float
        "[0-2.0]" for values up to 2, then "(2.0-4.0]" ... "(10.0-12.0]" in
        steps of two, and "(12.0-13.00]" for anything above 12. A missing
        input (NaN/None) returns NaN instead of a label.
    """
    # NaN fails every "<=" comparison, so without this guard a missing time
    # difference would fall through to the last bucket and be counted there.
    if pd.isna(time):
        return np.nan

    # (inclusive upper bound, label), in ascending order.
    buckets = (
        (2.0, "[0-2.0]"),
        (4, "(2.0-4.0]"),
        (6, "(4.0-6.0]"),
        (8, "(6.0-8.0]"),
        (10, "(8.0-10.0]"),
        (12, "(10.0-12.0]"),
    )
    for upper_bound, label in buckets:
        if time <= upper_bound:
            return label
    return "(12.0-13.00]"

In [None]:
# Label each row with the bucket of its time difference (element-wise call).
olympic_podium["Range Time"] = olympic_podium["Time difference"].map(create_range_time)
olympic_podium

In [None]:
# Medal count per bucket, largest first.
(
    olympic_podium
    .groupby("Range Time")["Medal"]
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Same per-bucket medal counts as a two-column frame ("Range Time", "Medal").
olympic_podium_range_df = (
    olympic_podium
    .groupby("Range Time")["Medal"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
olympic_podium_range_df

In [None]:
import plotly.express as px
# Bar chart: medal count for each time-difference bucket.
fig1 = px.bar(data_frame=olympic_podium_range_df, x='Range Time', y='Medal')
fig1.show()

In [None]:
# Independent copy for the correlation analysis. A plain assignment only creates
# a second name for the same DataFrame, so overwriting "Time difference" on
# olympic_podium_corr would also overwrite it in olympic_podium.
olympic_podium_corr = olympic_podium.copy()
olympic_podium_corr

In [None]:
# Round every time difference up to the next whole hour (0.5 -> 1, 1.5 -> 2, ...).
# np.ceil replaces the hand-written lookup table, which mapped 6.5 to 4.0
# instead of 7.0 and turned any value not listed in the table into NaN.
# Whole hours and NaN pass through unchanged.
olympic_podium_corr["Time difference"] = np.ceil(olympic_podium_corr["Time difference"])

olympic_podium_corr

In [None]:
# Distinct time-difference values present after the rounding step.
pd.unique(olympic_podium_corr["Time difference"])

In [None]:
# Number of non-null Medal entries for each time difference.
olympic_podium_corr.groupby(by="Time difference")["Medal"].count()

In [None]:
# Medal count per time difference as a two-column frame ("Time difference", "Medal").
olympic_podium_corr_df = (
    olympic_podium_corr
    .groupby("Time difference")["Medal"]
    .count()
    .reset_index()
)
olympic_podium_corr_df

In [None]:
# Pearson correlation between the time difference and the medal count per time difference.
pearson_r = olympic_podium_corr_df['Time difference'].corr(
    olympic_podium_corr_df['Medal'],
    method='pearson',
)
print('Correlación Pearson: ', pearson_r)

In [None]:
import seaborn as sns

# Scatter plot with a fitted regression line: medal count against time difference.
sns.regplot(data=olympic_podium_corr_df, x="Time difference", y="Medal")