In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# GDP-per-capita table: semicolon-separated file.
gdp_per_capita = pd.read_csv(
    "../data/clean/dataset_olympics_clean/gdp_capita_final.csv",
    sep=";",
)

# Summer Olympics 1996-2016 team/medal records (comma-separated, pandas default).
olympic_data = pd.read_csv(
    "../data/clean/dataset_olympics_clean/teams_not_duplicated_summer_olympics_1996-2016_deduplicate_team_medals.csv"
)

In [None]:
# Regions dropped from the Olympic data before any further processing.
countries_to_remove = [
    "North Korea",
    "Kosovo",
    "Montenegro",
    "Tuvalu",
    "Individual Athletes",
    "Virgin Islands, British",
    "Virgin Islands, US",
    "South Sudan",
    "Palestine",
    "Cook Islands",
    "American Samoa",
    "Swaziland",
    "Refugee Team",
]

# Keep only the rows whose region is not in the exclusion list.
is_excluded = olympic_data["region"].isin(countries_to_remove)
olympic_data = olympic_data.loc[~is_excluded]

In [None]:
# Display the GDP-per-capita table as loaded from the CSV.
gdp_per_capita

In [None]:
# (rows, columns) of the GDP-per-capita table.
gdp_per_capita.shape

In [None]:
# Number of missing values in each column of the GDP-per-capita table.
gdp_per_capita.isna().sum()

In [None]:
def clean_numeric_values(gdp_per_capita):
    """Return a copy of the GDP table with every value column converted to float.

    The first two columns are treated as identifier columns and are left
    untouched. Every remaining column is converted as follows:

    * Text columns are parsed as European-formatted numbers: '.' is removed
      (thousands separator) and ',' is replaced by '.' (decimal separator)
      before casting to float. Missing values stay NaN.
    * Columns that already have a numeric dtype are only cast to float.
      Pushing them through the string clean-up would strip the decimal point
      from a real float (1234.56 -> "1234.56" -> 123456.0), which is what
      used to happen whenever the cleaning cell was executed a second time.

    Parameters
    ----------
    gdp_per_capita : pandas.DataFrame
        Table whose columns from the third one onwards hold numeric values.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    cleaned = gdp_per_capita.copy()
    for col in cleaned.columns[2:]:
        values = cleaned[col]
        if pd.api.types.is_numeric_dtype(values):
            # Already numeric: normalise the dtype only, keeping the call idempotent.
            cleaned[col] = values.astype(float)
            continue
        cleaned[col] = (
            values
            .astype(str)
            .str.replace('.', '', regex=False)   # drop thousands separators
            .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
            .astype(float)
        )
    return cleaned

# Convert the value columns of the GDP table to floats.
gdp_per_capita = clean_numeric_values(gdp_per_capita)
olympic_years = [1996, 2000, 2004, 2008, 2012, 2016]

# New dataframe with aggregated gdp
agg_gdp_per_capita = gdp_per_capita.iloc[:, :2].copy()  # Keep the first two (identifier) columns

# Calculate aggregated gdp per capita (sum over the 4 years preceding each Olympic year)
for year in olympic_years:
    previous_years = [str(year - i) for i in range(1, 5)]  # get previous 4 years
    available_columns = [col for col in previous_years if col in gdp_per_capita.columns]  # check if the year exists
    # min_count=1: when none of the contributing years has a value (or no year
    # column exists at all) the result is NaN instead of 0.0. A 0 would be
    # read downstream as a genuine GDP of zero: log(0) = -inf in the
    # regression plots and an artificially low per-country mean.
    agg_gdp_per_capita[str(year)] = gdp_per_capita[available_columns].sum(
        axis=1, skipna=True, min_count=1
    )

In [None]:
# Display the aggregated GDP table (one column per Olympic year).
agg_gdp_per_capita

In [None]:
# Count medal rows per (region, Year); rows marked "No Medal" are not counted.
won_medal = olympic_data["Medal"].ne("No Medal")
medals_per_country_year = (
    olympic_data.loc[won_medal]
    .groupby(["region", "Year"])
    .size()
    .reset_index(name="Total Medals")
)

# Every (region, Year) pair present in the data, including those without medals.
all_countries_years = olympic_data[["region", "Year"]].drop_duplicates()

# Left-join so that pairs without medals are kept, then turn their missing count into 0.
full_medals_df = pd.merge(
    all_countries_years,
    medals_per_country_year,
    how="left",
    on=["region", "Year"],
)
full_medals_df["Total Medals"] = full_medals_df["Total Medals"].fillna(0).astype(int)

In [None]:
# Reshape the aggregated GDP table from one column per year to one row per (country, year).
agg_gdp_per_capita_long = pd.melt(
    agg_gdp_per_capita,
    id_vars=["Country Name", "Country Code"],
    var_name="Year",
    value_name="Aggregated GDP per capita",
)
# The year labels come from column names (strings); cast them so they match the medal table.
agg_gdp_per_capita_long["Year"] = agg_gdp_per_capita_long["Year"].astype(int)

# Attach the GDP figure to each (region, Year) row; rows with no matching country keep NaN.
full_medals_with_gdp = pd.merge(
    full_medals_df,
    agg_gdp_per_capita_long,
    how="left",
    left_on=["region", "Year"],
    right_on=["Country Name", "Year"],
).drop(columns=["Country Name", "Country Code"])
full_medals_with_gdp

In [None]:
# Scatter of aggregated GDP per capita against medal count for every (region, Year) row.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    full_medals_with_gdp["Aggregated GDP per capita"],
    full_medals_with_gdp["Total Medals"],
    alpha=0.7,
    edgecolors='k',
)

ax.set_xlabel("Aggregated GDP per capita (USD)")
ax.set_ylabel("Total Medals")
ax.set_title("Scatter Plot: GDP per capita vs. Total Medals")

# Logarithmic x axis.
ax.set_xscale("log")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
# Same scatter as before, restricted to rows with at least one medal.
has_medals = full_medals_with_gdp["Total Medals"] > 0
medals_winners = full_medals_with_gdp[has_medals]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    medals_winners["Aggregated GDP per capita"],
    medals_winners["Total Medals"],
    alpha=0.7,
    edgecolors='k',
)

ax.set_xlabel("Aggregated GDP per capita (USD)")
ax.set_ylabel("Total Medals")
ax.set_title("Scatter Plot: GDP per capita vs. Total Medals (Only Medal Winners)")

# Logarithmic x axis.
ax.set_xscale("log")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
unique_years = sorted(full_medals_with_gdp["Year"].unique())

# Counter for images
# NOTE(review): incremented once per year below but never read in this cell,
# and no figure is written to disk here - confirm whether saving was intended.
image_counter = 1

for year in unique_years:
    # Medal winners of the current year. Rows whose aggregated GDP is missing
    # or not strictly positive are left out: log(0) is -inf and log(NaN) is
    # NaN, and a non-finite x value cannot be used for the regression fit.
    medals_winners_year = full_medals_with_gdp[
        (full_medals_with_gdp["Total Medals"] > 0)
        & (full_medals_with_gdp["Year"] == year)
        & (full_medals_with_gdp["Aggregated GDP per capita"] > 0)
    ]

    # Work on a copy so the new column is not written into a slice of the full table.
    medals_winners_year = medals_winners_year.copy()
    medals_winners_year["Log Aggregated GDP per capita"] = np.log(medals_winners_year["Aggregated GDP per capita"])

    plt.figure(figsize=(10, 6))

    # scatter plot with regression line
    sns.regplot(
        x=medals_winners_year["Log Aggregated GDP per capita"],
        y=medals_winners_year["Total Medals"],
        scatter=True, fit_reg=True,
        scatter_kws={"alpha": 0.7, "edgecolors": "k"},
        line_kws={"color": "red", "linewidth": 2}
    )

    # Labels and title
    plt.xlabel("Log Aggregated GDP per capita")
    plt.ylabel("Total Medals")
    plt.title(f"Log GDP per capita vs. Total Medals ({year})")
    plt.grid(True, linestyle="--", alpha=0.6)

    image_counter += 1  # Counter increment for next image

    plt.show()

In [None]:
# Display the merged medals + aggregated GDP table.
full_medals_with_gdp

In [None]:
# Per-region totals across all years: medal count and aggregated GDP per capita.
summed_columns = ["Total Medals", "Aggregated GDP per capita"]
gdp_medals_total = full_medals_with_gdp.groupby("region")[summed_columns].agg("sum")
gdp_medals_total

In [None]:
# Regions ordered from highest to lowest summed GDP per capita.
df_sorted = gdp_medals_total.sort_values("Aggregated GDP per capita", ascending=False)
regions = df_sorted.index

fig, ax1 = plt.subplots(figsize=(14, 6))

# Left axis: GDP bars on a logarithmic scale.
ax1.bar(
    regions,
    df_sorted["Aggregated GDP per capita"],
    color="lightblue",
    alpha=0.6,
    label="Aggregated GDP per capita",
)
ax1.set_yscale("log")
ax1.set_ylabel("Aggregated GDP per capita (Log Scale)")

# Hide the x tick labels (one per region).
ax1.set_xticks([])

# Right axis (shares x with the bars): medal totals as red points.
ax2 = ax1.twinx()
ax2.scatter(
    regions,
    df_sorted["Total Medals"],
    color="red",
    label="Total Medals",
    zorder=3,
    s=20,
)
ax2.set_ylabel("Total Medals")

plt.title("Aggregated GDP per capita and Total Medals per Country")

# One legend per axis, placed in opposite corners.
ax1.legend(loc="upper left")
ax2.legend(loc="upper right")

ax1.grid(axis="y", linestyle="--", alpha=0.6)
plt.show()


In [None]:
# The aggregated figure is a sum over a 4-year window; dividing by 4 turns it into a yearly mean.
full_medals_with_gdp["Mean GDP per capita"] = full_medals_with_gdp["Aggregated GDP per capita"].div(4)
full_medals_with_gdp

In [None]:
# Per-region averages over all years: medals and mean GDP per capita.
averaged_columns = ["Total Medals", "Mean GDP per capita"]
full_medals_mean = full_medals_with_gdp.groupby("region")[averaged_columns].agg("mean")
full_medals_mean

In [None]:
# Correlation (pandas default method) between per-region mean GDP and mean medal count.
mean_gdp = full_medals_mean["Mean GDP per capita"]
mean_medals = full_medals_mean["Total Medals"]
correlation = mean_gdp.corr(mean_medals)
correlation

In [None]:
# Prepare the per-region means for plotting: no missing values, float dtype,
# string index, ordered from lowest to highest mean GDP.
full_medals_mean = full_medals_mean.copy()
for column in ("Total Medals", "Mean GDP per capita"):
    full_medals_mean[column] = full_medals_mean[column].fillna(0).astype(float)
full_medals_mean.index = full_medals_mean.index.astype(str)
full_medals_mean = full_medals_mean.sort_values("Mean GDP per capita", ascending=True)

# Only regions at or above the 90th percentile of mean medals get a text label.
threshold = np.percentile(full_medals_mean["Total Medals"], 90)
countries_to_label = (full_medals_mean["Total Medals"] >= threshold).to_dict()

fig, ax = plt.subplots(figsize=(10, 12))

# One horizontal bar per region showing its mean GDP per capita.
ax.barh(
    full_medals_mean.index,
    full_medals_mean["Mean GDP per capita"],
    color="#c5e3ec",
    alpha=0.8,
)

# Overlay a red bubble at the end of each bar, sized by the region's mean medal count.
for row_pos, (country, row) in enumerate(full_medals_mean.iterrows()):
    medals = row["Total Medals"]
    gdp = row["Mean GDP per capita"]
    if not medals > 0:
        continue  # regions without medals get no bubble
    ax.scatter(gdp, row_pos, s=medals * 50, color="red", edgecolors="black", alpha=0.8)
    if countries_to_label[country]:
        # Place the name to the right of the bubble.
        ax.text(gdp + (0.1 * gdp + 500), row_pos, country, fontsize=8, ha="left", va="center", color="black")

# Hide the per-region y ticks and labels.
ax.set_yticks([])
ax.set_yticklabels([])

ax.set_xlabel("Mean GDP per capita (USD)")
ax.set_ylabel("Countries")
ax.set_title("Mean GDP per capita & Olympic Medals per Country")

plt.show()