# Gold Layer - Olympics and Country GDP
The gold layer aims to provide answers to the questions we've proposed:
- Is there a correlation between age and getting a medal?
- Which countries have been improving their rankings?
- Is there a correlation between the country GDP and getting a medal?

In [0]:
# Base path for parquet files.
# get_parquet_path() concatenates this directly with the file name, so it must
# keep its trailing "/".
path = "/dbfs/user/joaolopes1/"

# File names for different datasets (without the ".parquet" extension, which
# get_parquet_path() appends). The same names are used as keys of the `df`
# dictionary built in the next cell.
olympics = "olympics"
olympics_cleaned = "olympics_cleaned"
medalists = "medalists"
medalists_age = "medalists_age"
medal_rank = "medal_rank"
medals_ranked_by_year = "medals_ranked_by_year"
gdp = "gdp"
gdp_ranked = "gdp_ranked"
gdp_avg_by_country = "gdp_avg_by_country"

# Function to construct the full path to a parquet file
def get_parquet_path(file_name):
    """Return the full path of the parquet file named ``file_name``.

    The result is the module-level ``path`` prefix, followed by ``file_name``
    and the ``.parquet`` extension, joined without any extra separator.
    """
    pieces = (path, file_name, ".parquet")
    return "".join(pieces)


# Function to load a dataframe from a parquet file
def load_dataframe(file_name):
    """Read the parquet dataset named ``file_name`` and return the result.

    The location is resolved with ``get_parquet_path`` and read through the
    ambient ``spark`` object, which is not defined in this notebook
    (presumably the SparkSession provided by Databricks).
    """
    parquet_path = get_parquet_path(file_name)
    return spark.read.parquet(parquet_path)

In [0]:
# Load dataframes from parquet files and store them in a dictionary
df = {
    dataset_name: load_dataframe(dataset_name)
    for dataset_name in (
        # Olympics: original dataset
        olympics,
        # Olympics: cleaned datasets
        olympics_cleaned,
        medalists,
        medalists_age,
        medal_rank,
        medals_ranked_by_year,
        # GDP: original dataset
        gdp,
        # GDP: cleaned datasets
        gdp_ranked,
        gdp_avg_by_country,
    )
}

## Data Overview

### Olympics

In [0]:
# Preview the original Olympics dataset (the "olympics" entry of `df`).
display(df[olympics])

ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
1724,Aristidis Akratopoulos,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Tennis,Tennis Men's Singles,
1724,Aristidis Akratopoulos,M,,,,Greece-3,GRE,1896 Summer,1896,Summer,Athina,Tennis,Tennis Men's Doubles,
1725,"Konstantinos ""Kostas"" Akratopoulos",M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Tennis,Tennis Men's Singles,
1725,"Konstantinos ""Kostas"" Akratopoulos",M,,,,Greece-3,GRE,1896 Summer,1896,Summer,Athina,Tennis,Tennis Men's Doubles,
4113,Anastasios Andreou,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Athletics,Athletics Men's 110 metres Hurdles,
4116,Ioannis Andreou,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Swimming,"Swimming Men's 1,200 metres Freestyle",Silver
4189,Nikolaos Andriakopoulos,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Gymnastics,Gymnastics Men's Rope Climbing,Gold
4431,Georgios Anninos,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Swimming,Swimming Men's 100 metres Freestyle,
4493,Antelothanasis,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Shooting,"Shooting Men's Free Rifle, Three Positions, 300 metres",
5660,Georgios Aspiotis,M,,,,Greece,GRE,1896 Summer,1896,Summer,Athina,Cycling,"Cycling Men's Road Race, Individual",


### GDP

In [0]:
# Preview the original GDP dataset (the "gdp" entry of `df`).
display(df[gdp])

Country Name,Country Code,Year,Value
Arab World,ARB,1968,25760683041.0857
Arab World,ARB,1969,28434203615.4829
Arab World,ARB,1970,31385499664.0672
Arab World,ARB,1971,36426909888.3928
Arab World,ARB,1972,43316056615.4562
Arab World,ARB,1973,55018394945.5825
Arab World,ARB,1974,105145803084.377
Arab World,ARB,1975,116337021938.341
Arab World,ARB,1976,144846175400.488
Arab World,ARB,1977,167308327683.593


## Is there a correlation between age and getting a medal?

Yes, there is. The data shows that most medals were won by athletes between 19 and 31 years old, so if your age is within that range you have a better chance of winning an Olympic medal 🥇 🥈 🥉

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Pre-computed "medalists_age" dataset. In the recorded output its columns are
# Age, Sex, Bronze, Gold, Silver, Total_Medals and Rank.
df_medalists_age = df[medalists_age]

display(df_medalists_age)

Age,Sex,Bronze,Gold,Silver,Total_Medals,Rank
23.0,F,64,78,57,199,1
25.0,F,68,56,67,191,2
27.0,F,66,63,60,189,3
26.0,F,55,69,62,186,4
24.0,F,54,66,53,173,5
22.0,F,53,61,53,167,6
21.0,F,61,47,57,165,7
29.0,F,53,48,34,135,8
20.0,F,34,41,38,113,9
28.0,F,38,38,37,113,10


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

## Which countries have been improving their rankings?

- France
- Spain
- Armenia
- Jamaica
- Brazil
- New Zealand
- Cameroon
- Serbia
- United Kingdom

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Window over each country's rows (partitioned by "NOC"), ordered by "Year".
# Used by calculate_rank_distribution to collect ranks in chronological order.
country_window_ordered_by_year = Window.partitionBy("NOC").orderBy("Year")


def calculate_rank_distribution(df):
    """Add a ``Rank_Distribution`` column holding each country's ranks by year.

    The column is ``collect_list("Rank")`` evaluated over
    ``country_window_ordered_by_year``. That window is ordered and declares no
    explicit frame, so (per Spark's default frame for ordered windows) each
    row is expected to receive the ranks from the start of its NOC partition
    up to the current row, not the full list. The longest list per country is
    picked afterwards by ``remove_duplicates``.
    """
    ranks_so_far = F.collect_list("Rank").over(country_window_ordered_by_year)
    return df.withColumn("Rank_Distribution", ranks_so_far)


def select_noc_and_rank_distribution(df):
    """Project ``df`` down to its ``NOC`` and ``Rank_Distribution`` columns."""
    kept_columns = ("NOC", "Rank_Distribution")
    return df.select(*kept_columns)


# Per-country window (partitioned by "NOC") that orders rows by the length of
# their "Rank_Distribution" list, longest first. Used by remove_duplicates.
country_window_ordered_by_rank_dist = Window.partitionBy("NOC").orderBy(
    F.size("Rank_Distribution").desc()
)


def remove_duplicates(df):
    """Keep a single row per NOC: the one ranked first by the window.

    Rows are numbered with ``row_number()`` over
    ``country_window_ordered_by_rank_dist`` (longest ``Rank_Distribution``
    first) and only row number 1 is kept. The numbering lives in a temporary
    ``Rank`` column that is dropped before returning.
    """
    position_in_country = F.row_number().over(country_window_ordered_by_rank_dist)
    numbered = df.withColumn("Rank", position_in_country)
    first_per_country = numbered.filter(F.col("Rank") == 1)
    return first_per_country.drop("Rank")


def order_by_noc(df):
    """Return ``df`` ordered by its ``NOC`` column."""
    sort_column = "NOC"
    return df.orderBy(sort_column)


# Build one row per country (NOC) with the list of its yearly ranks:
#   1. order the per-year ranking by NOC,
#   2. attach the running list of ranks per country (ordered by Year),
#   3. keep only NOC and that list,
#   4. keep, per country, the row with the longest list.
df_country_rank_distribution = (
    df[medals_ranked_by_year]
    .transform(order_by_noc)
    .transform(calculate_rank_distribution)
    .transform(select_noc_and_rank_distribution)
    .transform(remove_duplicates)
)

display(df_country_rank_distribution)

NOC,Rank_Distribution
AHO,List(28)
ALG,"List(20, 46, 51, 45)"
ARG,"List(21, 31, 23, 33, 27, 29, 7, 7, 20, 14)"
ARM,"List(53, 54, 27)"
AUS,"List(7, 10, 11, 6, 16, 11, 19, 5, 12, 10, 3, 2, 8, 12, 3)"
AUT,"List(30, 30, 21, 35, 30)"
AZE,"List(36, 25, 48, 45, 42, 42)"
BAH,"List(16, 30, 14, 50, 39, 25, 56)"
BEL,"List(30, 22, 35, 26, 40, 45, 42, 38)"
BLR,"List(31, 33, 40, 11, 22)"


In [0]:
# Per-year medal ranking ordered by NOC, shown for reference next to the
# rank lists built above.
df_medal_rank_by_year = df[medals_ranked_by_year].transform(order_by_noc)

display(df_medal_rank_by_year)

Year,Rank,NOC,Gold,Silver,Bronze
1988,28,AHO,0,1,0
1992,20,ALG,1,0,0
1996,46,ALG,0,0,1
2000,51,ALG,0,0,1
2008,45,ALG,0,1,0
1960,21,ARG,0,2,0
1968,31,ARG,0,0,1
1972,23,ARG,0,1,0
1988,33,ARG,0,0,6
1996,27,ARG,0,9,1


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Define a UDF to rank the "Rank Distribution" array column
def rank_distribution(rank_dist):
    """Score how much a ranking improved between its first and last entry.

    The score is the sum of ``previous - current`` over every pair of
    consecutive ranks. Those differences telescope, so the sum is exactly
    ``rank_dist[0] - rank_dist[-1]``; intermediate ranks do not influence the
    result. It is computed directly instead of looping.

    Args:
        rank_dist: Sequence of numeric ranks (as built in this notebook: a
            country's ranks ordered by year, where a lower rank is better),
            or ``None`` / an empty sequence.

    Returns:
        The first rank minus the last rank: positive when the last rank is
        numerically lower (better) than the first, negative when it is
        higher, and 0 for ``None``, an empty sequence, or a single entry.
    """
    if not rank_dist:
        return 0
    # Sum of (prev - cur) over consecutive pairs == first - last.
    return rank_dist[0] - rank_dist[-1]

# Wrap the Python function as a Spark UDF declared to return an IntegerType.
rank_distribution_udf = F.udf(rank_distribution, IntegerType())

# Apply the UDF to the DataFrame: add "Rank_Distribution_Rank" (the UDF's score
# for each row's "Rank_Distribution") and sort so the highest scores come first.
df_country_rank_distribution_ranked = df_country_rank_distribution.withColumn(
    "Rank_Distribution_Rank", rank_distribution_udf(F.col("Rank_Distribution"))
).orderBy(F.col("Rank_Distribution_Rank").desc())

display(df_country_rank_distribution_ranked)

NOC,Rank_Distribution,Rank_Distribution_Rank
SRB,"List(41, 49, 13)",28
ARM,"List(53, 54, 27)",26
BRA,"List(31, 25, 29, 27, 18, 14, 20, 9, 17, 31, 10, 10, 14, 5)",26
GBR,"List(24, 8, 7, 15, 17, 13, 9, 7, 13, 28, 7, 8, 6, 2, 2)",22
CMR,"List(28, 9)",19
ESP,"List(30, 23, 12, 15, 16, 4, 9, 17, 15, 19, 17, 12)",18
POR,"List(46, 51, 28)",18
FRA,"List(25, 21, 9, 21, 12, 3, 7, 10, 17, 3, 4, 11, 4, 3, 10)",15
JAM,"List(23, 23, 32, 34, 32, 18, 18, 10, 8)",15
NZL,"List(26, 10, 8, 5, 10, 14, 29, 36, 17, 12, 11, 11)",15


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
def top_country_codes(df):
    """Rename the Olympic key columns and keep at most 10 rows.

    ``NOC`` becomes ``Country Code`` and ``Rank`` becomes ``Olympic Rank``,
    then ``limit(10)`` is applied. No ordering is done here, so which rows are
    kept depends on the order of the incoming DataFrame.
    """
    with_country_code = df.withColumnRenamed("NOC", "Country Code")
    with_olympic_rank = with_country_code.withColumnRenamed("Rank", "Olympic Rank")
    return with_olympic_rank.limit(10)

def select_distinct_countries(df):
    """Return the distinct values of the ``Country Name`` column of ``df``."""
    country_names = df.select("Country Name")
    return country_names.distinct()

df_gdp = df[gdp]

# Take the first 10 rows of the ranked countries, inner-join them to the GDP
# data on "Country Code", and keep the distinct country names.
# NOTE(review): in the recorded run this yields 9 names although limit(10) is
# applied; POR is in the top 10 above but no matching country shows up here.
# Presumably some Olympic NOC codes differ from the country codes used in the
# GDP dataset, so the inner join drops those rows. Confirm.
df_countries = df_country_rank_distribution_ranked.transform(top_country_codes).join(
    df_gdp, on="Country Code", how="inner"
).transform(select_distinct_countries)

display(df_countries)

Country Name
France
Spain
Armenia
Jamaica
Brazil
New Zealand
Cameroon
Serbia
United Kingdom


## Is there a correlation between the country GDP and getting a medal?

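The correlation between a country's Olympic rank and its GDP rank is about 0.64 (see the value computed at the end of this section). It is above 0.5, so there is a moderate positive relationship, but it is not a strong correlation.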

In [0]:
# Pre-computed "medal_rank" dataset. In the recorded output it has one row per
# NOC with its Rank and Gold/Silver/Bronze counts.
df_medal_rank = df[medal_rank]
display(df_medal_rank)

Rank,NOC,Gold,Silver,Bronze
1,USA,450,223,204
2,URS,142,111,114
3,ITA,98,106,121
4,AUS,79,111,129
5,GBR,79,81,85
6,GER,75,67,81
7,GDR,68,46,54
8,FRA,67,65,98
9,HUN,62,45,62
10,RUS,55,47,63


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:

# Pre-computed "gdp_avg_by_country" dataset. In the recorded output it has
# Rank, Country Name, Formatted Value, Country Code and Average Value columns.
df_gdp_avg = df[gdp_avg_by_country]
display(df_gdp_avg)

Rank,Country Name,Formatted Value,Country Code,Average Value
1,United States,"$6,773,383,877,192.98",USA,6773383877192.982
2,Japan,"$2,664,703,527,403.46",JPN,2664703527403.463
3,China,"$1,790,502,735,869.80",CHN,1790502735869.8003
4,France,"$1,154,818,578,804.54",FRA,1154818578804.5354
5,United Kingdom,"$1,137,233,329,744.12",GBR,1137233329744.1208
6,Italy,"$926,943,534,689.03",ITA,926943534689.0322
7,Russian Federation,"$898,476,612,097.77",RUS,898476612097.7684
8,Brazil,"$634,457,987,835.86",BRA,634457987835.8574
9,Canada,"$616,458,939,921.54",CAN,616458939921.535
10,Spain,"$527,344,712,711.14",ESP,527344712711.1363


Databricks visualization. Run in Databricks to view.

In [0]:
def rename_olympic_columns(df):
    """Rename ``NOC`` to ``Country Code`` and ``Rank`` to ``Olympic Rank``.

    The new names line the Olympic data up with the GDP data: ``Country Code``
    is the join key and ``Olympic Rank`` avoids a clash with the GDP ``Rank``.
    """
    renames = (("NOC", "Country Code"), ("Rank", "Olympic Rank"))
    for old_name, new_name in renames:
        df = df.withColumnRenamed(old_name, new_name)
    return df


def rename_gdp_columns(df):
    """Rename the ``Rank`` column of ``df`` to ``GDP Rank``."""
    old_name, new_name = "Rank", "GDP Rank"
    return df.withColumnRenamed(old_name, new_name)


# Join the overall medal ranking with the average-GDP ranking on
# "Country Code". Both sides are renamed first so the two "Rank" columns
# become "Olympic Rank" and "GDP Rank". No `how` is passed, so PySpark's
# default join type (inner) applies: countries missing on either side are
# dropped.
df_medal_rank_gdp = df_medal_rank.transform(rename_olympic_columns).join(
    df_gdp_avg.transform(rename_gdp_columns), on="Country Code"
)

# Show the two ranks side by side per country.
display(
    df_medal_rank_gdp.select(
        "Country Name",
        "GDP Rank",
        "Olympic Rank",
    )
)

Country Name,GDP Rank,Olympic Rank
United States,1,1
Japan,2,26
China,3,11
France,4,8
United Kingdom,5,5
Italy,6,3
Russian Federation,7,10
Brazil,8,18
Canada,9,21
Spain,10,15


Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import corr

# Correlation coefficient between the two rank columns, computed by
# pyspark.sql.functions.corr. The select yields a single row with a single
# column; collect()[0][0] extracts that value as a Python number.
correlation = df_medal_rank_gdp.select(corr("Olympic Rank", "GDP Rank")).collect()[0][0]

display(correlation)

0.6400809002278126

In [0]:
from pyspark.sql.functions import col, abs

# Absolute gap between each country's Olympic rank and GDP rank, smallest gap
# first (countries whose two ranks agree most closely come first).
# NOTE: `abs` here is pyspark.sql.functions.abs from the import above, which
# shadows Python's builtin abs for the rest of the notebook.
df_medal_rank_gdp_diff = df_medal_rank_gdp.withColumn(
    "Rank Diff", abs(col("Olympic Rank") - col("GDP Rank"))
).select(
    "Country Name",
    "Olympic Rank",
    "GDP Rank",
    "Rank Diff",
).orderBy(col("Rank Diff").asc())

display(df_medal_rank_gdp_diff)

Country Name,Olympic Rank,GDP Rank,Rank Diff
United States,1,1,0
United Kingdom,5,5,0
Belarus,45,44,1
Uzbekistan,51,49,2
Georgia,73,71,2
Uganda,75,73,2
Italy,3,6,3
Russian Federation,10,7,3
Morocco,46,43,3
Serbia,49,46,3


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.