- Create a Spark DataFrame from the `players_male_base` table in the silver database

In [0]:
# Load the source player table; every later cell derives from this DataFrame.
source_table = 'silver.players_male_base'
avg_rating_df = spark.table(source_table)

- Average rating of the top 10 under-20 players (born in 2004 or later) per country, keeping only players whose flag is not inactive (i.e. the flag is null)

In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Selection thresholds, named so they can be tuned in one place instead of
# being buried inside the expressions below.
MIN_BIRTH_YEAR = 2004    # lower bound for `birthday` (assumes the column holds a birth year -- TODO confirm)
TOP_N_PER_COUNTRY = 10   # number of highest-rated players averaged per country

# Keep only rated players (rating neither null nor 0) whose birthday value is
# MIN_BIRTH_YEAR or later and whose flag is null.
filtered_df = avg_rating_df.filter(
    F.col('rating').isNotNull()
    & (F.col('rating') != 0)
    & (F.col('birthday') >= MIN_BIRTH_YEAR)
    & F.col('flag').isNull()
)

# Rank players inside each country from highest to lowest rating.
window_spec = Window.partitionBy('country').orderBy(F.desc('rating'))

# row_number (rather than rank/dense_rank) yields at most TOP_N_PER_COUNTRY rows
# per country. Players tied at the cut-off are ordered arbitrarily, but they share
# the same rating, so the per-country average is unaffected by which one is kept.
ranked_df = filtered_df.withColumn('rank', F.row_number().over(window_spec))

# Keep only the top-N players per country. Countries with fewer than N
# qualifying players keep all of them.
top_10_df = ranked_df.filter(F.col('rank') <= TOP_N_PER_COUNTRY)

# Average the ratings of the retained players for each country.
average_rating_by_country = top_10_df.groupBy('country').agg(
    F.avg('rating').alias('average_rating')
)

# Sort on the unrounded average first, then round to 1 decimal place for presentation.
average_rating_by_country = (
    average_rating_by_country
    .orderBy(F.desc('average_rating'))
    .withColumn('average_rating', F.round('average_rating', 1))
)

# Display the result
display(average_rating_by_country)

country,average_rating
IND,2633.5
USA,2532.3
GER,2493.9
FID,2473.1
TUR,2470.8
FRA,2460.7
UZB,2448.8
IRI,2426.6
ISR,2424.4
POL,2423.3


In [0]:
# Persist the per-country averages to the gold layer, replacing any existing table contents.
(
    average_rating_by_country.write
    .mode('overwrite')
    .saveAsTable('gold.M_20_average_rating_by_country')
)