- Loading the event_games and players_current_month tables from the silver database as Spark DataFrames

In [0]:
# Load the two silver-layer source tables as Spark DataFrames.
games_df = spark.table("silver.event_games")
players_df = spark.table("silver.players_current_month")

- Joining the games and players tables on the FIDE id, and adding a country column for the white player and for the black player to the games table

In [0]:
from pyspark.sql import functions as F

# Name each DataFrame so its columns can be referenced unambiguously.
# The players table is aliased twice because it is joined once per colour.
games_df_aliased = games_df.alias("games")
players_df_white = players_df.alias("players_white")
players_df_black = players_df.alias("players_black")

# Join conditions: each side's FIDE id in the games table against players.fideid.
white_match = F.col("games.white_fide_id") == F.col("players_white.fideid")
black_match = F.col("games.black_fide_id") == F.col("players_black.fideid")

# Left outer joins keep every game; a player without a match yields a null country.
# NOTE(review): assumes fideid is unique in players_current_month - duplicate ids
# would repeat game rows. Confirm against the table.
joined_df = (
    games_df_aliased
    .join(players_df_white, white_match, "left_outer")
    .join(players_df_black, black_match, "left_outer")
    .select(
        F.col("games.*"),
        F.col("players_white.country").alias("white_country"),
        F.col("players_black.country").alias("black_country"),
    )
)

# Show the games enriched with both country columns
display(joined_df)

- Ordering the columns

In [0]:
# Reorder joined_df so that white_country sits right after "white" and
# black_country right after "black"; all other columns keep their relative order.
ordered_columns = [
    name for name in joined_df.columns if name not in ["white_country", "black_country"]
]

# Each anchor position is looked up immediately before its own insert, so the
# result is correct whichever of "white" / "black" comes first in the table.
# (A fixed offset for the second insert is only right when "white" precedes "black".)
# list.index raises ValueError if the "white" or "black" column is missing.
white_index = ordered_columns.index("white") + 1
ordered_columns.insert(white_index, "white_country")

black_index = ordered_columns.index("black") + 1
ordered_columns.insert(black_index, "black_country")

# Selecting columns in the new order
joined_df = joined_df.select(*ordered_columns)

# Display the reordered DataFrame
display(joined_df)

In [0]:
# Persist the enriched games, replacing any previous version of the table.
(
    joined_df.write
    .mode("overwrite")
    .saveAsTable("silver.games_with_country")
)

In [0]:
# Keep only the games whose white player has country code "IND".
white_is_ind = F.col("white_country") == "IND"
ind_df = joined_df.where(white_is_ind)

display(ind_df)

In [0]:
# Persist the IND-as-white games, replacing any previous version of the table.
(
    ind_df.write
    .mode("overwrite")
    .saveAsTable("silver.games_IND")
)

- Filtering for white_country 'IND', counting the games per opening, and ordering by count descending

In [0]:
# Number of games per opening where the white player is from IND, most frequent first.
ind_white_games = joined_df.filter(F.col("white_country") == "IND")
games_per_opening = ind_white_games.groupBy("opening").agg(
    F.count("*").alias("openings_count")
)
openings_count_df = games_per_opening.orderBy(F.col("openings_count").desc())

# Show the opening frequencies
display(openings_count_df)

- Filtering for white_country 'IND' and result_of_match 'White wins', then counting the games per opening; result_of_match is kept as a grouping column, so after the filter it only shows 'White wins'

In [0]:
from pyspark.sql import functions as F

# Games won by white where the white player is from IND.
ind_white_win = (F.col("white_country") == "IND") & (
    F.col("result_of_match") == "White wins"
)

# Count per (opening, result_of_match); ties on the count are broken by opening name.
openings_results_df = (
    joined_df
    .filter(ind_white_win)
    .groupBy("opening", "result_of_match")
    .agg(F.count("*").alias("openings_count"))
    .orderBy(F.col("openings_count").desc(), F.col("opening"))
)

# Show the winning-opening counts
display(openings_results_df)

- Calculating the white win percentage for each opening played by Indian players as white, keeping only openings with more than 100 white wins (victory_count_ind > 100) for a more reliable statistic

In [0]:
# Condition shared by both aggregations: the white player is from IND.
ind_as_white = F.col("white_country") == "IND"

# Denominator: all games per opening played by IND white players.
total_openings_df = (
    joined_df
    .filter(ind_as_white)
    .groupBy("opening")
    .agg(F.count("*").alias("total_openings_count"))
)

# Numerator: the subset of those games that white won.
ind_white_wins_df = (
    joined_df
    .filter(ind_as_white & (F.col("result_of_match") == "White wins"))
    .groupBy("opening")
    .agg(F.count("*").alias("victory_count_ind"))
)

# Win rate in percent, rounded to one decimal place.
win_rate_ind = F.round(
    (F.col("victory_count_ind") / F.col("total_openings_count") * 100), 1
)

# Combine both counts per opening, keep openings with more than 100 wins,
# and rank them by win rate.
white_wins_percentage_df_ind = (
    ind_white_wins_df
    .join(total_openings_df, ["opening"], "inner")
    .withColumn("percentage_ind", win_rate_ind)
    .filter(F.col("victory_count_ind") > 100)
    .select("opening", "victory_count_ind", "percentage_ind")
    .orderBy(F.col("percentage_ind").desc())
)

# Show the IND win rates
display(white_wins_percentage_df_ind)

In [0]:
# Persist the IND win rates to the gold layer, replacing any previous version.
(
    white_wins_percentage_df_ind.write
    .mode("overwrite")
    .saveAsTable("gold.IND_most_ef_openings")
)

- Calculating the white win percentage for each opening across all players worldwide, keeping only openings with more than 100 white wins (white_wins_count > 100) for a more reliable statistic

In [0]:
# Denominator: all games per opening, regardless of country.
total_openings_df = (
    joined_df
    .groupBy("opening")
    .agg(F.count("*").alias("total_openings_count"))
)

# Numerator: games per opening that white won.
world_white_wins_df = (
    joined_df
    .filter(F.col("result_of_match") == "White wins")
    .groupBy("opening")
    .agg(F.count("*").alias("white_wins_count"))
)

# Win rate in percent, rounded to one decimal place.
win_rate_world = F.round(
    (F.col("white_wins_count") / F.col("total_openings_count") * 100), 1
)

# Combine both counts per opening, keep openings with more than 100 wins,
# and rank them by win rate.
white_wins_percentage_df_world = (
    world_white_wins_df
    .join(total_openings_df, ["opening"], "inner")
    .withColumn("percentage", win_rate_world)
    .filter(F.col("white_wins_count") > 100)
    .select("opening", "white_wins_count", "percentage")
    .orderBy(F.col("percentage").desc())
)

# Show the worldwide win rates
display(white_wins_percentage_df_world)

In [0]:
# Give the world columns "_world" names so they can sit next to the "_ind" columns.
world_column_renames = {
    "white_wins_count": "victory_count_world",
    "percentage": "percentage_world",
}

# withColumnRenamed leaves the DataFrame unchanged when the old column is absent,
# so running this cell again after the rename is harmless.
for old_name, new_name in world_column_renames.items():
    white_wins_percentage_df_world = white_wins_percentage_df_world.withColumnRenamed(
        old_name, new_name
    )

# Show the renamed DataFrame
display(white_wins_percentage_df_world)

- Joining the Indian and world DataFrames on opening to compare their win statistics

In [0]:
# Join the dataframes on the 'opening' column
openings_comp_ind_world = white_wins_percentage_df_world.join(
    white_wins_percentage_df_ind, 
    on="opening", 
    how="inner"
).select(
    "opening", 
    "victory_count_world", 
    "victory_count_ind", 
    "percentage_world", 
    "percentage_ind"
).orderBy(
    F.col("percentage_ind").desc()
)

# Display the result
display(openings_comp_ind_world)

In [0]:
# Persist the IND vs. world comparison to the gold layer, replacing any previous version.
(
    openings_comp_ind_world.write
    .mode("overwrite")
    .saveAsTable("gold.IND_world_comp_openings")
)