In [None]:
# %pip install -r requirements.txt

^C
Note: you may need to restart the kernel to use updated packages.


Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark (from -r requirements.txt (line 1))
  Downloading pyspark-3.5.5.tar.gz (317.2 MB)
     ---------------------------------------- 0.0/317.2 MB ? eta -:--:--
     ---------------------------------------- 0.0/317.2 MB ? eta -:--:--
     ---------------------------------------- 0.5/317.2 MB 4.2 MB/s eta 0:01:17
     ---------------------------------------- 1.3/317.2 MB 2.9 MB/s eta 0:01:49
     ---------------------------------------- 1.8/317.2 MB 2.9 MB/s eta 0:01:50
     ---------------------------------------- 2.4/317.2 MB 2.8 MB/s eta 0:01:53
     ---------------------------------------- 3.1/317.2 MB 2.8 MB/s eta 0:01:51
     ---------------------------------------- 3.4/317.2 MB 2.8 MB/s eta 0:01:54
     ---------------------------------------- 3.9/317.2 MB 2.7 MB/s eta 0:01:55
      --------------------------------------- 4.5/317.2 MB 2.7 MB/s eta 0:01:56
      -----------------------------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# Import statements
from pyspark.sql import SparkSession
from pyspark.sql.functions import max, col, to_date, date_format
import csv
import json
import os

import pandas as pd

In [2]:
# Initialise Spark session
spark = SparkSession.builder.appName("PowerliftingData").getOrCreate()

# Directory containing CSV files
input_dir = "../csv_files/"

# Get list of CSV files
csv_files = [os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith(".csv")]

# Load CSV files into a single DataFrame
df = spark.read.csv(csv_files, header=True, inferSchema=True)

In [3]:
# Set relevant columns
columns_of_interest = [
    "Name",
    "Sex",
    "Country",
    "Event",
    "Equipment",
    "Age",
    "AgeClass",
    "BirthYearClass",
    "Division",
    "BodyweightKg",
    "WeightClassKg",
    "Federation",
    "Squat1Kg",
    "Squat2Kg",
    "Squat3Kg",
    "Best3SquatKg",
    "Bench1Kg",
    "Bench2Kg",
    "Bench3Kg",
    "Best3BenchKg",
    "Deadlift1Kg",
    "Deadlift2Kg",
    "Deadlift3Kg",
    "Best3DeadliftKg",
    "TotalKg",
    "Place",
    "Goodlift",
    "Tested",
    "Country",
    "Date",
]

In [4]:
# Select DF with relevant columns
df_cleaned = df.select(columns_of_interest)

In [5]:
# Filter rows where 'Federation' is 'MaltaPA'
df_filtered = df_cleaned.filter(
    (col("Federation").isin("MaltaPA", "IPF", "EPF")) & 
    ((col("Federation").isin("MaltaPA")) | (col("Country") == "Malta"))
)

In [6]:
# Cast 'Goodlift' to float
df_casted = df_filtered.withColumn("Goodlift", col("Goodlift").cast("float"))

# Group by 'Name' and retain the maximum 'Goodlift'
df_max_goodlift = df_casted.groupBy("Name").agg(max("Goodlift").alias("Goodlift"))

# Join back to retain other columns for the rows with the highest 'Goodlift' per 'Name'
df_sorted = df_casted.join(
    df_max_goodlift,
    on=["Name", "Goodlift"],  # Join on both 'Name' and the maximum 'Goodlift'
    how="inner",
).orderBy(
    col("Goodlift").desc()
)  # Final sorting by 'Goodlift' descending

In [7]:
# Validation
df_sorted.show(n=df_sorted.count(), truncate=False)

+---------------------+--------+---+-------+-----+---------+----+--------+--------------+-----------+------------+-------------+----------+--------+--------+--------+------------+--------+--------+--------+------------+-----------+-----------+-----------+---------------+-------+-----+------+-------+----------+
|Name                 |Goodlift|Sex|Country|Event|Equipment|Age |AgeClass|BirthYearClass|Division   |BodyweightKg|WeightClassKg|Federation|Squat1Kg|Squat2Kg|Squat3Kg|Best3SquatKg|Bench1Kg|Bench2Kg|Bench3Kg|Best3BenchKg|Deadlift1Kg|Deadlift2Kg|Deadlift3Kg|Best3DeadliftKg|TotalKg|Place|Tested|Country|Date      |
+---------------------+--------+---+-------+-----+---------+----+--------+--------------+-----------+------------+-------------+----------+--------+--------+--------+------------+--------+--------+--------+------------+-----------+-----------+-----------+---------------+-------+-----+------+-------+----------+
|Evita Otigbah        |108.69  |F  |NULL   |SBD  |Raw      |30.5

In [8]:
# Get columns that actually matter by selecting wanted columns
df_final = df_sorted.select(
    "Name", 
    "Goodlift", 
    col("Sex").alias("Gender"),
    # Cast 'Date' to DateType and format it to DD-MM-YYYY
    date_format(to_date(col("Date"), "yyyy-MM-dd"), "dd-MM-yyyy").alias("Date")
)

In [14]:
# Final validation
df_final.show(n=df_final.count(), truncate=False)

+---------------------+--------+------+----------+
|Name                 |Goodlift|Gender|Date      |
+---------------------+--------+------+----------+
|Evita Otigbah        |108.69  |F     |09-03-2025|
|Benjamin Sacco       |97.77   |M     |09-03-2025|
|Matthew Mifsud       |97.55   |M     |09-03-2025|
|Jurgen Dalli         |94.66   |M     |09-03-2025|
|Neil Bezzina         |94.31   |M     |09-03-2025|
|Nikola Vuksanovic    |94.21   |M     |09-11-2024|
|Ian Pace             |91.81   |M     |09-03-2025|
|Maria Farrugia       |91.57   |F     |09-03-2025|
|Shawn Farrugia       |90.84   |M     |04-12-2023|
|Wayne Gregoraci      |90.0    |M     |04-12-2023|
|Alessandro Gatt      |89.59   |M     |09-03-2025|
|Lorna Cachia         |89.06   |F     |06-04-2024|
|Joseph Abela #1      |87.91   |M     |13-09-2024|
|Paul Gauci           |87.63   |M     |06-04-2024|
|Daryl Ruggier        |87.36   |M     |09-03-2025|
|Shanel M Mallia      |86.23   |F     |09-03-2025|
|Nicolas Azzopardi    |85.63   

In [15]:
# Convert Spark DataFrame to Pandas DataFrame
df_pandas = df_final.toPandas()

In [16]:
# Write the Pandas DataFrame to CSV file
output_file = "../updated-rankings.csv"
df_pandas.to_csv(output_file, index=False)

In [17]:
# Convert CSV to JSON for frontend, split by gender
file = "../updated-rankings.csv"
json_file_male = "../malta-national-rankings/json/updated-rankings-male.json"
json_file_female = "../malta-national-rankings/json/updated-rankings-female.json"

# Read CSV File
def read_CSV(file, json_file_male, json_file_female):
    male_rows = []
    female_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            if row['Gender'] == 'M':
                male_rows.append({field[i]: row[field[i]] for i in range(len(field))})
            elif row['Gender'] == 'F':
                female_rows.append({field[i]: row[field[i]] for i in range(len(field))})
        convert_write_json(male_rows, json_file_male)
        convert_write_json(female_rows, json_file_female)

# Convert csv data into json
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        json.dump(data, f, sort_keys=False, indent=4, separators=(",", ": "))

read_CSV(file, json_file_male, json_file_female)
