In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("ChargementWeatherData").getOrCreate()

# Read the headerless, semicolon-separated CSV with inferred column types, then
# replace Spark's positional default names (_c0.._c3) with descriptive ones.
# NOTE(review): "Wheather_Station" is misspelled; the later cell names the same
# column "Weather_Station". Kept as-is here so the recorded output still matches.
df = (
    spark.read
    .options(sep=";", inferSchema="true", header="false")
    .csv("/data/1800 weather new.csv")
    .withColumnRenamed("_c0", "Wheather_Station")
    .withColumnRenamed("_c1", "Date")
    .withColumnRenamed("_c2", "Type_of_Measurement")
    .withColumnRenamed("_c3", "Temperature")
)

# Preview the first five rows
df.show(5)

+----------------+--------+-------------------+-----------+
|Wheather_Station|    Date|Type_of_Measurement|Temperature|
+----------------+--------+-------------------+-----------+
|     ITE00100554|18000101|               TMAX|        -75|
|     ITE00100554|18000101|               TMIN|       -148|
|     GM000010962|18000101|               PRCP|          0|
|     EZE00100082|18000101|               TMAX|        -86|
|     EZE00100082|18000101|               TMIN|       -135|
+----------------+--------+-------------------+-----------+
only showing top 5 rows



### Expected Output: The average maximum temperature for the year 1800.

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session used by process_max_temperature below.
# NOTE(review): getOrCreate() presumably returns the session created by the
# earlier cell if it is still active, in which case the "TMAX" app name may not
# take effect — confirm.
spark = SparkSession.builder.appName("TMAX").getOrCreate()

def process_max_temperature(file_path):
    """Load a weather CSV, keep its TMAX rows and report their average.

    Reads a headerless, semicolon-separated CSV through the module-level
    ``spark`` session, names its first four columns, keeps only the rows whose
    ``Type_of_Measurement`` equals ``"TMAX"`` and adds an
    ``Adjusted_Temperature`` column equal to ``Temperature / 10``. The first
    five filtered rows are displayed and the average adjusted temperature is
    printed.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The filtered DataFrame (TMAX rows only), including the
        ``Adjusted_Temperature`` column.
    """
    df = spark.read.option("sep", ";").option("inferSchema", "true").option("header", "false").csv(file_path)

    # Replace Spark's positional default column names with descriptive ones
    df = df.withColumnRenamed("_c0", "Weather_Station") \
           .withColumnRenamed("_c1", "Date") \
           .withColumnRenamed("_c2", "Type_of_Measurement") \
           .withColumnRenamed("_c3", "Temperature")

    # Keep only the TMAX measurements
    tmax_df = df.filter(df["Type_of_Measurement"] == "TMAX")

    # Scale the raw values by 1/10
    # (presumably the file stores tenths of a degree Celsius — TODO confirm)
    tmax_df = tmax_df.withColumn("Adjusted_Temperature", tmax_df["Temperature"] / 10)

    # A global aggregation always yields exactly one row, but its value is
    # None when there is nothing to average (no TMAX rows, or only nulls).
    avg_row = tmax_df.agg(F.avg("Adjusted_Temperature").alias("Average_Max_Temperature")).collect()[0]
    avg_tmax = avg_row["Average_Max_Temperature"]

    tmax_df.show(5)

    # Report the average. Formatting None with ":.2f" would raise TypeError,
    # so the empty case gets its own message instead of crashing.
    # NOTE(review): the message hard-codes the year 1800 whatever file_path is.
    if avg_tmax is None:
        print("Aucune mesure TMAX exploitable : impossible de calculer la température maximale moyenne.")
    else:
        print(f"La température maximale moyenne pour l'année 1800 est : {avg_tmax:.2f}°C")

    return tmax_df



# Run the TMAX pipeline on the weather CSV and keep the filtered DataFrame it returns
df_only_tmax = process_max_temperature("/data/1800 weather new.csv")

+---------------+--------+-------------------+-----------+--------------------+
|Weather_Station|    Date|Type_of_Measurement|Temperature|Adjusted_Temperature|
+---------------+--------+-------------------+-----------+--------------------+
|    ITE00100554|18000101|               TMAX|        -75|                -7.5|
|    EZE00100082|18000101|               TMAX|        -86|                -8.6|
|    ITE00100554|18000102|               TMAX|        -60|                -6.0|
|    EZE00100082|18000102|               TMAX|        -44|                -4.4|
|    ITE00100554|18000103|               TMAX|        -23|                -2.3|
+---------------+--------+-------------------+-----------+--------------------+
only showing top 5 rows

La température maximale moyenne pour l'année 1800 est : 14.50°C


In [12]:
# Stop the SparkSession
spark.stop()