In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [15]:
# Create (or reuse) the SparkSession that the later cells read from.
# NOTE(review): the app name looks like a typo of "MinTemperatures"; kept as-is.
spark = (
    SparkSession.builder
    .appName("MinTempeatures")
    .getOrCreate()
)

In [34]:
# Explicit schema handed to the CSV reader; all four fields are declared nullable.
# Line continuations are unnecessary inside the brackets, so none are used.
schema = StructType([
    StructField("stationID", StringType(), nullable=True),
    StructField("date", IntegerType(), nullable=True),
    StructField("measure_type", StringType(), nullable=True),
    StructField("temperature", FloatType(), nullable=True),
])

In [37]:
# Load the CSV using the explicit schema, then inspect the resulting
# column types and the first five rows.
df = spark.read.csv("resources/1800.csv", schema=schema)
df.printSchema()
df.show(n=5)

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)

+-----------+--------+------------+-----------+
|  stationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMAX|      -75.0|
|ITE00100554|18000101|        TMIN|     -148.0|
|GM000010962|18000101|        PRCP|        0.0|
|EZE00100082|18000101|        TMAX|      -86.0|
|EZE00100082|18000101|        TMIN|     -135.0|
+-----------+--------+------------+-----------+
only showing top 5 rows



In [38]:
# Keep only the rows whose measure_type is "TMIN"; every other entry is dropped.
minTemps = df.where(func.col("measure_type") == "TMIN")

In [39]:
# Narrow the frame to the two columns used by the aggregation: stationID and temperature.
stationTemps = minTemps.select(func.col("stationID"), func.col("temperature"))

In [40]:
# Compute the lowest temperature recorded for each station.
# The aggregate column is named "min(temperature)".
minTempsByStation = stationTemps.groupBy("stationID").agg(func.min("temperature"))
minTempsByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [42]:
# Convert each station's minimum to Fahrenheit (x * 9/5 + 32), rounded to two
# decimals, then sort the result by the converted temperature.
# NOTE(review): the 0.1 factor presumably rescales raw readings stored in
# tenths of a degree Celsius - confirm against the dataset documentation.
minTempsByStationF = (
    minTempsByStation
    .withColumn(
        "temperature",
        func.round(func.col("min(temperature)") * 0.1 * (9 / 5) + 32, 2),
    )
    .select("stationID", "temperature")
    .sort("temperature")
)

In [43]:
# Pull the sorted per-station rows back to the driver as a Python list of Row
# objects so they can be formatted and printed locally.
results = minTempsByStationF.collect()

In [47]:
# Print one "stationID : temperature" line per collected row.
# Each row holds exactly two fields, so it unpacks directly.
for station_id, fahrenheit in results:
    print(f"{station_id} : {fahrenheit}")

ITE00100554 : 5.36
EZE00100082 : 7.7


In [48]:
# Shut down the SparkSession now that the results have been printed.
spark.stop()