In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [None]:
# Obtain the Spark entry point: reuse the active session if there is one,
# otherwise start a new one registered under the app name "MinTemperatures".
spark = (
    SparkSession.builder
    .appName("MinTemperatures")
    .getOrCreate()
)

In [None]:
# Explicit column layout for the input file, in column order.
# Every field is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("stationID", StringType()),
        ("date", IntegerType()),
        ("measure_type", StringType()),
        ("temperature", FloatType()),
    )
])

Read the file as a DataFrame

In [None]:
# Load "1800.csv" using the explicit schema instead of letting Spark infer
# column names and types.
df = spark.read.csv("1800.csv", schema=schema)

# Print the resulting column layout as a quick sanity check.
df.printSchema()

Filter out all but TMIN entries

In [None]:
# Keep only the rows whose measure_type is exactly "TMIN".
minTemps = df.where(func.col("measure_type") == "TMIN")

Select only stationID and temperature

In [None]:
# Project down to the two columns needed for the per-station aggregation.
stationTemps = minTemps.select(
    func.col("stationID"),
    func.col("temperature"),
)

Aggregate to find minimum temperature for every station

In [None]:
# One row per station holding its lowest temperature reading.
# The aggregated column is named "min(temperature)".
minTempsByStation = (
    stationTemps
    .groupBy("stationID")
    .agg({"temperature": "min"})
)
minTempsByStation.show()

Convert temperature to Fahrenheit and sort the dataset

In [None]:
# Scale each station's minimum by 0.1, apply the Celsius -> Fahrenheit formula
# (x * 9/5 + 32), and round to two decimals. The result is stored as
# "temperature"; only stationID and temperature are kept, sorted ascending.
# NOTE(review): the 0.1 factor suggests the raw readings are tenths of a
# degree Celsius -- confirm against the data source.
minTempsByStationF = (
    minTempsByStation
    .withColumn(
        "temperature",
        func.round(
            func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0,
            2,
        ),
    )
    .select("stationID", "temperature")
    .orderBy("temperature")
)

# Pull the sorted rows back to the driver as a list of Row objects.
results = minTempsByStationF.collect()