In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType
from pyspark.sql.functions import *

### That code loads temperature measured by some stations and returns the lowest temperature each station measured in that time window

# Defining schema and reading file

In [29]:
#Starting Spark Session
spark = SparkSession.builder.appName("minTemperatures").master("local[*]").getOrCreate()

#Defining the schema
temperatureSchema = StructType([ \
    StructField("stationID", StringType(),True), \
    StructField("date", IntegerType(),True), \
    StructField("measure_type", StringType(),True), \
    StructField("temperature", FloatType(), True), \
    ])

#Reading data
stationTemps = spark.read.option("header", "true").schema(temperatureSchema).csv("data/1800.csv")


In [31]:
print("Here is our inferred schema:")
stationTemps.printSchema() #Printing the schema

#Get the lowest temperature by station
minTempStations = stationTemps.groupBy("stationID").min("temperature") 

minTempStations.show()

Here is our inferred schema:
root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|GM000010962|             0.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [32]:
spark.stop()