### Calculate minimum temperature by location. ###

In [9]:
# Import packages.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
import os, json

# (Re)start the local Spark context. If a context named `sc` already exists
# in this kernel it is stopped first; on a fresh kernel the lookup of `sc`
# raises NameError, which simply means there is nothing to stop.
try:
    sc.stop()
except NameError:
    pass
finally:
    # Runs on every path, so the cell always ends with a fresh context.
    print('Spinning up Spark cluster ...')
    conf = (
        SparkConf()
        .setMaster("local")
        .setAppName("MinimumTemperatures")
    )
    sc = SparkContext(conf=conf)

# Show the context's summary as the cell output.
sc

Spinning up Spark cluster ...


Get the working directory, load the settings from the configuration file, and use them to build the location of the data file.

In [10]:
# Get current working directory.
current_working_directory = os.getcwd()

# Load configuration file.
# The path is assembled with os.path.join so the separator matches the host
# OS. The earlier '\configuration.json' literal depended on the invalid
# escape sequence "\c" and on a Windows-only path separator.
configuration_path = os.path.join(current_working_directory, 'configuration.json')
with open(configuration_path, 'r') as configuration_file:
    dict_configurations = json.load(configuration_file)

# Get path part for 1800.csv file from configuration file.
_1800_csv_path_part = dict_configurations['1800.csv_path_part']

# Get current working directory's parent.
current_working_directory_parent = os.path.dirname(current_working_directory)

# Get full path for 1800.csv file (path part is resolved against the parent
# of the working directory, then normalised to an absolute path).
_1800_csv_path = os.path.abspath(os.path.join(current_working_directory_parent, _1800_csv_path_part))

In [11]:
# Parser applied to every line of the data file.
def parseLine(line):
    """Turn one comma-separated record into (stationID, entryType, temperature).

    Uses the 1st, 3rd and 4th comma-separated fields of ``line``. The 4th
    field is parsed as a float, scaled by 0.1 and passed through the
    Celsius-to-Fahrenheit formula (value * 9/5 + 32); presumably the raw
    value is in tenths of a degree Celsius -- confirm against the data source.

    Raises IndexError if the line has fewer than four fields and ValueError
    if the 4th field is not numeric.
    """
    columns = line.split(',')
    station, kind, rawValue = columns[0], columns[2], columns[3]
    fahrenheit = float(rawValue) * 0.1 * (9.0 / 5.0) + 32.0
    return (station, kind, fahrenheit)

# Quick sanity check on a sample record.
parseLine('ITE00100554,18000101,TMAX,-75,,,E,')

('ITE00100554', 'TMAX', 18.5)

In [12]:
# Read the data file at _1800_csv_path into an RDD with one element per text line.
lines = sc.textFile(_1800_csv_path)
# Preview the first 10 raw lines.
lines.take(10)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,',
 'EZE00100082,18000101,TMAX,-86,,,E,',
 'EZE00100082,18000101,TMIN,-135,,,E,',
 'ITE00100554,18000102,TMAX,-60,,I,E,',
 'ITE00100554,18000102,TMIN,-125,,,E,',
 'GM000010962,18000102,PRCP,0,,,E,',
 'EZE00100082,18000102,TMAX,-44,,,E,',
 'EZE00100082,18000102,TMIN,-130,,,E,']

In [13]:
# Apply parseLine to every raw line, giving (stationID, entryType, temperature) tuples.
parsedLines = lines.map(parseLine)
# Preview the first 10 parsed records.
parsedLines.take(10)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 16.52),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMAX', 21.2),
 ('ITE00100554', 'TMIN', 9.5),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 24.08),
 ('EZE00100082', 'TMIN', 8.599999999999998)]

In [14]:
# Keep only the records whose second field (entryType) is exactly "TMIN".
# An equality test is used rather than a substring test so that an entry
# type merely containing "TMIN" could never slip through.
minTemps = parsedLines.filter(lambda x: x[1] == "TMIN")
minTemps.take(10)

[('ITE00100554', 'TMIN', 5.359999999999999),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMIN', 9.5),
 ('EZE00100082', 'TMIN', 8.599999999999998),
 ('ITE00100554', 'TMIN', 23.72),
 ('EZE00100082', 'TMIN', 18.86),
 ('ITE00100554', 'TMIN', 29.66),
 ('EZE00100082', 'TMIN', 18.68),
 ('ITE00100554', 'TMIN', 30.919999999999998),
 ('EZE00100082', 'TMIN', 21.56)]

In [15]:
# Drop the entryType column: keep (stationID, temperature) pairs so the
# station ID can serve as the key in the next step.
stationTemps = minTemps.map(lambda record: (record[0], record[2]))
stationTemps.take(10)

[('ITE00100554', 5.359999999999999),
 ('EZE00100082', 7.699999999999999),
 ('ITE00100554', 9.5),
 ('EZE00100082', 8.599999999999998),
 ('ITE00100554', 23.72),
 ('EZE00100082', 18.86),
 ('ITE00100554', 29.66),
 ('EZE00100082', 18.68),
 ('ITE00100554', 30.919999999999998),
 ('EZE00100082', 21.56)]

In [17]:
# Compute minimum temperature for each stationID by reducing the values of
# every key with the builtin min.
# Note: this rebinds minTemps, which previously held the filtered TMIN records.
minTemps = stationTemps.reduceByKey(min)
minTemps.take(10)

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999)]

In [25]:
# Double check to confirm that there are only two stationIDs relevant to the current analysis.
# The entry type is compared for equality (not substring membership) so only
# true "TMIN" records contribute station IDs.
parsedLines.filter(lambda x: x[1] == 'TMIN').map(lambda x: x[0]).distinct().collect()

['ITE00100554', 'EZE00100082']

In [33]:
# Bring the (stationID, minimum temperature) pairs back to the driver as a list.
results = minTemps.collect()

# Print one tab-separated line per station, temperature shown with two decimals.
for result in results:
    stationID, minimumTemperature = result
    print(stationID + "\t{:.2f}F".format(minimumTemperature))

ITE00100554	5.36F
EZE00100082	7.70F
