### Compute minimum and maximum temperatures by location. ###

Start by computing the minimum temperature for each location.

Afterwards, enhance the program to compute both the minimum and maximum temperatures for each location.

In [43]:
# Import packages.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
import os, json

# Launch spark cluster. Restart cluster, if it is already started.
try:
    # On a fresh kernel the name `sc` does not exist, so this lookup raises
    # NameError; otherwise the running context is stopped first.
    sc.stop()
except NameError:
    pass
finally:
    # Runs in both cases: build a new local context for this notebook.
    print('Spinning up Spark cluster ...')
    conf = (
        SparkConf()
        .setMaster("local")
        .setAppName("MinimumTemperatures")
    )
    sc = SparkContext(conf=conf)

# Display cluster information.
sc

Spinning up Spark cluster ...


Get the current working directory, load the settings from the configuration file, and use them to build the path to the data file.

In [44]:
# Get current working directory.
current_working_directory = os.getcwd()

# Load configuration file.
# The path is built with os.path.join rather than a hard-coded backslash:
# '\c' is not a valid escape sequence (newer Python versions warn about it),
# and a literal backslash is only a path separator on Windows.
configuration_path = os.path.join(current_working_directory, 'configuration.json')
with open(configuration_path, 'r') as configuration_file:
    dict_configurations = json.load(configuration_file)

# Get path part for 1800.csv file from configuration file.
_1800_csv_path_part = dict_configurations['1800.csv_path_part']

# Get current working directory's parent.
current_working_directory_parent = os.path.dirname(current_working_directory)

# Get full path for 1800.csv file.
# The path part is joined onto the parent directory and then normalized
# to an absolute path.
_1800_csv_path = os.path.abspath(os.path.join(current_working_directory_parent, _1800_csv_path_part))

In [45]:
# Create function to parse each line of data file.
def parseLine(line):
    """Split one comma-separated record into (stationID, entryType, temperature).

    Parameters
    ----------
    line : str
        A comma-separated record; columns 0, 2 and 3 are read.

    Returns
    -------
    tuple
        (column 0, column 2, converted value), where the converted value is
        float(column 3) * 0.1 * (9/5) + 32. This matches the
        Celsius-to-Fahrenheit formula applied to tenths of a degree;
        presumably the raw value is in tenths of a degree Celsius - confirm
        against the data source.

    Raises
    ------
    IndexError
        If the record has fewer than four columns.
    ValueError
        If column 3 cannot be converted to float.
    """
    columns = line.split(',')
    station, kind = columns[0], columns[2]
    raw_value = float(columns[3])
    # Keep the original left-to-right arithmetic order so results are bit-identical.
    converted = raw_value * 0.1 * (9.0 / 5.0) + 32.0
    return (station, kind, converted)

parseLine('ITE00100554,18000101,TMAX,-75,,,E,')

('ITE00100554', 'TMAX', 18.5)

In [46]:
# Load the 1800.csv data file into an RDD using the path built from the configuration.
lines = sc.textFile(_1800_csv_path)
# Preview the first 10 elements.
lines.take(10)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,',
 'EZE00100082,18000101,TMAX,-86,,,E,',
 'EZE00100082,18000101,TMIN,-135,,,E,',
 'ITE00100554,18000102,TMAX,-60,,I,E,',
 'ITE00100554,18000102,TMIN,-125,,,E,',
 'GM000010962,18000102,PRCP,0,,,E,',
 'EZE00100082,18000102,TMAX,-44,,,E,',
 'EZE00100082,18000102,TMIN,-130,,,E,']

In [47]:
# Apply parseLine to every element of `lines`, giving (stationID, entryType, temperature) tuples.
parsedLines = lines.map(parseLine)
# Preview the first 10 parsed records.
parsedLines.take(10)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 16.52),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMAX', 21.2),
 ('ITE00100554', 'TMIN', 9.5),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 24.08),
 ('EZE00100082', 'TMIN', 8.599999999999998)]

In [48]:
# Keep only the records whose second field (entryType) is exactly "TMIN".
# An equality test is used instead of `in`, because `in` on a string is a
# substring test and would also accept any entry type that merely contains "TMIN".
minTemps = parsedLines.filter(lambda x: x[1] == "TMIN")
minTemps.take(10)

[('ITE00100554', 'TMIN', 5.359999999999999),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMIN', 9.5),
 ('EZE00100082', 'TMIN', 8.599999999999998),
 ('ITE00100554', 'TMIN', 23.72),
 ('EZE00100082', 'TMIN', 18.86),
 ('ITE00100554', 'TMIN', 29.66),
 ('EZE00100082', 'TMIN', 18.68),
 ('ITE00100554', 'TMIN', 30.919999999999998),
 ('EZE00100082', 'TMIN', 21.56)]

In [49]:
# Reduce each record to a (stationID, temperature) pair, i.e. the 1st and 3rd
# fields of the parsed tuple; the entryType field is dropped.
stationTemps = minTemps.map(lambda record: (record[0], record[2]))
stationTemps.take(10)

[('ITE00100554', 5.359999999999999),
 ('EZE00100082', 7.699999999999999),
 ('ITE00100554', 9.5),
 ('EZE00100082', 8.599999999999998),
 ('ITE00100554', 23.72),
 ('EZE00100082', 18.86),
 ('ITE00100554', 29.66),
 ('EZE00100082', 18.68),
 ('ITE00100554', 30.919999999999998),
 ('EZE00100082', 21.56)]

In [89]:
# Compute minimum temperature for each stationID.
# NOTE: this rebinds `minTemps` from the filtered 3-field records to
# (stationID, minimum temperature) pairs.
minTemps = stationTemps.reduceByKey(min)
minTemps.take(10)

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999)]

In [25]:
# Double check to confirm that there are only two stationIDs relevant to the current analysis.
# The entry type is compared for equality; `in` on a string would be a substring test.
parsedLines.filter(lambda x: x[1] == 'TMIN').map(lambda x: x[0]).distinct().collect()

['ITE00100554', 'EZE00100082']

In [112]:
# Collect stationID & minimum temperature into results list on driver node.
results = minTemps.collect()

# Print out content of results list: stationID, a tab, then the temperature
# with two decimals followed by "F".
for stationID, minTemperature in results:
    print("{}\t{:.2f}F".format(stationID, minTemperature))

ITE00100554	5.36F
EZE00100082	7.70F


Extend analytics to include computation of both minimum and maximum temperatures for each location.

In [107]:
# Filter for minimum and maximum temperatures.
# Exact membership in ('TMIN', 'TMAX') is used instead of substring tests on
# the entry type. The filtered RDD gets its own name so that `minAndMaxTemps`
# is only ever bound to (stationID, temperature) pairs.
minAndMaxEntries = parsedLines.filter(lambda x: x[1] in ('TMIN', 'TMAX'))
print(minAndMaxEntries.take(10))
print('')

# Map to RDD of (stationID, temperature)
minAndMaxTemps = minAndMaxEntries.map(lambda x: (x[0], x[2]))
print(minAndMaxTemps.take(10))

[('ITE00100554', 'TMAX', 18.5), ('ITE00100554', 'TMIN', 5.359999999999999), ('EZE00100082', 'TMAX', 16.52), ('EZE00100082', 'TMIN', 7.699999999999999), ('ITE00100554', 'TMAX', 21.2), ('ITE00100554', 'TMIN', 9.5), ('EZE00100082', 'TMAX', 24.08), ('EZE00100082', 'TMIN', 8.599999999999998), ('ITE00100554', 'TMAX', 27.86), ('ITE00100554', 'TMIN', 23.72)]

[('ITE00100554', 18.5), ('ITE00100554', 5.359999999999999), ('EZE00100082', 16.52), ('EZE00100082', 7.699999999999999), ('ITE00100554', 21.2), ('ITE00100554', 9.5), ('EZE00100082', 24.08), ('EZE00100082', 8.599999999999998), ('ITE00100554', 27.86), ('ITE00100554', 23.72)]


In [109]:
# Group by station. Convert each group into (stationID, list[temperature]) tuple.
# mapValues leaves the key untouched and turns each group's iterable into a list.
groupedStations = minAndMaxTemps.groupByKey().mapValues(list)
groupedStations.take(10)

[('ITE00100554',
  [18.5,
   5.359999999999999,
   21.2,
   9.5,
   27.86,
   23.72,
   32.0,
   29.66,
   33.8,
   30.919999999999998,
   34.34,
   34.34,
   37.58,
   33.8,
   37.22,
   34.52,
   38.3,
   36.14,
   40.28,
   37.58,
   43.88,
   39.38,
   39.38,
   37.22,
   36.14,
   34.34,
   39.38,
   36.14,
   41.72,
   39.38,
   42.08,
   39.38,
   47.120000000000005,
   40.64,
   42.620000000000005,
   40.28,
   42.620000000000005,
   41.0,
   42.8,
   36.5,
   46.4,
   37.4,
   46.58,
   45.32,
   47.3,
   45.5,
   47.3,
   45.5,
   49.64,
   45.5,
   45.5,
   38.120000000000005,
   45.14,
   37.22,
   46.22,
   41.72,
   45.14,
   41.0,
   43.88,
   41.0,
   47.3,
   41.0,
   48.379999999999995,
   43.34,
   43.88,
   41.72,
   42.08,
   41.72,
   50.72,
   42.620000000000005,
   46.22,
   40.28,
   45.5,
   39.38,
   45.14,
   38.3,
   41.0,
   36.5,
   38.3,
   32.0,
   38.84,
   34.88,
   35.78,
   33.62,
   34.34,
   32.54,
   34.34,
   32.0,
   34.34,
   32.0,
   35.78,
 

In [113]:
# Map each group into (stationID, minimum temperature, maximum temperature) tuple.
def _station_extremes(station_and_temps):
    """Return (stationID, min, max) for one (stationID, temperatures) pair."""
    station, temps = station_and_temps
    return (station, min(temps), max(temps))

summarizedTemps = groupedStations.map(_station_extremes)
summarizedTemps.take(10)

[('ITE00100554', 5.359999999999999, 90.14000000000001),
 ('EZE00100082', 7.699999999999999, 90.14000000000001)]

In [114]:
# Collect station ID, minimum temperature and maximum temperature
# into results_2 list on driver node.
results_2 = summarizedTemps.collect()

# Print out content of results_2 list: stationID, then the minimum and the
# maximum, tab-separated, each with two decimals followed by "F".
for stationID, minTemperature, maxTemperature in results_2:
    print("{}\t{:.2f}F\t{:.2f}F".format(stationID, minTemperature, maxTemperature))

ITE00100554	5.36F	90.14F
EZE00100082	7.70F	90.14F
