### Part A

In [1]:
#You can use RDD or DataFrame
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

try:
    sc.stop()
except:
    pass

conf = SparkConf().setMaster("local").setAppName("PopularHeroNetwork")
sc = SparkContext(conf = conf)

#countCoOccurenes will split up each line based on the space and return key/value pairs.
def countCoOccurences(line):
    elements = line.split()
    return (int(elements[0]), len(elements) - 1)

# parseNames will split based on the back-slash delimiter ('\'), extract hero ID and store it as an integer as a key. Then, it will encode thiings into UTF-8 format as a string. From this process, namesRDD will be a key/value RDD, where the key is hero ID , and the value is the name of hero.
def parseNames(line):
    fields = line.split('\"')
    return (int(fields[0]), fields[1].encode("utf8"))

# Load up your file into an RDD called names.
names = sc.textFile("hero-name.txt")
namesRdd = names.map(parseNames)

# We are loading up the social network data into a lines RDD.
lines = sc.textFile("hero-network.txt")

# countCoOccurrences wil turn the input data into useful data.
pairings = lines.map(countCoOccurences)

# Add up all co-occurrences of the same hero across multiple lines
pairings_agg = pairings.reduceByKey(lambda a, b: a + b)

# Flip co-occurences and hero-id so that max() function can be used to find the hero with most co-occurences 
pairings_flipped = pairings_agg.map(lambda x: (x[1], x[0]))

mostPopular = pairings_flipped.max()

# Define function to extract hero name from parsed txt file
def get_hero_name(hero_id):
    for (value, name) in namesRdd.collect():
        if value == hero_id:
            return name.decode('utf8')

mostPopularName = get_hero_name(mostPopular[1])

print(mostPopularName + " is the most popular superhero, with " + \
    str(mostPopular[0]) + " co-appearances.")

sc.stop()

CAPTAIN AMERICA is the most popular superhero, with 1933 co-appearances.


### Part B

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

import collections

try:
    spark.stop()
except:
    pass

spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("CustomerOrders").getOrCreate()

# Define function to parse data
def mapper(line):
    fields = line.split(',')
    return Row(customerID = int(fields[0]), 
               itemID = int(fields[1]), 
               amountSpent = float(fields[2]))

lines = spark.sparkContext.textFile("customer-orders.csv")

orders = lines.map(mapper)
schemaOrders = spark.createDataFrame(orders).cache()
schemaOrders.createOrReplaceTempView("orders")

# Select customerID and amountSpent columns using SQL query
customerSpending = spark.sql("SELECT customerID, amountSpent FROM orders")

# Aggregate using groupBy(customerID) to find total amount spent by each customer
customerSpending.groupBy("customerID").sum("amountSpent").orderBy("customerID").show()

spark.stop()

+----------+------------------+
|customerID|  sum(amountSpent)|
+----------+------------------+
|         0| 5524.949999999998|
|         1| 4958.600000000001|
|         2|           5994.59|
|         3|           4659.63|
|         4| 4815.050000000002|
|         5| 4561.069999999999|
|         6| 5397.879999999998|
|         7| 4755.070000000001|
|         8| 5517.240000000001|
|         9| 5322.649999999999|
|        10| 4819.700000000001|
|        11| 5152.290000000002|
|        12| 4664.589999999998|
|        13|           4367.62|
|        14| 4735.030000000001|
|        15| 5413.510000000001|
|        16|           4979.06|
|        17| 5032.679999999999|
|        18|           4921.27|
|        19|5059.4299999999985|
+----------+------------------+
only showing top 20 rows



### Part C

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

import collections

try:
    spark.stop()
except:
    pass

spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("MaxTemp").getOrCreate()

# Define function to parse csv data
def mapper(line):
    fields = line.split(',')
    return Row(stationID = str(fields[0]), 
               date = int(fields[1]), 
               obsType = str(fields[2]),
               temperature = float(fields[3])/10)

lines = spark.sparkContext.textFile("1800.csv")

temperatureData = lines.map(mapper)
schemaTemp = spark.createDataFrame(temperatureData).cache()
schemaTemp.createOrReplaceTempView("temperatureData")

schemaTemp.printSchema()
schemaTemp.show()

# Filter only 'TMAX' observations and select all columns
maxTemp = spark.sql("SELECT * FROM temperatureData WHERE obsType LIKE 'TMAX'")

# Find the max temperature by station using groupBy(stationID)
maxTemp.groupBy("stationID").max("temperature").orderBy("stationID").show()

spark.stop()

root
 |-- stationID: string (nullable = true)
 |-- date: long (nullable = true)
 |-- obsType: string (nullable = true)
 |-- temperature: double (nullable = true)

+-----------+--------+-------+-----------+
|  stationID|    date|obsType|temperature|
+-----------+--------+-------+-----------+
|ITE00100554|18000101|   TMAX|       -7.5|
|ITE00100554|18000101|   TMIN|      -14.8|
|GM000010962|18000101|   PRCP|        0.0|
|EZE00100082|18000101|   TMAX|       -8.6|
|EZE00100082|18000101|   TMIN|      -13.5|
|ITE00100554|18000102|   TMAX|       -6.0|
|ITE00100554|18000102|   TMIN|      -12.5|
|GM000010962|18000102|   PRCP|        0.0|
|EZE00100082|18000102|   TMAX|       -4.4|
|EZE00100082|18000102|   TMIN|      -13.0|
|ITE00100554|18000103|   TMAX|       -2.3|
|ITE00100554|18000103|   TMIN|       -4.6|
|GM000010962|18000103|   PRCP|        0.4|
|EZE00100082|18000103|   TMAX|       -1.0|
|EZE00100082|18000103|   TMIN|       -7.3|
|ITE00100554|18000104|   TMAX|        0.0|
|ITE00100554|1800010

### Data Validation for Part C

In [4]:
import numpy as np
import pandas as pd

df = pd.read_csv("1800.csv", header=None, names=['stationID', 'date', 'obsType', 'temperature'] + list(range(4)))

df['temperature'] = df['temperature']/10
df = df.iloc[:, :4]

df

Unnamed: 0,stationID,date,obsType,temperature
0,ITE00100554,18000101,TMAX,-7.5
1,ITE00100554,18000101,TMIN,-14.8
2,GM000010962,18000101,PRCP,0.0
3,EZE00100082,18000101,TMAX,-8.6
4,EZE00100082,18000101,TMIN,-13.5
...,...,...,...,...
1820,ITE00100554,18001231,TMAX,5.0
1821,ITE00100554,18001231,TMIN,2.5
1822,GM000010962,18001231,PRCP,1.6
1823,EZE00100082,18001231,TMAX,1.4


In [5]:
# Data Validation (Are all TMAX greater than TMIN values?)
all_stations = list(df['stationID'].value_counts().index)
all_dates = list(df['date'].value_counts().index)
validation = 0

for station in all_stations[:-1]:
    
    for date in all_dates:
        max_val = df[(df['stationID']==station)][df['date']==date][df['obsType']=='TMAX']['temperature'].values[0]
        min_val = df[(df['stationID']==station)][df['date']==date][df['obsType']=='TMIN']['temperature'].values[0]
        
        if max_val >= min_val:
            validation += 1

print(validation)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


730


In [6]:
pd.concat([df[(df['stationID']==all_stations[0])], df[(df['stationID']==all_stations[1])]])

Unnamed: 0,stationID,date,obsType,temperature
3,EZE00100082,18000101,TMAX,-8.6
4,EZE00100082,18000101,TMIN,-13.5
8,EZE00100082,18000102,TMAX,-4.4
9,EZE00100082,18000102,TMIN,-13.0
13,EZE00100082,18000103,TMAX,-1.0
...,...,...,...,...
1811,ITE00100554,18001229,TMIN,1.6
1815,ITE00100554,18001230,TMAX,5.0
1816,ITE00100554,18001230,TMIN,3.1
1820,ITE00100554,18001231,TMAX,5.0
