In [None]:
from operator import add
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Cluster sizing: (8 cores, 16gb per machine) x 5 = 40 cores

# New API: get or create the session against the standalone master, with
# dynamic allocation enabled, a 30s executor idle timeout and 4 cores per executor.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("linneaeriksson_A2.2")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the context that backs the session above.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("DEBUG")

In [None]:
# B1: read the parking-citations CSV from HDFS (first row holds the column
# names), cache the DataFrame for the cells below, and preview it.
citations = (
    spark_session.read
    .option("header", "true")
    .csv("hdfs://192.168.1.153:9000/parking-citations.csv")
    .cache()
)
citations.show()

In [None]:
# B2: print the schema (column names and types) of the loaded citations DataFrame.
citations.printSchema()

In [None]:
# B3: report how many rows the citations DataFrame contains.
print("Total number of rows in CSV: ", citations.count(), sep="")

In [None]:
# B4: report how many partitions the DataFrame's underlying RDD has.
print("Total number of partitions in RDD: ", citations.rdd.getNumPartitions(), sep="")

In [None]:
# B5: remove the VIN and coordinate columns, then print the remaining schema.
citations_dropped = citations.drop(
    "VIN",
    "Latitude",
    "Longitude",
)
citations_dropped.printSchema()

In [None]:
# B6: add a float version of 'Fine amount' and list how many citations were
# issued for each fine amount, highest amount first.
# na.fill(0) replaces nulls in numeric columns with 0, so rows whose fine
# amount is null after the cast are counted under 0.
citations_fine_amount_conv2float = (
    citations_dropped
    .withColumn("Fine_amount_float", citations_dropped["Fine amount"].cast("float"))
    .na.fill(0)
)
# createOrReplaceTempView registers the view and returns None, so its result
# is deliberately not bound to a name.
citations_fine_amount_conv2float.createOrReplaceTempView("citations_fine")

spark_session.sql(
    "SELECT Fine_amount_float, count(Fine_amount_float) as Number_of_Fines "
    "FROM citations_fine "
    "GROUP BY Fine_amount_float "
    "ORDER BY Fine_amount_float DESC"
).show()

In [None]:
# B7: for every vehicle make, show the number of citations and its share of
# all rows in the citations_fine view, most frequent make first.
spark_session.sql(
    "SELECT Make, count(Make) AS Nr_of_Vehicles, count(Make)/(SELECT count(*) FROM citations_fine) as FREQ "
    "FROM citations_fine "
    "GROUP BY Make "
    "ORDER BY FREQ DESC"
).show()

In [None]:
# B8
# Two-letter colour abbreviations and their full names.
# Built once at definition time rather than on every call, because the
# function is invoked once per row when used as a UDF.
_COLOR_NAMES = {
    'AL': 'Aluminum', 'AM': 'Amber', 'BG': 'Beige', 'BK': 'Black',
    'BL': 'Blue', 'BN': 'Brown', 'BR': 'Brown', 'BZ': 'Bronze',
    'CH': 'Charcoal', 'DK': 'Dark', 'GD': 'Gold', 'GO': 'Gold',
    'GN': 'Green', 'GY': 'Gray', 'GT': 'Granite', 'IV': 'Ivory',
    'LT': 'Light', 'OL': 'Olive', 'OR': 'Orange', 'MR': 'Maroon',
    'PK': 'Pink', 'RD': 'Red', 'RE': 'Red', 'SI': 'Silver', 'SL': 'Silver',
    'SM': 'Smoke', 'TN': 'Tan', 'VT': 'Violet', 'WT': 'White',
    'WH': 'White', 'YL': 'Yellow', 'YE': 'Yellow', 'UN': 'Unknown',
}


def expand_color_abbr(col):
    """Return the full colour name for a two-letter colour abbreviation.

    Args:
        col: Colour code such as 'BK'. Matching is exact and case-sensitive.

    Returns:
        The expanded name (e.g. 'Black') when the code is in the table;
        otherwise ``col`` itself, unchanged (this includes None).
    """
    return _COLOR_NAMES.get(col, col)

# Wrap the Python function as a Spark UDF that returns a string.
udf_expand_color_abbr = udf(expand_color_abbr, StringType())
# Add the expanded colour name as a new column on the DataFrame built in B6
# (citations_fine_amount_conv2float is the name that cell defines).
color_long = citations_fine_amount_conv2float.withColumn("Color_Long", udf_expand_color_abbr("Color"))


In [None]:
# B9: rank the colours of cited Toyotas (Make = 'TOYT') by number of citations.
# createOrReplaceTempView returns None, so its result is not bound to a name.
color_long.createOrReplaceTempView("citations_color")
# Keep the query result as a DataFrame and display it in a separate step:
# show() only prints and returns None.
toyota_color = spark_session.sql(
    "SELECT Color_Long, count(Color_Long) AS Nr_of_Toyotas "
    "FROM citations_color "
    "WHERE Make='TOYT' "
    "GROUP BY Color_Long "
    "ORDER BY count(Color_Long) DESC"
)
toyota_color.show()

In [None]:
# Stop the SparkContext now that all cells have run.
spark_context.stop()