# Working with RDDs and exploring the data

In [38]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [40]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook.
# "local[4]" runs Spark in-process with 4 worker threads, matching the 4 cores
# requested in the settings below; plain "local" runs with a single worker thread.
# NOTE(review): the executor / cores.max settings are kept unchanged for the case
# where the master points at a cluster; they are not expected to matter in local
# mode - confirm.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("review_and_category_analytics")
    .config("spark.executor.memory", '8g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config("spark.driver.memory", '8g')
    .getOrCreate()
)

# SparkContext is the entry point for the RDD API used in the rest of the notebook.
sc = spark.sparkContext

# Read in Dataset

In [41]:
# Load the complaints CSV as an RDD of raw text lines (no CSV parsing happens here).
COMPLAINTS_CSV = "NYC_Merged_Complaints_Data.csv"
lines = sc.textFile(COMPLAINTS_CSV)

# Display the first 5 rows to see what the beginning of the RDD looks like


In [42]:
# Peek at the first 5 raw lines; the first one is the CSV header row.
lines.take(5)

[',ComplaintID,ProblemID,UnitTypeID,UnitType,SpaceTypeID ,SpaceType,TypeID,Type,MajorCategoryID,MajorCategory,MinorCategoryID,MinorCategory,CodeID,Code,StatusID,Status,StatusDate,StatusDescription,BuildingID,BoroughID,Borough,HouseNumber,StreetName,Zip,Block,Lot,Apartment,CommunityBoard,ReceivedDate',
 '0,2397487,3768602,20,APARTMENT,68,ENTIRE APARTMENT,2,HAZARDOUS,13,NONCONST,106,VERMIN,886,ROACHES,2,CLOSE,08/12/2004,"The Department of Housing Preservation and Development was not able to gain access to inspect the following conditions. The complaint has been closed. If the condition still exists, please file a new complaint.",580051,4,QUEENS,88-47,179 PLACE,11432,9915,11,2FL,12,06/20/2004',
 '1,2397487,3768603,20,APARTMENT,159,OTHER,2,HAZARDOUS,13,NONCONST,106,VERMIN,884,MICE,2,CLOSE,08/12/2004,"The Department of Housing Preservation and Development was not able to gain access to inspect the following conditions. The complaint has been closed. If the condition still exists, please fil

# Get non-header records

In [44]:
# The first line of the file is the CSV header
# (',ComplaintID,ProblemID,...,ReceivedDate'). Capture it so it can be excluded
# from every later calculation.
header = lines.first()

# Keep every line that is not the header. Whole-line equality is used because the
# question is "is this line the header?", not "does this line contain the header
# text somewhere?".
non_header = lines.filter(lambda line: line != header)

# print the first 2 records (note: exclude the header in all calculations)

In [45]:
# Show the first 2 data records to confirm the header row has been dropped.
non_header.take(2)

['0,2397487,3768602,20,APARTMENT,68,ENTIRE APARTMENT,2,HAZARDOUS,13,NONCONST,106,VERMIN,886,ROACHES,2,CLOSE,08/12/2004,"The Department of Housing Preservation and Development was not able to gain access to inspect the following conditions. The complaint has been closed. If the condition still exists, please file a new complaint.",580051,4,QUEENS,88-47,179 PLACE,11432,9915,11,2FL,12,06/20/2004',
 '1,2397487,3768603,20,APARTMENT,159,OTHER,2,HAZARDOUS,13,NONCONST,106,VERMIN,884,MICE,2,CLOSE,08/12/2004,"The Department of Housing Preservation and Development was not able to gain access to inspect the following conditions. The complaint has been closed. If the condition still exists, please file a new complaint.",580051,4,QUEENS,88-47,179 PLACE,11432,9915,11,2FL,12,06/20/2004']

# get a record count

In [46]:
# Total number of data records (header excluded).
non_header.count()

2834416

# Return the count of records containing the phrase 'HEAT/HOT WATER'

In [47]:
# Keep only the records whose raw text contains the phrase anywhere in the line
# (a plain substring test on the unparsed CSV line, not a per-column match).
HEAT_HOT_WATER = 'HEAT/HOT WATER'
data_heathotwater = non_header.filter(lambda record: HEAT_HOT_WATER in record)

# Number of records that contain the phrase.
data_heathotwater.count()

954975

# Print the records containing the phrase 'HEAT/HOT WATER'

In [50]:
# data_heathotwater.collect() would return every matching record to the driver,
# which is far too much to print, so only the first 4 are shown.

data_heathotwater.take(4)

['134,6981408,14581340,92,BUILDING-W,550,BUILDING-WIDE,1,EMERGENCY,59,HEAT/HOT WATER,349,ENTIRE BUILDING,2717,NO HOT WATER,2,CLOSE,08/06/2014,"More than one complaint was received for this building-wide condition.This complaint status is for the initial complaint. The Department of Housing Preservation and Development contacted a tenant in the building and verified that the following conditions were corrected. The complaint has been closed. If the condition still exists, please file a new complaint.",27733,1,MANHATTAN,22,SPRING STREET,10012,479,17,BLDG,2,07/31/2014',
 '149,6982351,14584111,92,BUILDING-W,550,BUILDING-WIDE,1,EMERGENCY,59,HEAT/HOT WATER,349,ENTIRE BUILDING,2717,NO HOT WATER,2,CLOSE,08/04/2014,"More than one complaint was received for this building-wide condition.This complaint status is for the initial complaint. The Department of Housing Preservation and Development contacted an occupant of the apartment and verified that the following conditions were corrected. The comp

# Return the top 10 most frequent categories

In [64]:
# Position of the MajorCategory column once a raw line is split on ','.
MAJOR_CATEGORY_INDEX = 10


def major_category(record):
    """Return the MajorCategory field of one raw CSV line.

    Uses a plain split on ',' and picks the field at MAJOR_CATEGORY_INDEX.
    NOTE(review): this assumes none of the fields before MajorCategory contain a
    quoted comma - confirm against the data.
    """
    return record.split(',')[MAJOR_CATEGORY_INDEX]


# One MajorCategory value per data record.
categories = non_header.map(major_category)

categories.take(10)

['NONCONST',
 'NONCONST',
 'PAINT/PLASTER',
 'NONCONST',
 'PLUMBING',
 'PAINT/PLASTER',
 'HEATING',
 'PAINT/PLASTER',
 'GENERAL',
 'PAINT/PLASTER']

In [65]:
# Word-count pattern, as in the textbook the author cites (Learning Spark
# Lightning Fast, page 52): emit (category, 1) for every record, then sum the
# ones per category.
category_ones = categories.map(lambda category: (category, 1))
count = category_ones.reduceByKey(lambda total, one: total + one)

# Peek at 10 of the (category, n) pairs; they are not sorted at this point.
count.take(10)

[('ELECTRIC', 132104),
 ('ELEVATOR', 6038),
 ('APPLIANCE', 71518),
 ('CONSTRUCTION', 10),
 ('HEATING', 2552),
 ('NONCONST', 293),
 ('SAFETY', 71051),
 ('UNSANITARY CONDITION', 404932),
 ('GENERAL', 127369),
 ('PLUMBING', 264352)]

In [66]:
# To sort by frequency the pairs must be keyed by the number, i.e. (n, category).
# The pairs are rebuilt from `categories` instead of swapping `count` in place:
# `count = count.map(swap)` flips the pairs back on every second run of this cell,
# which would silently sort by category name instead. Built this way, `count` ends
# up as (n, category) pairs no matter how many times the cell is executed.
count = (
    categories
    .map(lambda category: (category, 1))
    .reduceByKey(lambda x, y: x + y)
    .map(lambda pair: (pair[1], pair[0]))
)

# sortByKey() sorts ascending by default; passing False puts the largest counts
# first, so take(10) returns the 10 most frequent categories.
count.sortByKey(False).take(10)

[(954975, 'HEAT/HOT WATER'),
 (404932, 'UNSANITARY CONDITION'),
 (303129, 'PAINT/PLASTER'),
 (264352, 'PLUMBING'),
 (197229, 'DOOR/WINDOW'),
 (161869, 'WATER LEAK'),
 (132104, 'ELECTRIC'),
 (128794, 'FLOORING/STAIRS'),
 (127369, 'GENERAL'),
 (71518, 'APPLIANCE')]

# Top 10 Categories 

[(954975, 'HEAT/HOT WATER'), <br>
 (404932, 'UNSANITARY CONDITION'),<br>
 (303129, 'PAINT/PLASTER'),<br>
 (264352, 'PLUMBING'),<br>
 (197229, 'DOOR/WINDOW'),<br>
 (161869, 'WATER LEAK'),<br>
 (132104, 'ELECTRIC'),<br>
 (128794, 'FLOORING/STAIRS'),<br>
 (127369, 'GENERAL'),<br>
 (71518, 'APPLIANCE')]<br>