In [1]:
# Paths to the Yelp dataset CSV files (relative to the notebook's working directory)
REVIEW_FN = './dataset/yelp_review.csv'
BUSINESS_FN = './dataset/yelp_business.csv'

# Parsing `yelp_review.csv` dataset

Since this dataset contains multi-line fields, parsing it proved to be a real challenge. The problem was mitigated by using the RDD method `mapPartitionsWithIndex()`, which takes as its argument a function of two parameters: the index of the partition and an iterator over the lines in that partition.

One downside of this solution is that some records from the dataset will be lost: when Spark splits the dataset into partitions to be consumed by the worker nodes, a multi-line record that lies on a partition boundary is cut in two, no longer parses into a valid row, and is therefore dropped.

In [2]:
def extract_row(index, lines):
    """
    Yield every row of one partition that has been parsed
    correctly by the CSV reader.

    Meant to be passed to `mapPartitionsWithIndex()`.

    Parameters:
        index -- index of the partition being processed; the first
                 line of partition 0 is treated as the CSV header
                 and is skipped
        lines -- iterator over the raw text lines of the partition

    Only rows with nine elements are valid as per
    the schema of the original CSV file; any other row (for
    example a fragment of a multi-line record) is dropped.
    """
    import csv

    if index == 0:
        # Skip the header. The built-in `next()` with a default works on
        # Python 2 and 3 alike and does not raise when the partition is
        # empty, unlike the Python-2-only `lines.next()` method.
        next(lines, None)

    reader = csv.reader(lines)
    for row in reader:
        if len(row) == 9:
            yield row

In [3]:
# Build the review RDD: read the file as raw lines, coalesce them into
# 29 partitions, then CSV-parse each partition with `extract_row`
reviews_rdd = (
    sc.textFile(REVIEW_FN, use_unicode=False)
    .coalesce(29)
    .mapPartitionsWithIndex(extract_row)
)

# Parse `yelp_business.csv` dataset

In [4]:
# Load the Yelp business dataset as an RDD of raw text lines. No CSV parsing
# happens here; that is done by the per-partition extractor functions that
# are applied to `business_rdd` in the cells below.
# `add` is the reducer passed to the `reduceByKey()` calls further down.
from operator import add
business_rdd = sc.textFile(BUSINESS_FN, use_unicode=False) 

## Top 40 Business Categories

In [5]:
def extract_business_categories(pid, lines):
    """
    Yield the 13th column of the yelp_business.csv dataset
    which contains the categories in which a business can
    be categorized

    Meant to be passed to `mapPartitionsWithIndex()`.

    Parameters:
        pid   -- index of the partition being processed; the first
                 line of partition 0 is treated as the CSV header
                 and is skipped
        lines -- iterator over the raw text lines of the partition

    Rows with fewer than 13 columns (e.g. blank lines) are skipped.
    """
    import csv

    if pid == 0:
        # Skip the header. The built-in `next()` with a default works on
        # Python 2 and 3 alike and does not raise when the partition is
        # empty, unlike the Python-2-only `lines.next()` method.
        next(lines, None)

    reader = csv.reader(lines)
    for row in reader:
        # Guard against short rows: indexing them would raise IndexError
        # and abort the whole Spark task.
        if len(row) > 12:
            yield row[12]

In [6]:
# Count how many businesses list each category, most frequent first
top_business_rdd = (
    business_rdd
    .mapPartitionsWithIndex(extract_business_categories)
    .flatMap(lambda categories: categories.split(';'))
    .map(lambda category: (category, 1))
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)
top_business_rdd.take(40)

[('Restaurants', 54618),
 ('Shopping', 27971),
 ('Food', 24777),
 ('Beauty & Spas', 17014),
 ('Home Services', 16205),
 ('Health & Medical', 14230),
 ('Nightlife', 12154),
 ('Local Services', 11232),
 ('Automotive', 11052),
 ('Bars', 10563),
 ('Event Planning & Services', 9078),
 ('Active Life', 8257),
 ('Fashion', 7019),
 ('Sandwiches', 6345),
 ('Fast Food', 6280),
 ('Hair Salons', 6140),
 ('American (Traditional)', 6097),
 ('Pizza', 6067),
 ('Coffee & Tea', 5936),
 ('Hotels & Travel', 5736),
 ('Arts & Entertainment', 5515),
 ('Home & Garden', 5379),
 ('Auto Repair', 5172),
 ('Doctors', 4838),
 ('Professional Services', 4752),
 ('Italian', 4662),
 ('Real Estate', 4611),
 ('Burgers', 4558),
 ('Breakfast & Brunch', 4497),
 ('Nail Salons', 4294),
 ('Mexican', 4105),
 ('Fitness & Instruction', 4078),
 ('Chinese', 3987),
 ('American (New)', 3979),
 ('Specialty Food', 3966),
 ('Pets', 3579),
 ('Bakeries', 3261),
 ('Grocery', 3247),
 ('Hair Removal', 3190),
 ('Dentists', 3079)]

## Business Categories with Ratings of 3 or Higher

In [7]:
# Per business (column 2), count the reviews whose rating (column 3,
# converted to int) is 3 or more; businesses with the most such reviews first
biz_rating_total = (
    reviews_rdd
    .filter(lambda review: int(review[3]) >= 3)
    .map(lambda review: (review[2], 1))
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)

biz_rating_total.take(40)

[('4JNXUYY8wbaaDmk3BPzlWw', 6710),
 ('RESDUcs7fIiihp38-d6_6g', 5723),
 ('K7lWdNUhCbcnEvI0NhGewg', 4909),
 ('cYwJA2A6I12KNkm2rtXd5g', 4655),
 ('DkYS3arLOhA8si5uUEmHOw', 4620),
 ('f4x1YBxkLrZg652xt2KR5g', 4094),
 ('KskYqH1Bi7Z_61pH6Om8pg', 3621),
 ('iCQpiavjjPzJ5_3gPD5Ebg', 3480),
 ('2weQS-RnoOBhb1KsHKyoSQ', 3244),
 ('rcaPajgKOJC2vo_l3xa42A', 3139),
 ('hihud--QRriCYZw1zZvW4g', 3133),
 ('5LNZ67Yw9RD6nf4_UhXOjw', 3066),
 ('ujHiaprwCQ5ewziu0Vi9rw', 2893),
 ('SMPbvZLSMMb7KU76YNYMGg', 2844),
 ('XZbuPXdyA0ZtTu3AzqtQhg', 2644),
 ('7sPNbCx7vGAaH7SbNPZ6oA', 2634),
 ('eoHdUeQDNgQ6WYEnP2aiRw', 2627),
 ('Wxxvi3LZbHNIDwJ-ZimtnA', 2623),
 ('FaHADZARwnY4yvlvpnsfGA', 2620),
 ('g8OnV26ywJlZpezdBnOWUQ', 2528),
 ('3kdSl5mo9dWC4clrQjEDGg', 2484),
 ('P7pxQFqr7yBKMMI2J51udw', 2450),
 ('OETh78qcgDltvHULowwhJg', 2396),
 ('XXW_OFaYQkkGOGniujZFHg', 2380),
 ('MpmFFw0GE_2iRFPdsRpJbA', 2344),
 ('QJatAcxYgK1Zp9BRZMAx7g', 2296),
 ('faPVqws-x-5k2CQKDNtHxw', 2225),
 ('NvKNe9DnQavC9GstglcBJQ', 2223),
 ('na4Th5DrNauOv-c43

In [8]:
def extract_biz_cat(pid, lines):
    """
    Yield a tuple of business_id and the list
    of categories the business falls into

    Meant to be passed to `mapPartitionsWithIndex()`.

    Parameters:
        pid   -- index of the partition being processed; the first
                 line of partition 0 is treated as the CSV header
                 and is skipped
        lines -- iterator over the raw text lines of the partition

    The business_id is taken from the 1st column and the categories
    from the 13th column, split on ';'. Rows with fewer than 13
    columns (e.g. blank lines) are skipped.
    """
    import csv

    if pid == 0:
        # Skip the header. The built-in `next()` with a default works on
        # Python 2 and 3 alike and does not raise when the partition is
        # empty, unlike the Python-2-only `lines.next()` method.
        next(lines, None)

    reader = csv.reader(lines)
    for row in reader:
        # Guard against short rows: indexing them would raise IndexError
        # and abort the whole Spark task.
        if len(row) > 12:
            yield (row[0], row[12].split(';'))

# Pair every business id with its list of categories, then preview ten pairs
biz_cat_rdd = business_rdd.mapPartitionsWithIndex(extract_biz_cat)

biz_cat_rdd.take(10)

[('FYWN1wneV18bWNgQjJ2GNg',
  ['Dentists',
   'General Dentistry',
   'Health & Medical',
   'Oral Surgeons',
   'Cosmetic Dentists',
   'Orthodontists']),
 ('He-G7vWjzVUysIKrfNbPUQ',
  ['Hair Stylists',
   'Hair Salons',
   "Men's Hair Salons",
   'Blow Dry/Out Services',
   'Hair Extensions',
   'Beauty & Spas']),
 ('KQPW8lFf1y5BT2MxiSZ3QA',
  ['Departments of Motor Vehicles', 'Public Services & Government']),
 ('8DShNS-LuFqpEWIp0HxijA', ['Sporting Goods', 'Shopping']),
 ('PfOCPjBrlQAnz__NXj9h_w',
  ['American (New)',
   'Nightlife',
   'Bars',
   'Sandwiches',
   'American (Traditional)',
   'Burgers',
   'Restaurants']),
 ('o9eMRCWt5PkpLDE0gOPtcQ', ['Italian', 'Restaurants']),
 ('kCoE3jvEtg6UVz5SOD3GVw',
  ['Real Estate Services',
   'Real Estate',
   'Home Services',
   'Property Management']),
 ('OD2hnuuTJI9uotcKycxg1A', ['Shopping', 'Sporting Goods']),
 ('EsMcGiZaQuG1OOvL9iUFug',
  ['Coffee & Tea', 'Ice Cream & Frozen Yogurt', 'Food']),
 ('TGWhGNusxyMaA4kQVBNeew', ['Automotive',

In [11]:
# Join categories with the per-business review counts on business id, credit
# each business's count to every one of its categories, and total per category
cat_freq = (
    biz_cat_rdd
    .join(biz_rating_total)
    .flatMap(lambda joined: [(category, joined[1][1]) for category in joined[1][0]])
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)
cat_freq.take(40)

[('Restaurants', 2549014),
 ('Food', 815298),
 ('Nightlife', 691681),
 ('Bars', 636709),
 ('American (New)', 419021),
 ('American (Traditional)', 395256),
 ('Breakfast & Brunch', 377027),
 ('Event Planning & Services', 292425),
 ('Shopping', 291758),
 ('Sandwiches', 270878),
 ('Beauty & Spas', 253839),
 ('Arts & Entertainment', 243321),
 ('Mexican', 232217),
 ('Pizza', 232158),
 ('Italian', 231012),
 ('Burgers', 208839),
 ('Coffee & Tea', 198537),
 ('Seafood', 197664),
 ('Japanese', 180204),
 ('Hotels & Travel', 175893),
 ('Desserts', 158282),
 ('Sushi Bars', 151200),
 ('Chinese', 147546),
 ('Home Services', 146764),
 ('Steakhouses', 144534),
 ('Asian Fusion', 137991),
 ('Active Life', 134516),
 ('Cafes', 134165),
 ('Salad', 134157),
 ('Automotive', 132026),
 ('Health & Medical', 131158),
 ('Hotels', 124240),
 ('Bakeries', 116805),
 ('Specialty Food', 116690),
 ('Local Services', 113395),
 ('Pubs', 103119),
 ('Fast Food', 102919),
 ('Barbeque', 98390),
 ('Wine Bars', 94795),
 ('Cocktai