In [14]:
import pandas as pd
df=pd.read_csv("yelp.csv")
df = df.replace(r'\n',' ', regex=True)
df.to_csv("modified_yelp.csv")

1. Average number of words in each review (define “words” however you like but be explicit about it)

In [15]:
%%file wordcount.py
from mrjob.job import MRJob
import csv

class AvgWordCount(MRJob):

    def mapper(self, _, line):
        #parse the input line as a CSV row
        row = next(csv.reader([line]))
        text = row[5]
        yield "word_count", len(text.split())

    def reducer(self, key, values):
        total_words = 0
        total_reviews = 0
        #iterating through the word counts and counting the total number of words
        #also count the total number of reviews
        for word_count in values:
            total_words += word_count
            total_reviews += 1
        #average number of words per review
        average_words_per_review = total_words / total_reviews
        yield "average", average_words_per_review

if __name__ == '__main__':
    AvgWordCount.run()

Overwriting wordcount.py


In [16]:
import wordcount

mr_job = wordcount.AvgWordCount(args=['modified_yelp.csv'])

with mr_job.make_runner() as runner:
    runner.run()
    
    total_word_count = 0
    total_review_count = 0
    
    for key, value in mr_job.parse_output(runner.cat_output()):
        total_word_count += value  
        total_review_count += 1  

    average_word_count = total_word_count / total_review_count
    print("average:", average_word_count)

No configs specified for inline runner


average: 131.02659734026597


2. Count of Reviews by year-month (eg "2021-09")

In [17]:
%%file countpermonth.py
from mrjob.job import MRJob
import csv

class CountPerMonth(MRJob):

    def mapper(self, _, line):
        #parse the input line as a CSV row
        row = next(csv.reader([line]))
        #extract the date field
        date = row[2]  
        #extract the year and month
        year_month = date[:7]  
        #emit key-value pair with year-month as key and count 1 as value
        yield year_month, 1

    def reducer(self, year_month, counts):
        total_count = sum(counts)
        yield year_month, total_count

if __name__ == '__main__':
    CountPerMonth.run()

Overwriting countpermonth.py


In [18]:
import countpermonth
mr_job = countpermonth.CountPerMonth(args=['modified_yelp.csv'])
with mr_job.make_runner() as runner:
    runner.run()
    
    counts_per_month = {}
    for year_month, count in mr_job.parse_output(runner.cat_output()):
        counts_per_month[year_month] = counts_per_month.get(year_month, 0) + count
    
    for year_month, count in counts_per_month.items():
        print(year_month, count)

No configs specified for inline runner


2012-02 219
2012-03 259
2012-04 265
2012-05 275
2012-06 272
2012-07 281
2010-09 150
2010-10 144
2010-11 147
2010-12 160
2011-01 239
2009-04 101
2009-05 101
2009-06 67
2009-07 95
2009-08 98
2009-09 113
2009-10 101
2009-11 78
2009-12 104
2010-01 154
2010-02 148
2010-03 168
2010-04 148
2010-05 154
2010-06 118
2010-07 160
2010-08 201
2012-11 208
2012-12 196
2013-01 52
date 1
2012-08 249
2012-09 239
2012-10 258
2011-08 266
2011-09 193
2011-10 204
2008-07 80
2008-08 75
2008-09 59
2008-10 79
2008-11 66
2008-12 71
2009-01 108
2009-02 79
2009-03 126
2011-02 216
2011-03 263
2011-04 263
2011-05 229
2011-06 230
2011-07 236
2005-04 1
2005-07 2
2005-12 1
2006-01 6
2006-02 9
2006-04 2
2006-05 1
2006-06 5
2006-07 2
2006-08 9
2006-09 4
2006-10 5
2006-11 6
2006-12 6
2007-01 14
2007-02 20
2007-03 42
2007-04 8
2007-05 23
2007-06 12
2007-07 35
2007-08 29
2007-09 26
2007-10 23
2007-11 28
2007-12 25
2008-01 46
2008-02 48
2008-03 47
2008-04 53
2008-05 65
2008-06 76
2011-11 203
2011-12 249
2012-01 304


3. Average rating of any review marked "cool" (eg where cool!=0)

In [19]:
%%file cool.py
from mrjob.job import MRJob
import csv
from statistics import mean
import re

class cool(MRJob):

    def mapper(self, _, line):
        row = next(csv.reader([line]))
        id, business_id, date, review_id, stars, text, type, user_id, cool, useful, funny = row
        if re.match(r'^[0-9]+$', stars) and int(cool) != 0:
            yield None, int(stars)

    def reducer(self, _, star_ratings):
        # Computing the average star rating of cool reviews
        ratings = list(star_ratings)
        if ratings:
            yield "average", mean(ratings)

if __name__ == '__main__':
    cool.run()

Overwriting cool.py


In [20]:
import cool
mr_job = cool.cool(args=['modified_yelp.csv'])
try:
    with mr_job.make_runner() as runner:
        runner.run()
        for key, value in mr_job.parse_output(runner.cat_output()):
            print("average", value)
except Exception as e:
    print("An error occurred:", e)

No configs specified for inline runner


average 3.8649595687331537
