In [None]:
from smart_open import open
import json
import pyspark
import nltk # pip install nltk
from nltk.corpus import stopwords


def analyze_electronics_reviews():
    """Summarise the Electronics review dataset by star rating.

    Reads the JSON-lines review file from the group's GCS bucket with Spark
    and computes, for each rating 1-5:

    * the mean character length of the review ``text`` (rounded to 2 places);
    * the ten most frequent words that are not NLTK English stop words, each
      paired with its share of all retained words for that rating (rounded
      to 5 places).

    Both results are printed and then returned. The SparkContext created here
    is always stopped, even when the analysis fails.

    Returns:
        tuple: ``(rating_lengths, top_words)``. ``rating_lengths`` maps each
        rating to its mean review length, or ``None`` when the rating has no
        reviews. ``top_words`` maps each rating to a list of up to ten
        ``(word, ratio)`` pairs, most frequent first.
    """
    ratings = range(1, 6)

    # For uploading data
    bucket = "msds-694-cohort-14-group9/data"
    filename = "Electronics.jsonl"
    path = f"gs://{bucket}/{filename}"

    def get_average_length_by_rating(reviews_rdd):
        """Return ``{rating: mean text length}`` from ``(rating, text)`` pairs.

        All ratings are aggregated in a single Spark job instead of running
        one filter/reduce/count round per rating.
        """
        # rating -> (total characters, number of reviews)
        totals = (
            reviews_rdd
            .map(lambda rec: (rec[0], (len(rec[1]), 1)))
            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
            .collectAsMap()
        )

        average_review_length_by_rating = {}
        for rating in ratings:
            if rating in totals:
                total_length, review_count = totals[rating]
                average_review_length_by_rating[rating] = round(
                    total_length / review_count, 2
                )
            else:
                # No reviews with this rating: a mean is undefined.
                average_review_length_by_rating[rating] = None

        return average_review_length_by_rating

    def get_top_words_by_rating(reviews_rdd):
        """Return ``{rating: [(word, ratio), ...]}`` for the ten top words.

        ``ratio`` is the word's count divided by the number of words kept for
        that rating after stop words and empty/``'>'`` tokens are dropped.
        """
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        # One set lookup per token instead of a set lookup plus a list scan.
        ignored_tokens = stop_words | {'', '>'}

        def parse_review(text):
            """Lower-case ``text``, strip the problem substrings, split on spaces."""
            # Replacements are applied in this order; changing it changes tokens.
            problem_chars = ['.', '><br', '/', '-']
            filtered_text = text.lower()
            for c in problem_chars:
                filtered_text = filtered_text.replace(c, '')
            return filtered_text.split(' ')

        top_words_by_rating = {}
        for rating in ratings:
            word_counts = (
                reviews_rdd
                # Bind the loop value now so the closure cannot pick up a
                # later value of ``rating``.
                .filter(lambda rec, rating=rating: rec[0] == rating)
                .flatMap(lambda rec: parse_review(rec[1]))
                .filter(lambda word: word not in ignored_tokens)
                .map(lambda word: (word, 1))
                .reduceByKey(lambda x, y: x + y)
                # Two actions follow; persist so the counts are built once.
                .persist(pyspark.StorageLevel.MEMORY_AND_DISK)
            )
            try:
                # Total retained words equals the sum of the per-word counts.
                n = word_counts.values().sum()
                # Top 10 without sorting the whole RDD; ties break
                # alphabetically so the result is deterministic.
                top_counts = word_counts.takeOrdered(
                    10, key=lambda kv: (-kv[1], kv[0])
                )
            finally:
                word_counts.unpersist()

            # ``top_counts`` is empty when ``n`` is 0, so no division happens.
            top_words_by_rating[rating] = [
                (word, round(count / n, 5)) for word, count in top_counts
            ]

        return top_words_by_rating

    sc = pyspark.SparkContext()
    try:
        # Keep only the two fields the analysis uses and persist them, so the
        # file is read and JSON-decoded once rather than once per action.
        reviews_rdd = (
            sc.textFile(path)
            .filter(lambda line: line.strip())  # skip blank lines
            .map(json.loads)
            .map(lambda review: (int(review['rating']), review['text']))
            .persist(pyspark.StorageLevel.MEMORY_AND_DISK)
        )

        rating_lengths = get_average_length_by_rating(reviews_rdd)
        top_words = get_top_words_by_rating(reviews_rdd)
        print(rating_lengths)
        print(top_words)
    finally:
        # Always release the context; a leaked one blocks the next
        # ``SparkContext()`` call in the same notebook kernel.
        sc.stop()

    return rating_lengths, top_words

In [2]:
# Run the analysis; the notebook echoes the returned (rating_lengths, top_words) tuple.
analyze_electronics_reviews()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/02 15:16:45 WARN Utils: Your hostname, MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 10.167.8.119 instead (on interface en0)
25/12/02 15:16:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 15:16:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/02 15:16:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/12/02 15:16:46 WARN TaskSetManager: Stage 0 contains a task of very large size (1206 KiB). The maximum recommended task size is 1000 KiB.
25/12/02 15:16:47 WARN TaskSetManager: Stage 1 contains a task of very large size (1

{1: 277.34, 2: 497.0, 3: 524.11, 4: 640.64, 5: 337.32}
{1: [('one', 0.00999), ('work', 0.00916), ('would', 0.00796), ('get', 0.00698), ('even', 0.00573), ('use', 0.00531), ('product', 0.00442), ('like', 0.00401), ('buy', 0.00385), ('time', 0.00385)], 2: [('one', 0.00875), ('would', 0.00732), ('get', 0.00691), ('work', 0.00655), ('like', 0.00577), ('use', 0.00524), ('even', 0.00441), ('time', 0.00435), ('keyboard', 0.00381), ('good', 0.00381)], 3: [('one', 0.00659), ('would', 0.00623), ('use', 0.00607), ('good', 0.00584), ('like', 0.00562), ('get', 0.00545), ('work', 0.00481), ('case', 0.00461), ('sound', 0.00432), ('much', 0.00364)], 4: [('one', 0.00788), ('use', 0.00746), ('good', 0.00726), ('like', 0.00689), ('would', 0.00588), ('well', 0.00526), ('get', 0.00473), ('little', 0.00465), ('sound', 0.00448), ('great', 0.00414)], 5: [('great', 0.00999), ('one', 0.00818), ('use', 0.00763), ('good', 0.00665), ('like', 0.00598), ('works', 0.00574), ('easy', 0.00545), ('love', 0.00525), ('wel

({1: 277.34, 2: 497.0, 3: 524.11, 4: 640.64, 5: 337.32},
 {1: [('one', 0.00999),
   ('work', 0.00916),
   ('would', 0.00796),
   ('get', 0.00698),
   ('even', 0.00573),
   ('use', 0.00531),
   ('product', 0.00442),
   ('like', 0.00401),
   ('buy', 0.00385),
   ('time', 0.00385)],
  2: [('one', 0.00875),
   ('would', 0.00732),
   ('get', 0.00691),
   ('work', 0.00655),
   ('like', 0.00577),
   ('use', 0.00524),
   ('even', 0.00441),
   ('time', 0.00435),
   ('keyboard', 0.00381),
   ('good', 0.00381)],
  3: [('one', 0.00659),
   ('would', 0.00623),
   ('use', 0.00607),
   ('good', 0.00584),
   ('like', 0.00562),
   ('get', 0.00545),
   ('work', 0.00481),
   ('case', 0.00461),
   ('sound', 0.00432),
   ('much', 0.00364)],
  4: [('one', 0.00788),
   ('use', 0.00746),
   ('good', 0.00726),
   ('like', 0.00689),
   ('would', 0.00588),
   ('well', 0.00526),
   ('get', 0.00473),
   ('little', 0.00465),
   ('sound', 0.00448),
   ('great', 0.00414)],
  5: [('great', 0.00999),
   ('one', 0.00818