## Preliminaries

### Installing Dependencies (OpenJDK 8)

In [1]:
# Install system dependencies on the Linux runtime; apt's normal output is discarded
!apt update -qq > /dev/null
# Install a headless OpenJDK 8 JDK for Spark to run on.
# NOTE(review): this comment used to cite PySpark 2.4.4, but the notebook
# downloads Spark 3.3.2 — confirm that pinning Java 8 is still intended.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null





### Installing Spark on the machine

In [2]:
!rm -r spark*
!wget http://mirror.rise.ph/apache/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz # other version: spark-3.2.3
!ls
!tar xvf ./spark-3.3.2-bin-hadoop2.tgz > /dev/null 2>/dev/null
!ls
!pip install -q findspark

rm: cannot remove 'spark*': No such file or directory
--2023-03-29 11:31:06--  http://mirror.rise.ph/apache/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz
Resolving mirror.rise.ph (mirror.rise.ph)... 43.226.6.79
Connecting to mirror.rise.ph (mirror.rise.ph)|43.226.6.79|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 274077580 (261M) [application/x-gzip]
Saving to: ‘spark-3.3.2-bin-hadoop2.tgz’


2023-03-29 11:31:24 (14.7 MB/s) - ‘spark-3.3.2-bin-hadoop2.tgz’ saved [274077580/274077580]

sample_data  spark-3.3.2-bin-hadoop2.tgz
sample_data  spark-3.3.2-bin-hadoop2  spark-3.3.2-bin-hadoop2.tgz


In [3]:
# Set environment variables
import os
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop2/"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version

update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode
openjdk version "1.8.0_362"
OpenJDK Runtime Environment (build 1.8.0_362-8u362-ga-0ubuntu1~20.04.1-b09)
OpenJDK 64-Bit Server VM (build 25.362-b09, mixed mode)


### Setup Folder

In [4]:
# Load the Google Colab Drive helper
from google.colab import drive

# Mount Google Drive at /content/drive; this prompts for authorization.
# The data path and working directory used by the following cells live under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Root folder (on the mounted Drive) that the input file paths below are built from
DATA_PATH = '/content/drive/MyDrive/CSCI_273'
# Optional check of the folder contents, left disabled:
#!ls '$DATA_PATH'

In [6]:
%cd /content/drive/MyDrive/CSCI_273/

/content/drive/MyDrive/CSCI_273


In [7]:
ls -lt

ls: 'GCP Code': No such file or directory
ls: SampleData: No such file or directory
total 267666
-rw------- 1 root root 274077580 Mar 27 02:02  spark-3.3.2-bin-hadoop2.tgz
drwx------ 2 root root      4096 Mar 27 02:02  [0m[01;34mspark-3.3.2-bin-hadoop2[0m/
drwx------ 2 root root      4096 Mar 25 01:19  [01;34mData[0m/
lrw------- 1 root root         0 Jan 21 14:18  [01;36mSampleData[0m -> [01;34m/content/drive/.shortcut-targets-by-id/1yXHtbgesL8zhtdsq8PXmqR-Upl80CrBh/SampleData[0m[K/
lrw------- 1 root root         0 Jan 21 14:13 [01;36m'GCP Code'[0m -> [01;34m'/content/drive/.shortcut-targets-by-id/1qYg9SXcc9minIErchqWXR9Rq4CYj7dqf/GCP Code'[0m[K/
drwx------ 2 root root      4096 Jan 21 01:40  [01;34mSlides[0m/


## Assignment

#### Setup

In [8]:
# findspark.init() runs before the pyspark import below.
# NOTE(review): presumably it locates the install via the SPARK_HOME set earlier — confirm.
import findspark
findspark.init()

# Imports the basic spark functions needed
from pyspark import SparkConf, SparkContext
# `add` is used as the reducer when counting joined rows per app
from operator import add

In [9]:
# Import necessary modules for functions
import csv
import re

In [10]:
# Build the Spark configuration step by step: local master, application name
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("GooglePlayApps")

# Reuse a running SparkContext if there is one, otherwise create it from `conf`
sc = SparkContext.getOrCreate(conf=conf)

Create a Spark application that:
* returns a list of Google Play Store apps with the counts of their positive, neutral, and negative reviews, sorted by each app's rating with the highest-rated apps on top
* writes the results to a file instead of the logs

#### Reviews file

For the `googleplaystore_user_reviews.csv` file

In [11]:
# Read the user-reviews file as an RDD of raw text lines and peek at the first few
reviews_path = DATA_PATH + "/Data/google-play-store-apps/googleplaystore_user_reviews.csv"
reviews = sc.textFile(reviews_path)
reviews.take(5)

['App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity',
 '10 Best Foods for You,"I like eat delicious food. That\'s I\'m cooking food myself, case ""10 Best Foods"" helps lot, also ""Best Before (Shelf Life)""",Positive,1.0,0.5333333333333333',
 '10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.28846153846153844',
 '10 Best Foods for You,nan,nan,nan,nan',
 '10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875']

In [12]:
# The first line of the file is the column header, not a review
header = reviews.first()
# Drop every line equal to the header. The header text is bound as a default
# argument so the filter keeps comparing against this string even after the
# notebook-level name `header` is rebound (it is reused for the output header RDD).
# NOTE: this cell reassigns `reviews`; re-run the load cell first before re-running it.
reviews = reviews.filter(lambda line, header=header: line != header)
reviews.take(5)

['10 Best Foods for You,"I like eat delicious food. That\'s I\'m cooking food myself, case ""10 Best Foods"" helps lot, also ""Best Before (Shelf Life)""",Positive,1.0,0.5333333333333333',
 '10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.28846153846153844',
 '10 Best Foods for You,nan,nan,nan,nan',
 '10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875',
 '10 Best Foods for You,Best idea us,Positive,1.0,0.3']

In [13]:
def get_app_name(row):
    """Extract the app name and the sentiment label from one raw reviews line.

    Args:
        row: A single comma-separated line; fields may be wrapped in double quotes.

    Returns:
        A tuple ``(app_name, sentiment)`` holding the first field and the
        third-from-last field of the parsed line.
    """
    # csv.reader honours double-quoted fields, so commas inside a quoted
    # review text do not split it into extra columns
    fields = next(csv.reader([row], delimiter=',', quotechar='"'))

    # The sentiment is addressed from the right-hand end of the row
    return fields[0], fields[-3]

# Broadcast the function object through the SparkContext.
# NOTE(review): the returned handle is not referenced in the visible cells —
# the map that follows calls get_app_name directly — confirm whether it is needed.
get_app_name_bc = sc.broadcast(get_app_name)

In [14]:
# Turn every raw review line into an (app name, sentiment label) pair
reviews_final = reviews.map(get_app_name)
reviews_final.take(5)

[('10 Best Foods for You', 'Positive'),
 ('10 Best Foods for You', 'Positive'),
 ('10 Best Foods for You', 'nan'),
 ('10 Best Foods for You', 'Positive'),
 ('10 Best Foods for You', 'Positive')]

In [15]:
# Encode each sentiment label as a score: Positive -> 1, Neutral -> 0, Negative -> -1.
# Any other label (for example 'nan') has no entry in the table and becomes None.
# NOTE: this cell reassigns `reviews_final`, so re-run the previous cell before re-running it.
reviews_final = reviews_final.map(
    lambda x, scores={'Positive': 1, 'Neutral': 0, 'Negative': -1}: (x[0], scores.get(x[1]))
)
reviews_final.take(5)

[('10 Best Foods for You', 1),
 ('10 Best Foods for You', 1),
 ('10 Best Foods for You', None),
 ('10 Best Foods for You', 1),
 ('10 Best Foods for You', 1)]

In [16]:
# Expand each score into a one-hot 4-tuple of counters, in the order
# (positive, neutral, negative, missing): exactly one slot is 1 for the codes 1, 0, -1, None
reviews_final = reviews_final.mapValues(
    lambda score: tuple(int(score == code) for code in (1, 0, -1, None))
)
reviews_final.take(5)

[('10 Best Foods for You', (1, 0, 0, 0)),
 ('10 Best Foods for You', (1, 0, 0, 0)),
 ('10 Best Foods for You', (0, 0, 0, 1)),
 ('10 Best Foods for You', (1, 0, 0, 0)),
 ('10 Best Foods for You', (1, 0, 0, 0))]

In [17]:
# Add the counter tuples position by position so each app ends up with one
# (positive, neutral, negative, missing) total
reviews_final = reviews_final.reduceByKey(
    lambda left_counts, right_counts: tuple(a + b for a, b in zip(left_counts, right_counts))
)
reviews_final.take(10)

[('10 Best Foods for You', (162, 22, 10, 6)),
 ('104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室', (31, 8, 1, 0)),
 ('11st', (23, 10, 7, 0)),
 ('1800 Contacts - Lens Store', (64, 10, 6, 0)),
 ('1LINE – One Line with One Touch', (27, 3, 8, 2)),
 ('2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif', (25, 6, 1, 8)),
 ('21-Day Meditation Experience', (68, 2, 10, 0)),
 ('2Date Dating App, Love and matching', (26, 5, 7, 2)),
 ('2GIS: directory & navigator', (23, 11, 6, 0)),
 ('2RedBeans', (31, 6, 2, 1))]

In [18]:
# Number of distinct app names in the reviews data (one record per key after reduceByKey)
reviews_final.count()

1074

#### Apps file

For the `googleplaystore.csv` file

In [19]:
# Read the app catalogue as raw text lines; despite the .csv extension,
# the sample rows shown by take() are tab-separated
apps_path = DATA_PATH + "/Data/google-play-store-apps/googleplaystore.csv"
apps = sc.textFile(apps_path)
apps.take(5)

['Photo Editor & Candy Camera & Grid & ScrapBook\tART_AND_DESIGN\t4.1\t159\t19M\t10,000+\tFree\t0\tEveryone\tArt & Design\tJanuary 7, 2018\t1.0.0\t4.0.3 and up',
 'Coloring book moana\tART_AND_DESIGN\t3.9\t967\t14M\t500,000+\tFree\t0\tEveryone\tArt & Design;Pretend Play\tJanuary 15, 2018\t2.0.0\t4.0.3 and up',
 'U Launcher Lite – FREE Live Cool Themes, Hide Apps\tART_AND_DESIGN\t4.7\t87510\t8.7M\t5,000,000+\tFree\t0\tEveryone\tArt & Design\tAugust 1, 2018\t1.2.4\t4.0.3 and up',
 'Sketch - Draw & Paint\tART_AND_DESIGN\t4.5\t215644\t25M\t50,000,000+\tFree\t0\tTeen\tArt & Design\tJune 8, 2018\tVaries with device\t4.2 and up',
 'Pixel Draw - Number Art Coloring Book\tART_AND_DESIGN\t4.3\t967\t2.8M\t100,000+\tFree\t0\tEveryone\tArt & Design;Creativity\tJune 20, 2018\t1.1\t4.4 and up']

In [20]:
# Define the functions to be broadcasted
def get_cat_index(s):
    """Locate the category marker in a raw line or in a list of split fields.

    Args:
        s: Either a string or a sequence of field strings.

    Returns:
        For a string: the start offset of the first run of upper-case letters
        and underscores that follows an upper-case letter, or -1 if there is
        none (unchanged from the previous behaviour).
        For a sequence of fields: the position of the first field that consists
        only of upper-case letters and underscores (at least two characters,
        starting with a letter), or -1 if no field qualifies.
    """
    if isinstance(s, str):
        match = re.search("(?<=[A-Z])[A-Z_]+", s)
        return match.start() if match else -1

    # Sequence of fields: callers need a *field* position, not a character offset
    for position, field in enumerate(s):
        if re.fullmatch("[A-Z][A-Z_]+", field):
            return position
    return -1


def _parse_rating(text):
    """Return ``text`` as a float, or None when it is blank, a NaN marker, or not numeric."""
    if text in ("", "NaN", "nan"):
        return None
    try:
        return float(text)
    except ValueError:
        return None


def get_rating(x):
    """Extract the rating from one tab-separated catalogue line.

    The rating is normally the third field. When the second field is a plain
    number it is used instead. If the chosen value is not a number or is
    above 5.0, the fields after the name are searched for the category field
    and the value right after it is tried.

    Args:
        x: One raw tab-separated line.

    Returns:
        The rating as a float. 0.0 is returned when the rating is blank,
        "NaN"/"nan", or cannot be recovered from the line.
    """
    fields = x.split("\t")

    # Primary guess: a numeric second field wins, otherwise use the third field
    if len(fields) > 1 and re.match(r'^\d+\.?\d*$', fields[1]):
        candidate = fields[1]
    elif len(fields) > 2:
        candidate = fields[2]
    else:
        # Too few columns to hold a rating at all
        return 0.0

    if candidate in ("", "NaN", "nan"):
        return 0.0

    rating = _parse_rating(candidate)
    if rating is not None and rating <= 5.0:
        return rating

    # Fallback for shifted rows: skip the name (field 0), find the category
    # field, and read the value that follows it
    cat_index = get_cat_index(fields[1:])
    if cat_index < 0:
        return 0.0
    rating_position = cat_index + 2  # +1 for the skipped name, +1 for "next field"
    if rating_position >= len(fields):
        return 0.0

    rating = _parse_rating(fields[rating_position])
    # Always hand back a float so downstream sorting/filtering never sees a string
    if rating is None or not 0.0 <= rating <= 5.0:
        return 0.0
    return rating

# Broadcast both helper functions through the SparkContext.
# NOTE(review): neither handle is referenced in the visible cells — the map that
# follows calls get_rating directly — confirm whether these broadcasts are needed.
get_cat_index_bc = sc.broadcast(get_cat_index)
get_rating_bc = sc.broadcast(get_rating)

In [21]:
def _name_and_rating(line):
    """Map one raw catalogue line to an (app name, rating) pair."""
    return line.split("\t")[0], get_rating(line)

# distinct() removes exact duplicate pairs only.
# NOTE(review): an app name listed with two different ratings keeps both pairs.
apps_final = apps.map(_name_and_rating).distinct()
apps_final.take(10)

[('Photo Editor & Candy Camera & Grid & ScrapBook', 4.1),
 ('Coloring book moana', 3.9),
 ('U Launcher Lite – FREE Live Cool Themes, Hide Apps', 4.7),
 ('Sketch - Draw & Paint', 4.5),
 ('Pixel Draw - Number Art Coloring Book', 4.3),
 ('Paper flowers instructions', 4.4),
 ('Smoke Effect Photo Maker - Smoke Editor', 3.8),
 ('Infinite Painter', 4.1),
 ('Garden Coloring Book', 4.4),
 ('Kids Paint Free - Drawing Fun', 4.7)]

In [22]:
# Number of distinct (app name, rating) pairs in the catalogue
apps_final.count()

9678

#### Joining Reviews & Apps

In [29]:
# Sanity check on the join keys: count how many joined rows each reviewed app produces.
# A count above 1 means the app name occurs more than once in apps_final.
left = reviews_final.leftOuterJoin(apps_final).map(lambda joined: (joined[0], 1))
leftcount = left.reduceByKey(add)
leftcount.sortBy(lambda pair: pair[1], ascending=False).take(9)

[('Blood Pressure', 2),
 ('Bubble Shooter', 2),
 ('Calculator', 2),
 ('High Blood Pressure Symptoms', 2),
 ('Call Blocker', 2),
 ('Chess Free', 2),
 ('Dairy Queen', 2),
 ('English Grammar Test', 2),
 ('Flashlight', 2)]

In [None]:
def _flatten_joined(record):
    """Turn one (name, (rating, counts)) join result into a flat 6-tuple.

    A missing rating becomes 0.0 and missing counts become (0, 0, 0, 0).
    """
    name, (rating, counts) = record
    if rating is None:
        rating = 0.0
    if counts is None:
        counts = (0, 0, 0, 0)
    return (name, rating, counts[0], counts[1], counts[2], counts[3])

# Full outer join keeps apps without reviews as well as reviewed apps without a catalogue row
apps_reviews = apps_final.fullOuterJoin(reviews_final).map(_flatten_joined)
apps_reviews.take(10)

[('Coloring book moana', 3.9, 26, 4, 14, 14),
 ('U Launcher Lite – FREE Live Cool Themes, Hide Apps', 4.7, 0, 0, 0, 0),
 ('Sketch - Draw & Paint', 4.5, 0, 0, 0, 0),
 ('Pixel Draw - Number Art Coloring Book', 4.3, 0, 0, 0, 0),
 ('Paper flowers instructions', 4.4, 0, 0, 0, 0),
 ('Tattoo Name On My Photo Editor', 4.2, 0, 0, 0, 0),
 ('3D Color Pixel by Number - Sandbox Art Coloring', 4.4, 0, 0, 0, 40),
 ('Learn To Draw Kawaii Characters', 3.2, 0, 0, 0, 0),
 ('Photo Designer - Write your name with shapes', 4.7, 0, 0, 0, 0),
 ('FlipaClip - Cartoon animation', 4.3, 4, 0, 0, 36)]

In [None]:
# For double checking the counts and quality
print(apps_reviews.count())
print(apps_reviews.filter(lambda x: x[1] is None).count())
print(apps_reviews.filter(lambda x: x[1] == 0.0).count())

9732
0
1518


In [None]:
# Order every app by its rating (second element of the row), highest first
sorted_apps_all = apps_reviews.sortBy(keyfunc=lambda row: row[1], ascending=False)

# Sorting must not change the number of rows
print(sorted_apps_all.count())
sorted_apps_all.take(10)

9732


[('Awake Dating', 5.0, 0, 0, 0, 0),
 ('Spine- The dating app', 5.0, 0, 0, 0, 0),
 ('Girls Live Talk - Free Text and Video Chat', 5.0, 0, 0, 0, 0),
 ('Online Girls Chat Group', 5.0, 0, 0, 0, 0),
 ('Speeding Joyride & Car Meet App', 5.0, 0, 0, 0, 0),
 ('SUMMER SONIC app', 5.0, 0, 0, 0, 0),
 ('Prosperity', 5.0, 0, 0, 0, 0),
 ('Super Hearing Secret Voices Recorder PRO', 5.0, 0, 0, 0, 0),
 ('Sway Medical', 5.0, 0, 0, 0, 0),
 ('Galaxies of Hope', 5.0, 0, 0, 0, 0)]

In [None]:
# Variant 1: only apps that have a rating (0.0 is the placeholder for a missing one)
sorted_apps_rat = sorted_apps_all.filter(lambda row: row[1] > 0.0)

print(sorted_apps_rat.count())
sorted_apps_rat.take(10)

8214


[('Awake Dating', 5.0, 0, 0, 0, 0),
 ('Spine- The dating app', 5.0, 0, 0, 0, 0),
 ('Girls Live Talk - Free Text and Video Chat', 5.0, 0, 0, 0, 0),
 ('Online Girls Chat Group', 5.0, 0, 0, 0, 0),
 ('Speeding Joyride & Car Meet App', 5.0, 0, 0, 0, 0),
 ('SUMMER SONIC app', 5.0, 0, 0, 0, 0),
 ('Prosperity', 5.0, 0, 0, 0, 0),
 ('Super Hearing Secret Voices Recorder PRO', 5.0, 0, 0, 0, 0),
 ('Sway Medical', 5.0, 0, 0, 0, 0),
 ('Galaxies of Hope', 5.0, 0, 0, 0, 0)]

In [None]:
# Variant 2: only apps with at least one review row of any kind
# (positive + neutral + negative + missing-sentiment counts)
sorted_apps_rev = sorted_apps_all.filter(lambda row: sum(row[2:6]) > 0)

print(sorted_apps_rev.count())
sorted_apps_rev.take(10)

1083


[('Down Dog: Great Yoga Anywhere', 4.9, 40, 0, 0, 0),
 ('CDL Practice Test 2018 Edition', 4.9, 21, 2, 1, 16),
 ('DMV Permit Practice Test 2018 Edition', 4.9, 30, 2, 2, 6),
 ('FREE LIVE TALK', 4.9, 0, 0, 0, 40),
 ('Home Workout - No Equipment', 4.8, 24, 1, 1, 14),
 ('30 Day Fitness Challenge - Workout at Home', 4.8, 27, 2, 2, 9),
 ('Home Workout for Men - Bodybuilding', 4.8, 22, 4, 0, 14),
 ('Cash, Inc. Money Clicker Game & Business Adventure', 4.8, 0, 0, 0, 40),
 ('GoodRx Drug Prices and Coupons', 4.8, 24, 10, 3, 3),
 ('FreePrints – Free Photos Delivered', 4.8, 33, 1, 2, 4)]

In [None]:
# Variant 3: apps that have both a rating and at least one review row
sorted_apps_ratrev = sorted_apps_all.filter(
    lambda row: row[1] > 0.0 and sum(row[2:6]) > 0
)

print(sorted_apps_ratrev.count())
sorted_apps_ratrev.take(10)

1028


[('Down Dog: Great Yoga Anywhere', 4.9, 40, 0, 0, 0),
 ('CDL Practice Test 2018 Edition', 4.9, 21, 2, 1, 16),
 ('DMV Permit Practice Test 2018 Edition', 4.9, 30, 2, 2, 6),
 ('FREE LIVE TALK', 4.9, 0, 0, 0, 40),
 ('Home Workout - No Equipment', 4.8, 24, 1, 1, 14),
 ('30 Day Fitness Challenge - Workout at Home', 4.8, 27, 2, 2, 9),
 ('Home Workout for Men - Bodybuilding', 4.8, 22, 4, 0, 14),
 ('Cash, Inc. Money Clicker Game & Business Adventure', 4.8, 0, 0, 0, 40),
 ('GoodRx Drug Prices and Coupons', 4.8, 24, 10, 3, 3),
 ('FreePrints – Free Photos Delivered', 4.8, 33, 1, 2, 4)]

In [None]:
# Single-row RDD holding the column names for the output files.
# NOTE(review): this rebinds `header`, which earlier held the header line (a string)
# of the reviews file — consider a distinct name to avoid confusing the two.
header = sc.parallelize([('App', 'Rating', 'Positive_Count', 'Neutral_Count', 'Negative_Count', 'NaN_Count')])

In [None]:
# Put the column-name row in front of each result set.
# NOTE: this cell reassigns its inputs, so running it twice adds the header twice.
sorted_apps_all, sorted_apps_rat, sorted_apps_rev, sorted_apps_ratrev = (
    header.union(result)
    for result in (sorted_apps_all, sorted_apps_rat, sorted_apps_rev, sorted_apps_ratrev)
)

In [None]:
def _to_csv_line(row):
    """Render one result tuple as a single CSV line without a trailing newline.

    csv.writer takes care of quoting, so app names that contain commas or
    double quotes stay in one column.
    """
    import io  # local import: only this helper needs it

    buffer = io.StringIO()
    csv.writer(buffer, lineterminator="\n").writerow(row)
    return buffer.getvalue()[:-1]  # drop the line terminator; Spark adds its own


# Write each result set as CSV text rather than as Python tuple reprs.
# saveAsTextFile creates a directory of part files at each path.
# NOTE(review): the save is expected to fail if the target path already exists —
# clear Data/Output before re-running.
for output_name, output_rdd in (
    ("sorted_apps_all", sorted_apps_all),
    ("sorted_apps_rat", sorted_apps_rat),
    ("sorted_apps_rev", sorted_apps_rev),
    ("sorted_apps_ratrev", sorted_apps_ratrev),
):
    output_rdd.map(_to_csv_line).saveAsTextFile("Data/Output/" + output_name + ".csv")