## SUD Data Analysis

#### Library Installation

In [None]:
%pip install -q google_play_scraper
%pip install -q num2words
%pip install -q nltk
%pip install -q pandas
%pip install -q beautifulsoup4

#### Imports

In [None]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import random as rnd

from google_play_scraper import app, Sort, reviews_all # google play web scraper
from bs4 import BeautifulSoup #scrape info from webpages
from num2words import num2words # changes numbers to text
nltk.download('wordnet') # download wordnet
nltk.download('stopwords') # download stopwords
from nltk.stem import WordNetLemmatizer # reduces words to their base
from nltk.corpus import stopwords as sw # tool to identify stopwords
wnl = WordNetLemmatizer()

In [99]:
def toCSV(domain):
    """Scrape every Google Play review of an app and save them as a CSV file.

    Reviews are requested for the Canadian store (country='CA'), sorted
    newest first, and written to '<domain>.csv' in the current working
    directory (UTF-8, no index column).

    Parameters
    ----------
    domain : str
        Package name of the app, e.g. "com.minar.birday".

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns 'Package Name',
        'Reviewer Name', 'Review' and 'Rating'. The frame is empty (but
        still has these columns) when the app has no reviews.
    """
    # Column headers for the CSV file
    header_list = ['Package Name', 'Reviewer Name', 'Review', 'Rating']

    norm = pd.json_normalize(reviews_all(domain, country='CA', sort=Sort.NEWEST))
    # An app without reviews normalizes to a frame with no columns at all;
    # reindex guarantees the three columns exist so the lookups below
    # cannot raise KeyError.
    norm = norm.reindex(columns=['userName', 'content', 'score'])

    df = pd.DataFrame({
        'Package Name': [domain] * len(norm),
        'Reviewer Name': norm['userName'].tolist(),
        'Review': norm['content'].tolist(),
        'Rating': norm['score'].tolist(),
    })
    df.to_csv(domain + '.csv', index=False, header=header_list, encoding='utf-8')

    return df

In [100]:
# System under discussion: scrape it first and report how many reviews it has.
sud = toCSV('com.minar.birday')
print("In our SUD (Birday - Birthday Manager) we have a total of", len(sud), "reviews\n")

# Direct competitors (other birthday reminder apps).
comp1 = toCSV('com.ioanalazar.bdays')
comp2 = toCSV('dev.cwolf.birthdaycalendar')
print("In our 1st Competitor App (Birthday Reminder & Calendar) we have a total of", len(comp1), "reviews")
print("In our 2nd Competitor App (Birthday Calendar Reminder) we have a total of", len(comp2), "reviews\n")

# Similar apps (general reminder / to-do tools).
sim1 = toCSV('net.futasaji.keep')
sim2 = toCSV('com.goreminders')
print("In our 1st Similar App (Shared To-Do List: We-Do) we have a total of", len(sim1), "reviews")
print("In our 2nd Similar App (Go Appt Reminder & Scheduling) we have a total of", len(sim2), "reviews")

In our SUD (Birday - Birthday Manager) we have a total of 866 reviews

In our 1st Competitor App (Birthday Reminder & Calendar) we have a total of 109 reviews
In our 2nd Competitor App (Birthday Calendar Reminder) we have a total of 2466 reviews

In our 1st Similar App (Shared To-Do List: We-Do) we have a total of 39 reviews
In our 2nd Similar App (Go Appt Reminder & Scheduling) we have a total of 59 reviews


#### Preprocessing Reviews Function

In [101]:
def cleanReview(review):
    """Pre-process raw review texts into normalized, lemmatized strings.

    For every review the text is:
      1. stripped of everything except ASCII letters, digits and spaces
         (punctuation, emojis and other special characters become spaces),
      2. lower-cased and split into words,
      3. rebuilt word by word: all-digit words are spelled out with
         num2words, stopwords are dropped, and every other word is
         lemmatized with the module-level WordNetLemmatizer (wnl).

    Parameters
    ----------
    review : iterable of str
        The raw reviews, e.g. the 'Review' column of a DataFrame.
        Entries that are not strings (None / NaN) are treated as empty
        reviews so the output stays index-aligned with the input.

    Returns
    -------
    list of str
        One cleaned review per input review, in the same order.
    """
    # A set gives O(1) membership tests in the per-word check below.
    stopwords = set(sw.words('english'))  # stopwords to remove
    stopwords.update(['from', 'subject', 're', 'edu', 'use'])
    non_alnum = re.compile('[^A-Za-z0-9 ]+')  # punctuation and special letters/emojis

    result = []  # final list of reviews after all the cleaning
    for text in review:
        if not isinstance(text, str):
            text = ""  # missing review: keep its slot as an empty string

        cleaned_words = []
        for word in non_alnum.sub(' ', text).lower().split():
            if word.isdigit():
                # Spell the number out; num2words emits hyphens ("twenty-one")
                # and commas ("one thousand, two hundred"), which would put
                # punctuation back into the cleaned text, so strip both.
                cleaned_words.append(num2words(word).replace('-', '').replace(',', ''))
            elif word not in stopwords:
                cleaned_words.append(wnl.lemmatize(word))

        result.append(" ".join(cleaned_words))
    return result

#### Cleaning Reviews from the SUD, Its Competitors, and Similar Alternatives

In [102]:
# System under discussion: "Birday - Birthday Manager"
clean_SUD = cleanReview(sud['Review'])

# Competitor apps: "Birthday Reminder & Calendar" and "Birthday Calendar Reminder"
clean_comp1 = cleanReview(comp1['Review'])
clean_comp2 = cleanReview(comp2['Review'])

# Similar apps: "Shared To-Do List: We-Do" and "Go Appt Reminder & Scheduling"
clean_sim1 = cleanReview(sim1['Review'])
clean_sim2 = cleanReview(sim2['Review'])

#### Output: Comparison of Raw and Pre-processed Reviews

In [None]:
# Print 15 randomly selected SUD reviews, each shown raw and then
# pre-processed. Indices are drawn independently, so a review may repeat.
for i in range(1, 16):
    random = rnd.randint(0, len(sud) - 1)
    print(f'Selected Random Review {i})')
    print(f'Original Review [{random}]: \n"{sud.Review[random]}"')
    print("")
    print(f'Pre-Processed Review [{random}]: \n"{clean_SUD[random]}"')
    if i != 15:
        # blank lines separate the examples; none after the last one
        print("\n\n")


Selected Review 1)
Original Review [679]: 
"Super"

Pre-Processed Review [679]: 
"super"



Selected Review 2)
Original Review [809]: 
"Edited 2/18. Developer went above and beyond to help me get the app working. Definitely would recommend it for keeping track of birthdays. Calendar was stuck on today's date. I could not add information. I Uninstalled it immediately"

Pre-Processed Review [809]: 
"edited two eighteen developer went beyond help get app working definitely would recommend keeping track birthday calendar stuck today date could add information uninstalled immediately"



Selected Review 3)
Original Review [354]: 
"EXCELLENT"

Pre-Processed Review [354]: 
"excellent"



Selected Review 4)
Original Review [196]: 
"Works great. Saves me on what could be forgotten birthdays."

Pre-Processed Review [196]: 
"work great save could forgotten birthday"



Selected Review 5)
Original Review [564]: 
"All good.. try to add notification/ alarm facility"

Pre-Processed Review [564]: 
"go