In [1]:
import numpy as np
import pandas as pd
import nltk
import langid
from tqdm.notebook import tqdm
from google_play_scraper import reviews_all

In [2]:
# Google Play package identifiers of the apps whose reviews are downloaded.
apps = [
    "no.dnb.vipps",
    "no.nrk.yr",
    "no.db.app",
    "com.norwegian.travelassistant",
    "com.tibber.android",
    "com.netflix.mediaclient",
    "no.posten.sporing.controller",
    "no.nrk.tv",
    "no.skandiabanken",
    "com.snapchat.android",
]

In [3]:
# Fetch all reviews for every app (requested with lang="no", country="no")
# and pool them into one flat list.
raw_data = []

for app in tqdm(apps):
    app_reviews = reviews_all(app, lang="no", country="no", sleep_milliseconds=0)
    print(f"{len(app_reviews)} reviews found for {app}")
    raw_data += app_reviews

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

6151 reviews found for no.dnb.vipps
1385 reviews found for no.nrk.yr
563 reviews found for no.db.app
1449 reviews found for com.norwegian.travelassistant
1067 reviews found for com.tibber.android
5103 reviews found for com.netflix.mediaclient
1023 reviews found for no.posten.sporing.controller
1682 reviews found for no.nrk.tv
1061 reviews found for no.skandiabanken
22883 reviews found for com.snapchat.android



In [17]:
def process_raw_data(raw_data, possible_codes=("no", "nb", "nn"), prob_show_rejected=0):
    """Filter scraped reviews and turn them into binary-labelled text.

    A review is kept only when all of the following hold:

    * its ``"content"`` is not ``None``;
    * its ``"score"`` is not 3 (neutral reviews are dropped);
    * at least one token is purely alphabetic (drops e.g. emoji-only reviews);
    * ``nltk.word_tokenize`` yields at least three tokens;
    * the first element returned by ``langid.classify(content)`` is in
      ``possible_codes``.

    Line breaks inside a review are replaced by a single space, so the words
    on either side of the break remain separate and the text stays on one line.

    Args:
        raw_data: Iterable of mappings that have ``"content"`` and ``"score"`` keys.
        possible_codes: Collection of accepted language codes.
        prob_show_rejected: Probability with which each rejected, non-neutral
            review is printed for inspection (0 disables printing).

    Returns:
        A list of ``(label, content)`` tuples, where ``label`` is 1 when the
        score is greater than 3 and 0 otherwise.
    """
    clean_data = []

    for review in tqdm(raw_data):
        content = review["content"]
        if content is None:
            continue

        # Join lines with a space: deleting the line break outright would glue
        # the last word of one line to the first word of the next.
        content = " ".join(content.splitlines())
        score = review["score"]
        words = nltk.word_tokenize(content)
        # Check if there is at least one actual word
        has_alpha = any(word.isalpha() for word in words)

        if (score != 3 and                                  # Remove neutral reviews
            has_alpha and                                   # Remove reviews where there are only emojis
            len(words) >= 3 and                             # Remove really short reviews
            langid.classify(content)[0] in possible_codes): # Remove reviews written in other languages
            # Keep review, converting the score to a binary label
            clean_data.append((int(score > 3), content))
        elif np.random.binomial(1, p=prob_show_rejected) and score != 3:
            # Print some rejected examples (except neutral)
            print("Rejected ->", content)
    return clean_data

In [18]:
# Clean the scraped reviews, printing roughly 0.1% of the rejected ones as a spot check.
data = process_raw_data(
    raw_data=raw_data,
    prob_show_rejected=0.001,
)

HBox(children=(FloatProgress(value=0.0, max=42367.0), HTML(value='')))

Rejected -> Kjempe app 👍👍
Rejected -> Meget bra
Rejected -> Den er god
Rejected -> Super
Rejected -> The new update sucks
Rejected -> Appen kræsjer hele tiden!
Rejected -> Vil
Rejected -> Snapchat is one of my favorite apps on ny phone, but often when I take a snap and press on the screen to type, the text box ends up behind the keyboard so I have to put the keyboard down and up a few times before the text box ends up where it should be do I can see what I write. Not a big issue, but can be annoying   - from a Galaxy Note 8 user
Rejected -> Bra
Rejected -> SUPERT OG DEILIG. SOVE LENGE!!
Rejected -> Wow😍
Rejected -> Verdens beste app!!!
Rejected -> Veldig bra
Rejected -> Hvor lang tid skal det ta????
Rejected -> Keyboard doesn't work.
Rejected -> Dritt
Rejected -> Eige



In [19]:
# Show one randomly chosen (label, text) pair from the cleaned data.
data[np.random.randint(low=0, high=len(data))]

(0,
 'Den er bra men den nye oppdateringen gungerer ikke på samsung:-( fiks det plz')

In [20]:
# Number of reviews that survived filtering.
len(data)

22930

In [21]:
# Write the (label, text) pairs to disk without an index column or a header row.
pd.DataFrame(data).to_csv(
    "app_reviews_norwegian.csv",
    header=False,
    index=False,
)