In [14]:
import numpy as np
import pandas as pd
import nltk
import langid
from tqdm.notebook import tqdm
from google_play_scraper import reviews_all

In [15]:
# Google Play package ids of the apps whose reviews are scraped below.
apps = [
    "no.dnb.vipps",
    "no.nrk.yr",
    "no.db.app",
    "com.norwegian.travelassistant",
    "com.tibber.android",
    "com.netflix.mediaclient",
    "no.posten.sporing.controller",
    "no.nrk.tv",
    "no.skandiabanken",
    "com.snapchat.android",
]

In [16]:
# Collect the reviews of every app in `apps` into one flat list.
raw_data = []

for app in tqdm(apps):
    # Same request for each app: lang "no", country "no", no sleep between calls.
    app_reviews = reviews_all(app, sleep_milliseconds=0, lang="no", country="no")
    print(f"{len(app_reviews)} reviews found for {app}")
    raw_data += app_reviews

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

6151 reviews found for no.dnb.vipps
1384 reviews found for no.nrk.yr
561 reviews found for no.db.app
1449 reviews found for com.norwegian.travelassistant
1067 reviews found for com.tibber.android
5102 reviews found for com.netflix.mediaclient
1023 reviews found for no.posten.sporing.controller
1681 reviews found for no.nrk.tv
1061 reviews found for no.skandiabanken
22882 reviews found for com.snapchat.android



In [232]:
def process_raw_data(raw_data, possible_codes=("no", "nb", "nn"), prob_show_rejected=0):
    """Filter scraped reviews and turn them into binary sentiment samples.

    A review is kept only if all of the following hold:
      * its "content" is not None,
      * its "score" is not 3 (neutral reviews are dropped),
      * it tokenizes into at least 3 tokens, at least one of them purely
        alphabetic (drops emoji-only and very short reviews),
      * langid classifies the text as one of ``possible_codes``.

    Args:
        raw_data: Iterable of review dicts with "content" and "score" keys.
        possible_codes: Collection of langid language codes to accept.
        prob_show_rejected: Probability of printing a rejected, non-neutral
            review, for manual inspection of the filter.

    Returns:
        List of ``(label, content)`` tuples, where ``label`` is 1 when the
        score is above 3 and 0 otherwise.
    """
    clean_data = []

    for review in tqdm(raw_data):
        content = review["content"]
        if content is None:
            continue

        score = review["score"]
        words = nltk.word_tokenize(content)
        # Check if there is at least one actual word
        has_alpha = any(word.isalpha() for word in words)

        # The language check is last so the comparatively expensive classifier
        # only runs for reviews that passed the cheaper checks.
        if (score != 3 and                                  # Remove neutral reviews
                has_alpha and                               # Remove reviews where there are only emojis
                len(words) >= 3 and                         # Remove really short reviews
                langid.classify(content)[0] in possible_codes):  # Remove other languages
            clean_data.append((int(score > 3), content))  # Convert score to binary
        elif np.random.binomial(1, p=prob_show_rejected) and score != 3:
            # Print some rejected examples (except neutral)
            print("Rejected ->", content)
    return clean_data

In [233]:
# Filter the scraped reviews into (label, content) samples; each rejected
# non-neutral review is printed with probability 0.001 as a sanity check.
data = process_raw_data(raw_data, prob_show_rejected=0.001)

HBox(children=(FloatProgress(value=0.0, max=42361.0), HTML(value='')))

Rejected -> Takk
Rejected -> Ubrukelig, kræsjer konstant
Rejected -> bra
Rejected -> Braaa
Rejected -> Treg, klikker ofte
Rejected -> Gøy



In [234]:
# Spot-check one randomly chosen processed sample.
# NOTE(review): no seed is set in the visible cells, so the pick differs per run.
data[np.random.randint(len(data))]

(0,
 'Virker ikke lenger på min enhet, påstår at jeg har usikre apper installert. Alt annet funker, inkludert google pay og PayPal.')

In [235]:
# Number of samples that survived the filtering in process_raw_data.
len(data)

22926

In [240]:
# Write the (label, content) tuples as a two-column CSV with no header row
# and no index column.
pd.DataFrame(data).to_csv("app_reviews_norwegian.csv", index=False, header=False)