## Importing and Downloading Necessary Libraries

In [None]:
!pip install google_play_scraper

Collecting google_play_scraper
  Downloading google_play_scraper-1.2.4-py3-none-any.whl (28 kB)
Installing collected packages: google_play_scraper
Successfully installed google_play_scraper-1.2.4


In [None]:
import pandas as pd

from google_play_scraper import app, Sort, reviews

from pprint import pprint

import datetime as dt
from tzlocal import get_localzone

import random
import time

## Defining App

In [None]:
# Display name used in the log banner.
app_name = 'Duolingo'
# Bare Google Play package id. The locale is chosen through the scraper's
# `lang` / `country` arguments; appending "&hl=en" to the id ends up verbatim in
# the returned 'appId' field.
app_id = 'com.duolingo'

In [None]:
# Fetch the store listing for the app and keep it in a list of listings.
app_info = []

info = app(app_id)
# Drop the 'comments' field when present; pop() with a default avoids a KeyError
# if the scraper does not return that key.
info.pop('comments', None)
app_info.append(info)


pprint(app_info)

[{'adSupported': True,
  'appId': 'com.duolingo&hl=en',
  'categories': [{'id': 'EDUCATION', 'name': 'Education'}],
  'containsAds': True,
  'contentRating': 'Everyone',
  'contentRatingDescription': None,
  'currency': 'USD',
  'description': 'Learn a new language with the world’s most-downloaded '
                 'education app! Duolingo is the fun, free app for learning '
                 '40+ languages through quick, bite-sized lessons. Practice '
                 'speaking, reading, listening, and writing to build your '
                 'vocabulary and grammar skills.\r\n'
                 '\r\n'
                 'Designed by language experts and loved by hundreds of '
                 'millions of learners worldwide, Duolingo helps you prepare '
                 'for real conversations in Spanish, French, Chinese, Italian, '
                 'German, English, and more.\r\n'
                 '\r\n'
                 'Whether you’re learning a language for travel, school, '
      

## Retrieving Google Play Reviews

## Looping for more Reviews

In [None]:
# Timestamp of the start of the scraping run, in the local timezone.
start = dt.datetime.now(tz=get_localzone())
# 12-hour clock to agree with the AM/PM marker. "%T %p" combined a 24-hour time
# with AM/PM (e.g. "13:33:11 PM"), and "%T" is a platform-specific directive.
fmt = "%m/%d/%y - %I:%M:%S %p"

print('---'*20)
print('---'*20)
print(f'***** {app_name} started at {start.strftime(fmt)}')
print()

app_reviews = []
count = 200      # passed as `count` to every reviews() request
batch_num = 0    # incremented once per fetched batch

------------------------------------------------------------
------------------------------------------------------------
***** Duolingo started at 08/17/23 - 13:33:11 PM



In [None]:
# Scrape reviews batch by batch, collecting the text and the star rating of each one.
content_list = []
score_list = []

# There is no continuation token before the first request; every call returns
# the token to pass to the next one.
token = None

for batch in range(100):
  rvws, token = reviews(
              app_id = 'com.duolingo',
              lang='en',
              country='us',
              sort=Sort.NEWEST,
              count=count,
              continuation_token=token
          )

  # Nothing came back: stop instead of sleeping through the remaining batches.
  if not rvws:
    break

  # Take text and score from the SAME review in one pass so the two lists stay
  # aligned row by row (and therefore always have equal length).
  for review in rvws:
      if 'content' in review and 'score' in review:
        content_list.append(review['content'])
        score_list.append(review['score'])

  batch_num +=1
  # Random 1-5 second pause between consecutive requests.
  time.sleep(random.randint(1,5))

In [None]:
# Peek at a few raw records only: a full batch is long, and every record carries
# the reviewer's name and avatar URL.
pprint(rvws[:3])

[{'appVersion': '5.115.4',
  'at': datetime.datetime(2023, 8, 12, 13, 23, 30),
  'content': 'Really challenging way to learn a language. I am currently '
             'learninv napaneese and its going great. Really 😊happy',
  'repliedAt': None,
  'replyContent': None,
  'reviewCreatedVersion': '5.115.4',
  'reviewId': 'ffd5ab76-128c-4779-8fdc-0b6262f66d00',
  'score': 5,
  'thumbsUpCount': 0,
  'userImage': 'https://play-lh.googleusercontent.com/a-/AD_cMMSlClrJGNaC4sy4GRWBmHCX7U0l7La0rMHy7dTxaY0eyH0',
  'userName': 'Johan J Anil'},
 {'appVersion': '5.114.3',
  'at': datetime.datetime(2023, 8, 12, 13, 21, 48),
  'content': '👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍',
  'repliedAt': None,
  'replyContent': None,
  'reviewCreatedVersion': '5.114.3',
  'reviewId': 'efbedc15-16fd-485d-9d20-7fd54e777d73',
  'score': 5,
  'thumbsUpCount': 0,
  'userImage': 'https://play-lh.googleusercontent.com/a/AAcHTtfVlvKWPwh3pw90Jr556sNKaRyjyiCWtM7gxYgb3_na=mo',
  'userName': 'Tapsi Yadav'},
 {'appVersion': '5.115.4',
  'at': da

In [None]:
# Assemble the scraped columns into one table with a 1-based running id.
review_ids = range(1, len(content_list) + 1)
df = pd.DataFrame({
    'id': review_ids,
    'review': content_list,
    'score': score_list,
})
df.head()

Unnamed: 0,id,review,score
0,1,it's pretty good,5
1,2,the app is great for learning languages,5
2,3,This is my fav app and i love this app In this...,5
3,4,love it.,5
4,5,It's very fun to play we can pronounce write v...,5


In [None]:
# (rows, columns) of the assembled review table.
df.shape

(5000, 3)

## Sentiment Analysis

In [None]:
import numpy as np
pd.options.mode.chained_assignment = None

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

import re
import nltk
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter

# Lexicon for the VADER SentimentIntensityAnalyzer used in the sentiment cells.
nltk.download('vader_lexicon')

# English stop-word list, plus tokenizer data (presumably what nltk.word_tokenize
# relies on) for the word-frequency analysis.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Splitting Data into training and validation sets

In [None]:
# Hold out 20% of the reviews for validation; random_state fixes the shuffle.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

Defining a Transformer Class to Pre-Process the Reviews

In [None]:
class ReviewPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises the text columns of a DataFrame.

    Each selected column is lower-cased and every run of characters other than
    ASCII letters, digits and spaces is replaced by a single space. Missing
    values become empty strings instead of the literal text "nan"/"none".

    Parameters
    ----------
    columns : list of str, optional
        Columns to clean. When None (the default) only object/string-typed
        columns are cleaned, so numeric columns such as ids or ratings keep
        their dtype instead of being converted to strings.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified."""
        X_transformed = X.copy()

        if self.columns is None:
            text_columns = [
                col for col in X_transformed.columns
                if pd.api.types.is_object_dtype(X_transformed[col])
                or pd.api.types.is_string_dtype(X_transformed[col])
            ]
        else:
            text_columns = list(self.columns)

        for col in text_columns:
            raw = X_transformed[col]
            cleaned = (
                raw.astype(str)
                .str.lower()
                .str.replace(r'[^A-Za-z0-9 ]+', ' ', regex=True)
            )
            # Rows that were missing in the input end up as '' rather than as
            # the string form of the missing marker.
            X_transformed[col] = cleaned.where(raw.notna(), '')

        return X_transformed

In [None]:
# Single-step pipeline: the review text cleaner is its only stage.
text_cleaning_step = ('text_preprocessor', ReviewPreprocessor())
preprocessing_pipeline = Pipeline(steps=[text_cleaning_step])

In [None]:
# Fit on the training split only, then reuse the fitted pipeline on the
# validation split. Newer scikit-learn releases complain when transform() is
# called on a pipeline that was never fitted.
train_df = preprocessing_pipeline.fit_transform(train_df)
val_df = preprocessing_pipeline.transform(val_df)

train_df.head()

Unnamed: 0,id,review,score
4227,4228,great experience,5
4676,4677,ma sha allah this app is really amazing it hel...,5
800,801,just getting started day 2 it is going reall...,5
3671,3672,best,5
4193,4194,very good way to learn i love this,5


Analyzing Reviews by Predicting Sentiments

In [None]:
# Shared VADER analyzer instance used by the scoring helpers in this cell.
sia = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Label ``text`` as 'positive', 'negative' or 'neutral'.

    The label is derived from the analyzer's compound score: 0.05 or higher is
    positive, -0.05 or lower is negative, anything in between is neutral.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

def get_sentiment_score_specific(text):
    """Return the analyzer's compound polarity score for ``text``.

    The same raw value is returned whatever sentiment band the score falls in,
    so no branching on the thresholds is needed.
    """
    return sia.polarity_scores(text)['compound']


# Attach the categorical label and the raw compound score to every training review.
for column_name, scorer in (
    ('sentiment', get_sentiment_score),
    ('sentiment_score', get_sentiment_score_specific),
):
    train_df[column_name] = train_df['review'].apply(scorer)

In [None]:
# Number of training reviews per predicted sentiment label.
sentiment_counts = train_df['sentiment'].value_counts()

print(sentiment_counts)

positive    3361
neutral      592
negative      47
Name: sentiment, dtype: int64


In [None]:
# Number of training reviews per value of the 'score' column.
score_counts = train_df['score'].value_counts()

print(score_counts)

5    4000
Name: score, dtype: int64


In [None]:
# First 20 training rows, now including the sentiment label and compound score.
train_df.head(20)

Unnamed: 0,id,review,score,sentiment,sentiment_score
4227,4228,great experience,5,positive,0.6249
4676,4677,ma sha allah this app is really amazing it hel...,5,positive,0.7939
800,801,just getting started day 2 it is going reall...,5,positive,0.7818
3671,3672,best,5,positive,0.6369
4193,4194,very good way to learn i love this,5,positive,0.8122
2968,2969,nice enviroment like a game,5,positive,0.6486
4793,4794,the best it feels mandatory to learn and to p...,5,positive,0.8807
4368,4369,makes language learning easy and fun,5,positive,0.7351
2776,2777,very good app i like it this very useful this app,5,positive,0.8462
2970,2971,i now know how to tell people that i eat cats ...,5,neutral,0.0


In [None]:
# Training reviews that were labelled as negative.
is_negative = train_df['sentiment'].eq('negative')
negative_rows = train_df.loc[is_negative]
negative_rows

Unnamed: 0,id,review,score,sentiment,sentiment_score
2495,2496,it s good but the ads is so annoying and it s like a game,5,negative,-0.1373
3322,3323,good app but i lost my streak of 16 weeks today and i can t find my mom dad and sister now,5,negative,-0.25
356,357,the app does not open i have struggled since yesterday what is wrong,5,negative,-0.2649
452,453,it s very nice for learning different languages the first time i heard about it i didn t too much about it but after some time i don t regret downloading it,5,negative,-0.3926
2304,2305,this is an excellent app for learning all areas of french but they have begun to use loud and annoying ads which i think cheapens their brand,5,negative,-0.296
3231,3232,green bird is pointing a gun to my head i had no choice,5,negative,-0.5574
3869,3870,extremely practical app uses repetitive questions to memorize all words and pointing your mistakes to learn more,5,negative,-0.5423
557,558,the killer owl memes made me start but learning german made me stay,5,negative,-0.3919
3011,3012,you can t stop learning,5,negative,-0.296
2955,2956,i am learning spanish so it s working i now know how to ask for an apple which has been a problem in the past whoops i meant manzana,5,negative,-0.4019


## Exploring Results

In [None]:
# Show long reviews in full. The fully qualified option name is used because
# pandas resolves abbreviated names by pattern matching, which can stop working
# if another option with a similar name is ever added.
pd.set_option('display.max_colwidth', 1000)

# Positive reviews whose compound score is above 0.9.
ultra_positive = train_df[(train_df['sentiment'] == 'positive') & (train_df['sentiment_score'] > 0.9)]
ultra_positive.head()

Unnamed: 0,id,review,score,sentiment,sentiment_score
2264,2265,i very much like how you got to learn language for free i appreciate the continuous upgrade the developer and his team keep up the good works thank you for being people closer by offering free language lessons,5,positive,0.9475
3017,3018,this is a very good learning app i m learning spanish but sometimes some words a southern american rarely i have difficulty remembering and being motivated to actually do a lesson so i make sure to keep notifications on i also think sometimes i find the legendary levels to be a bit hard since they always give you words and you only have to rearrange them affing to the difficulty could be beneficial overall its helped me learn more spanish for my family,5,positive,0.9503
1215,1216,i love this app i have really learned alot in my desired language and it s easy and fun,5,positive,0.91
3997,3998,it s really great even better than a language class you can learn languages free it s really fun if you don t practice duolingo will send you emails,5,positive,0.9377
432,433,duolingo provides a lot of practice repetition of vocabulary as well as allows you to practice listening reading writing and speaking skills there are some fun games you can play to master vocabulary and grammar you can also compete with fellow students and move through leagues from week to week there is a point and rewards system to keep you interested i would recommend keeping notes to refer to from the grammar notes and or daily practices have fun,5,positive,0.9545


In [None]:
# Negative reviews whose compound score is below -0.5.
labelled_negative = train_df['sentiment'] == 'negative'
strongly_negative = train_df['sentiment_score'] < -0.5
ultra_negative = train_df[labelled_negative & strongly_negative]
ultra_negative.head()

Unnamed: 0,id,review,score,sentiment,sentiment_score
3231,3232,green bird is pointing a gun to my head i had no choice,5,negative,-0.5574
3869,3870,extremely practical app uses repetitive questions to memorize all words and pointing your mistakes to learn more,5,negative,-0.5423
2476,2477,the internet is very poor,5,negative,-0.5256
4329,4330,apparently if you use the app on your computer you don t have to worry about losing stars which is somewhat irritating since when you run out you have to practice to earn more and continue moving through the program it is a good program to learn from but i admit i m not starting from scratch i am trying to re learn from my high school and college days i haven t tried to start a program with no prior knowledge so i can t speak to that aspect,5,negative,-0.5029
1579,1580,ich sch tze diesen app wirklich habe schon viel gelernt und ist mir eine gro e hilfe mit die landessprache sprache mehr bekannt zu werden,5,negative,-0.5994


In [None]:
# Mean review length, in characters, for each sentiment label.
review_lengths = train_df['review'].str.len()
average_lengths = (
    review_lengths
    .groupby(train_df['sentiment'])
    .mean()
    .rename('average_review_length')
    .reset_index()
)
print(average_lengths)

  sentiment  average_review_length
0  negative             119.234043
1   neutral              25.869932
2  positive              56.905385


Most common words in negative reviews

In [None]:
# Count the most frequent alphanumeric, non-stop-word tokens across the reviews
# labelled as negative.
negative_reviews = train_df[train_df['sentiment'] == 'negative']['review']

stop_words = set(stopwords.words('english'))

all_words = []
for review in negative_reviews:
    for word in nltk.word_tokenize(review):
        lowered = word.lower()
        if word.isalnum() and lowered not in stop_words:
            all_words.append(lowered)

most_common_words = Counter(all_words).most_common(10)

print(most_common_words)