# App review sentiment over time

In [5]:
# import packages
import pandas as pd
import numpy as np







In [None]:
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import datetime
from google_play_scraper import Sort, reviews, app
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [18]:
from top2vec import Top2Vec

In [None]:
# default figure size for every plot, followed by the seaborn theme
plt.rc('figure', figsize=[12, 10])
sns.set_theme()

Scrape United States (country = 'us') Google Play reviews for the apps in `app_list` below

In [None]:
# list of companies to scrape
# Each entry is the app identifier handed to google_play_scraper's app() and
# reviews() calls in the scraping cells.
app_list = [
    'com.thredup.android',
    'com.poshmark.app',
    'com.depop',
    'com.tradesy.android',
    'fr.vestiairecollective',
    'fr.vinted'
]



In [None]:
# Scrape the store metadata for each company in app_list (English, US store).
app_info = []

for app_id in tqdm(app_list):
    info = app(app_id, lang='en', country='us')
    # The 'comments' entry is not used in this analysis. pop() with a default
    # avoids a KeyError if the scraper's response has no such key.
    info.pop('comments', None)
    app_info.append(info)

In [None]:
# helper that pretty-prints a JSON-like object
def print_json(json_object):
    """Print *json_object* as JSON text indented by four spaces.

    Values the json module cannot serialise on its own are converted with
    ``str`` (``default=str``) instead of raising.
    """
    print(json.dumps(json_object, indent=4, default=str))

In [None]:
# inspect the scraped metadata of the fourth entry (index 3) of app_info
print_json(app_info[3])

In [None]:
# scrape app reviews
app_reviews = []

# Number of reviews requested per (app, star rating, sort order) query.
# The original expression `20000 if score == 3 else 20000` always gave 20000.
reviews_per_query = 20000

# Each sort order is paired with the label stored on the reviews it returns.
sort_orders = [
    (Sort.MOST_RELEVANT, 'most_relevant'),
    (Sort.NEWEST, 'newest'),
]

# NOTE(review): the same review can presumably come back under both sort
# orders; consider de-duplicating on 'reviewId' before counting - confirm.
for ap in tqdm(app_list):
    for score in range(1, 6):  # star ratings 1 through 5
        for sort_order, sort_label in sort_orders:
            rvs, _ = reviews(
                ap,
                lang='en',
                country='us',  # may want to look at other regions to get bigger picture
                sort=sort_order,
                count=reviews_per_query,
                filter_score_with=score,
            )
            # tag every review with how it was fetched and which app it belongs to
            for r in rvs:
                r['sortOrder'] = sort_label
                r['appId'] = ap
            app_reviews.extend(rvs)
    

In [6]:
# read in csv file of scraped reviews
# NOTE: this rebinds app_reviews to a DataFrame, replacing the list of review
# dicts that the scraping cell builds under the same name.
app_reviews = pd.read_csv('app_reviews.csv')

In [None]:
# check what a review object looks like
# app_reviews is a list of dicts right after scraping, but a DataFrame once it
# has been reloaded from the CSV. On a DataFrame, app_reviews[0] is a column
# lookup and raises KeyError, so take the first row by position instead.
if isinstance(app_reviews, pd.DataFrame):
    first_review = app_reviews.iloc[0].to_dict()
else:
    first_review = app_reviews[0]
print_json(first_review)

In [7]:
# number of reviews
# (number of rows when app_reviews is the DataFrame read from the CSV,
# number of elements when it is the scraped list)
print(len(app_reviews))

# convert into dataframe
# pd.DataFrame() is given app_reviews as-is, whichever of the two forms it has
app_df = pd.DataFrame(app_reviews)



212255


In [None]:
# write to csv file so don't have to run the scrape every time (+15 min)
#app_df.to_csv('app_reviews.csv',index=False)

In [None]:
# show the first row to see which columns each review carries
app_df.head(1)

In [None]:
# column dtypes and non-null counts, used to decide which columns to drop
app_df.info()

In [8]:
# Remove the reply-related columns and the other columns that are not needed
# for the score / appId analysis, because they have many NaNs or are not relevant.
unused_columns = [
    'replyContent',
    'repliedAt',
    'userName',
    'userImage',
    'reviewCreatedVersion',
    'sortOrder',
]
clean_df = app_df.drop(columns=unused_columns)

In [None]:
# dtypes and non-null counts of the columns that remain after the drop
clean_df.info()
# reviewCreatedVersion has some nulls, but the column may be useful because it contains the app's version.
# NOTE: it is one of the columns dropped when clean_df is built, so it is only available on app_df.

In [None]:
# possible future analysis -> review created version for each app
# maybe group by app version and get sentiment for each version


In [None]:
# count how many reviews were pulled from each app, per star rating
# (printed explicitly: only the last expression of a cell is displayed, so the
# bare groupby expression showed nothing)
reviews_per_app_score = clean_df.groupby(['appId', 'score'])['reviewId'].count()
print(reviews_per_app_score)

# bar chart of the number of reviews per star rating across all apps
p = clean_df.groupby('score')['reviewId'].count().plot(kind='bar')

# Series.plot() returns a matplotlib Axes, which has no savefig();
# the image has to be written through the Figure that owns the Axes.
p.get_figure().savefig('stars.png')

In [None]:

# Monthly review counts per app.
p_df = clean_df.copy()

# bucket every review into a combined month-year key (a monthly Period)
review_months = pd.to_datetime(p_df['at']).dt.to_period('M')
p_df['month_year'] = review_months

p_df2 = p_df[['month_year', 'appId']].copy()

# reset_index() turns the old row index into an 'index' column; counting that
# column per (month, app) group gives the number of reviews in the group
grouped = p_df2.reset_index().groupby(['month_year', 'appId'], as_index=False)
count_df = grouped.count().rename(columns={'index': 'count'})

count_df = count_df.sort_values(['appId', 'month_year'])
count_df.head()

# extract the depop rows and index them by month, ready for plotting
is_depop = count_df['appId'] == 'com.depop'
depop_count = count_df.loc[is_depop]
depop_count.index = depop_count['month_year']


# depop_count.plot()


In [None]:
# plot_df[:,0:2]

In [None]:
# subplot for each app: number of reviews per month
plot_df = pd.pivot_table(count_df,
                         index='month_year', columns='appId', values='count')

# A month that is missing for an app means no reviews were collected for it in
# that month, so the count is 0. Forward-filling would repeat the previous
# month's count and invent reviews that do not exist.
plot_df = plot_df.fillna(0)

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))

# first three apps go in the left column of subplots, the rest in the right
df1 = plot_df.iloc[:, 0:3]
df2 = plot_df.iloc[:, 3:]

plt.style.use('ggplot')
df1.plot(ax=axes[:, 0], subplots=True)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name is installed.
dark_palette_style = 'seaborn-dark-palette'
if dark_palette_style not in plt.style.available:
    dark_palette_style = 'seaborn-v0_8-dark-palette'
plt.style.use(dark_palette_style)
df2.plot(ax=axes[:, 1], subplots=True)

fig.supxlabel('Time')
fig.supylabel('Number of Reviews')


plt.show()

In [None]:
# write whichever figure `fig` currently refers to out as a PNG
fig.savefig("num_rev_time.png")

In [None]:
# largest 'at' value per app (presumably the time of the newest review;
# 'at' is the column parsed with pd.to_datetime elsewhere in this notebook)
clean_df.groupby(['appId'])['at'].max()

# TODO: fill in missing dates with zero

In [None]:
# plot number of comments over time
# need to fix number of tick marks

# p = sns.relplot(
#     data = count_df,
#     x = count_df['month_year'].astype(str), y = 'count', col = "appId", hue = 'appId',
#     kind = "line", linewidth = 4, zorder = 2,
#     col_wrap = 2, height = 4, aspect = 2, legend = False
# )


In [None]:


# now in log scale
# p = sns.relplot(
#     data = count_df,
#     x = count_df['month_year'].astype(str), y = 'count', col = "appId", hue = 'appId',
#     kind = "line", linewidth = 4, zorder = 2,
#     col_wrap = 2, height = 4, aspect = 2, legend = False
# )

# start, end = p.get_xlim()
# p.set_axis_labels("Time", "Number of Reviews")

# iterate over axes of FacetGrid
# for ax in p.axes.flat:
#     labels = ax.get_xticklabels() # get x labels
#     for i,l in enumerate(labels):
#         if(i%12 == 0): labels[i] = '' # skip even labels
#     ax.set_xticklabels(labels, rotation=30) # set new labels
# plt.show()

#p.set(xticks=count_df['month_year'].astype(str)[2::50])
#p.set_xticks(range(len(count_df)/12))#, labels=range(2011, 2019))
#plt.xticks(np.arange(min(x), max(x)+1, 1.0))

# p.set_xticklabels(rotation=45)


# #p.set_xticks(np.arange(min(count_df['month_year']),max(count_df['month_year']), 12))

# p.set(yscale="log")


In [None]:
# mean star rating per app per day - simple sentiment analysis
# (the rolling averages are derived from this table afterwards)

# copy of p_df with every review bucketed into its calendar day
review_days = pd.to_datetime(p_df['at']).dt.to_period('D')
roll_df = p_df.assign(day_month_year=review_days)

roll_df2 = roll_df.loc[:, ['appId', 'day_month_year', 'score']].copy()


#roll_df2 = roll_df2.sort_values(by=['appId', 'day_month_year'])
daily_mean = (
    roll_df2.groupby(['appId', 'day_month_year'])['score']
    .mean()
    .reset_index()
)

In [None]:
# plot mean review scores - daily, one subplot per app
plot2_df = pd.pivot_table(daily_mean,
                          index='day_month_year', columns='appId', values='score')

# carry the last known daily mean forward over days on which an app has no reviews
plot2_df = plot2_df.ffill()

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))

# first three apps go in the left column of subplots, the rest in the right
df1 = plot2_df.iloc[:, 0:3]
df2 = plot2_df.iloc[:, 3:]

plt.style.use('ggplot')
df1.plot(ax=axes[:, 0], subplots=True)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name is installed.
dark_palette_style = 'seaborn-dark-palette'
if dark_palette_style not in plt.style.available:
    dark_palette_style = 'seaborn-v0_8-dark-palette'
plt.style.use(dark_palette_style)
df2.plot(ax=axes[:, 1], subplots=True)

fig.supxlabel('Time')
fig.supylabel('Daily Average Review Rating')

plt.show()
fig.savefig('daily_avg_star.png')

In [None]:

# daily mean review score, drawn as one line plot per app
# the day Periods are converted to strings for the x axis
day_labels = daily_mean['day_month_year'].astype(str)

score_p = sns.relplot(
    data=daily_mean,
    x=day_labels,
    y='score',
    col="appId",
    hue='appId',
    kind="line",
    linewidth=2,
    zorder=2,
    col_wrap=2,
    height=4,
    aspect=2,
    legend=False,
)


In [None]:
# first five rows of the per-app, per-day mean score table
daily_mean.head(5)

In [None]:
# smooth, take 2 week rolling average - include in presentation
# transform() returns values on daily_mean's own index, so each rolling mean is
# written to the row it was computed from; no positional re-alignment is needed.
# The window is 14 rows per app, and daily_mean has one row per app and day
# with reviews, so it equals two calendar weeks only when no days are missing.
daily_mean['2wk_avg'] = (
    daily_mean.groupby('appId')['score']
    .transform(lambda scores: scores.rolling(14).mean())
)

# plot
plot2_df = pd.pivot_table(daily_mean,
                          index='day_month_year', columns='appId', values='2wk_avg')

# carry the last known average forward over days on which an app has no row
plot2_df = plot2_df.ffill()

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))

# first three apps go in the left column of subplots, the rest in the right
df1 = plot2_df.iloc[:, 0:3]
df2 = plot2_df.iloc[:, 3:]

plt.style.use('ggplot')
df1.plot(ax=axes[:, 0], subplots=True)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name is installed.
dark_palette_style = 'seaborn-dark-palette'
if dark_palette_style not in plt.style.available:
    dark_palette_style = 'seaborn-v0_8-dark-palette'
plt.style.use(dark_palette_style)
df2.plot(ax=axes[:, 1], subplots=True, kind='line')

fig.supxlabel('Time')
fig.supylabel('2 Week Average Review Rating')

plt.show()
fig.savefig('2wk_avg_star.png')

In [None]:
# 14-row rolling mean of the daily score per app, drawn with seaborn - include in presentation
rolling_scores = daily_mean.groupby('appId').rolling(14)['score'].mean()
daily_mean['2wk_avg'] = rolling_scores.reset_index(drop=True)

# the day Periods are converted to strings for the x axis
day_labels = daily_mean['day_month_year'].astype(str)

moving_score_p = sns.relplot(
    data=daily_mean,
    x=day_labels,
    y='2wk_avg',
    col="appId",
    hue='appId',
    kind="line",
    linewidth=2,
    zorder=2,
    col_wrap=2,
    height=4,
    aspect=2,
    legend=False,
)

In [None]:
# now smooth, take 30 day rolling average - don't include in presentation
# transform() returns values on daily_mean's own index, so each rolling mean is
# written to the row it was computed from. The window is 30 rows per app.
daily_mean['month_avg'] = (
    daily_mean.groupby('appId')['score']
    .transform(lambda scores: scores.rolling(30).mean())
)

# plot
plot3_df = pd.pivot_table(daily_mean,
                          index='day_month_year', columns='appId', values='month_avg')

# carry the last known average forward over days on which an app has no row
plot3_df = plot3_df.ffill()

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))

# Slice plot3_df (the 30-row average). The original sliced plot2_df here and
# therefore redrew the 2 week average instead of the data computed above.
df1 = plot3_df.iloc[:, 0:3]
df2 = plot3_df.iloc[:, 3:]

plt.style.use('ggplot')
df1.plot(ax=axes[:, 0], subplots=True)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name is installed.
dark_palette_style = 'seaborn-dark-palette'
if dark_palette_style not in plt.style.available:
    dark_palette_style = 'seaborn-v0_8-dark-palette'
plt.style.use(dark_palette_style)
df2.plot(ax=axes[:, 1], subplots=True, kind='line')

# label the figure the same way as the other rating plots
fig.supxlabel('Time')
fig.supylabel('30 Day Average Review Rating')

plt.show()

# moving_month_score_p = sns.relplot(
#     data = daily_mean,
#     x = daily_mean['day_month_year'].astype(str), y = 'month_avg', col = "appId", hue = 'appId',
#     kind = "line", linewidth = 2, zorder = 2,
#     col_wrap = 2, height = 4, aspect = 2, legend = False
# )

In [None]:
# VADER sentiment analysis - designed and trained for social media texts
# A single analyser instance is created here and reused by calculate_sentiment.
sentAnalyser = SentimentIntensityAnalyzer()

In [None]:
# helper that scores a single review text
def calculate_sentiment(comment):
    """Return the VADER 'compound' score for *comment*.

    The text is passed to the module-level ``sentAnalyser`` and only the
    'compound' entry of the resulting scores is returned.
    """
    return sentAnalyser.polarity_scores(comment)['compound']

In [None]:
# check if there are any missing comments or timestamps
# Missing values loaded from the CSV are NaN, not None, so `== None` and
# `is None` never match them and always report 0. isna() catches both.
print(clean_df['content'].isna().sum())
print(clean_df['at'].isna().sum())

# keep only reviews that have text; .copy() makes the result an independent
# frame, so adding columns to it does not trigger a SettingWithCopyWarning
clean_df = clean_df[clean_df['content'].notnull()].copy()

In [None]:
# Apply the function to every row in the "content" column and output the results into a new column "sentiment_score"
# (calculate_sentiment is called once per review)
clean_df['sentiment_score'] = clean_df['content'].apply(calculate_sentiment)

In [None]:
# sanity check: score one sentence meant to read as positive and one meant to read as negative
good = "I think this app is good, the clothes came right on time"
bad = "The clothes came late, would not recommend"

for sample in (good, bad):
    print(calculate_sentiment(sample))

In [None]:
# daily mean sentiment per app, then its rolling average over time
sen_df = clean_df.loc[:, ['at', 'appId', 'sentiment_score']].copy()

# bucket every review into its calendar day
sen_df['day_month_year'] = pd.to_datetime(sen_df['at']).dt.to_period('D')

sen_daily_mean = (
    sen_df.groupby(['appId', 'day_month_year'])['sentiment_score']
    .mean()
    .reset_index()
)

# smooth with a 14-row (2 week) rolling average per app - include in presentation
rolling_sentiment = sen_daily_mean.groupby('appId').rolling(14)['sentiment_score'].mean()
sen_daily_mean['2wk_avg_sen'] = rolling_sentiment.reset_index(drop=True)



In [None]:

# plot rolling 2 week sentiment, one subplot per app
plot4_df = pd.pivot_table(sen_daily_mean,
                          index='day_month_year', columns='appId', values='2wk_avg_sen')

# carry the last known average forward over days on which an app has no row
plot4_df = plot4_df.ffill()

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 6))

# first three apps go in the left column of subplots, the rest in the right
df1 = plot4_df.iloc[:, 0:3]
df2 = plot4_df.iloc[:, 3:]

plt.style.use('ggplot')
df1.plot(ax=axes[:, 0], subplots=True)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name is installed.
dark_palette_style = 'seaborn-dark-palette'
if dark_palette_style not in plt.style.available:
    dark_palette_style = 'seaborn-v0_8-dark-palette'
plt.style.use(dark_palette_style)
df2.plot(ax=axes[:, 1], subplots=True, kind='line')

fig.supxlabel('Time')
fig.supylabel('2 Week Average Sentiment')

plt.show()
fig.savefig('2wk_avg_sentiment.png')

In [None]:
# could further analyze with top2vec, topic modeling and analysis