In [1]:
import pandas as pd
import numpy as np
import gensim
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import wordcloud
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.read_csv('.../clean_text.csv')
df

Unnamed: 0,voted_up,clean_token,clean_str
0,True,"['fall', 'guy', 'chill', 'fun', 'game', 'reall...",fall guy chill fun game really need server imp...
1,True,"['give', 'little', 'hug', 'player', 'stare', '...",give little hug player stare back second jump ...
2,True,"['reccomend', 'much', 'buy']",reccomend much buy
3,True,"['hug', 'people']",hug people
4,True,"['little', 'boy', 'like', 'see', 'wipeout', 't...",little boy like see wipeout tv always dream ta...
...,...,...,...
13201,True,"['well', 'make', 'game', 'get', 'enough', 'man...",well make game get enough many way differentia...
13202,True,"['fun', 'competitive', 'especially', 'friend',...",fun competitive especially friend recommend bu...
13203,True,"['game', 'extremely', 'fun', 'nice', 'break', ...",game extremely fun nice break usual drop loot ...
13204,True,"['fun', 'play', 'keep', 'work', 'game']",fun play keep work game


In [3]:
df_yes = df[df['voted_up'] == True]

In [4]:
df_yes.reset_index(inplace = True, drop = True)

In [5]:
df_yes

Unnamed: 0,voted_up,clean_token,clean_str
0,True,"['fall', 'guy', 'chill', 'fun', 'game', 'reall...",fall guy chill fun game really need server imp...
1,True,"['give', 'little', 'hug', 'player', 'stare', '...",give little hug player stare back second jump ...
2,True,"['reccomend', 'much', 'buy']",reccomend much buy
3,True,"['hug', 'people']",hug people
4,True,"['little', 'boy', 'like', 'see', 'wipeout', 't...",little boy like see wipeout tv always dream ta...
...,...,...,...
9779,True,"['well', 'make', 'game', 'get', 'enough', 'man...",well make game get enough many way differentia...
9780,True,"['fun', 'competitive', 'especially', 'friend',...",fun competitive especially friend recommend bu...
9781,True,"['game', 'extremely', 'fun', 'nice', 'break', ...",game extremely fun nice break usual drop loot ...
9782,True,"['fun', 'play', 'keep', 'work', 'game']",fun play keep work game


In [104]:
df_yes.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 9784 entries, 0 to 9783
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   voted_up     9784 non-null   bool  
 1   clean_token  9784 non-null   object
 2   clean_str    9784 non-null   object
dtypes: bool(1), object(2)
memory usage: 162.6+ KB


In [6]:
df_yes.clean_str=df_yes.clean_str.astype(str)

In [109]:
vectorizer = CountVectorizer(analyzer='word',  
                             min_df=3,   
                             stop_words=['game', 'fun', 'good'],
                             ngram_range=(2,2),                           
                             lowercase=True,                  
                             token_pattern='[a-zA-Z0-9]{3,}', 
                             max_features=5000,           
                            )

data_vectorized = vectorizer.fit_transform(df_yes['clean_str'])

lda_model = LatentDirichletAllocation(n_components= 6, 
                                      learning_method='online',
                                      random_state= 0,       
                                      n_jobs = -1 
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

In [110]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=8):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=8)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7
Topic 0,server issue,negative review,launch day,yellow team,recommend anyone,review bomb,want play,wait see
Topic 1,play friend,would recommend,fall guy,battle royale,battle royal,great play,get eliminate,even well
Topic 2,battle royale,takeshis castle,mario party,jelly bean,friend play,feel like,obstacle course,get knock
Topic 3,big big,feel like,tail tag,final round,fall mountain,fall ball,slime climb,many people
Topic 4,fall guy,long time,battle royale,battle pass,get win,new map,play long,play hour
Topic 5,play friend,run run,guy fall,highly recommend,server issue,ever play,definitely worth,really play


In [122]:
Topics_theme = ['server issue on launch day', 'recommend to play with friend', 'similar with other games and tv show', 'like some of the stages', 'need new map and battle pass', 'enjoy the game with friends but server issue']
df_topic_keywords['topic_theme'] = Topics_theme

In [123]:
df_topic_keywords.set_index('topic_theme', inplace=True)

In [124]:
df_topic_keywords.T

topic_theme,server issue on launch day,recommend to play with friend,similar with other games and tv show,like some of the stages,need new map and battle pass,enjoy the game with friends but server issue
Word 0,server issue,play friend,battle royale,big big,fall guy,play friend
Word 1,negative review,would recommend,takeshis castle,feel like,long time,run run
Word 2,launch day,fall guy,mario party,tail tag,battle royale,guy fall
Word 3,yellow team,battle royale,jelly bean,final round,battle pass,highly recommend
Word 4,recommend anyone,battle royal,friend play,fall mountain,get win,server issue
Word 5,review bomb,great play,feel like,fall ball,new map,ever play
Word 6,want play,get eliminate,obstacle course,slime climb,play long,definitely worth
Word 7,wait see,even well,get knock,many people,play hour,really play


In [125]:
lda_output = lda_model.transform(data_vectorized)

topicnames = df_topic_keywords.T.columns

docnames = ["Doc" + str(i) for i in range(len(df_yes))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [126]:
df_document_topic.reset_index(inplace=True)
df_sent_topic= pd.merge(df_yes, df_document_topic, left_index=True, right_index=True)
df_sent_topic.drop('index', axis=1, inplace=True)

In [127]:
df_sent_topic

Unnamed: 0,voted_up,clean_token,clean_str,server issue on launch day,recommend to play with friend,similar with other games and tv show,like some of the stages,need new map and battle pass,enjoy the game with friends but server issue,dominant_topic
0,True,"['fall', 'guy', 'chill', 'fun', 'game', 'reall...",fall guy chill fun game really need server imp...,0.01,0.01,0.01,0.01,0.96,0.01,4
1,True,"['give', 'little', 'hug', 'player', 'stare', '...",give little hug player stare back second jump ...,0.03,0.03,0.03,0.03,0.86,0.03,4
2,True,"['reccomend', 'much', 'buy']",reccomend much buy,0.08,0.08,0.58,0.08,0.08,0.08,2
3,True,"['hug', 'people']",hug people,0.58,0.08,0.08,0.08,0.08,0.08,0
4,True,"['little', 'boy', 'like', 'see', 'wipeout', 't...",little boy like see wipeout tv always dream ta...,0.02,0.02,0.90,0.02,0.02,0.02,2
...,...,...,...,...,...,...,...,...,...,...
9779,True,"['well', 'make', 'game', 'get', 'enough', 'man...",well make game get enough many way differentia...,0.02,0.19,0.31,0.02,0.17,0.28,2
9780,True,"['fun', 'competitive', 'especially', 'friend',...",fun competitive especially friend recommend bu...,0.04,0.04,0.04,0.04,0.04,0.79,5
9781,True,"['game', 'extremely', 'fun', 'nice', 'break', ...",game extremely fun nice break usual drop loot ...,0.06,0.39,0.39,0.06,0.06,0.06,1
9782,True,"['fun', 'play', 'keep', 'work', 'game']",fun play keep work game,0.06,0.06,0.39,0.06,0.39,0.06,2


In [128]:
df_topic_theme = df_sent_topic[['clean_str', 'dominant_topic']]

In [129]:
df_topic_theme.head(10)

Unnamed: 0,clean_str,dominant_topic
0,fall guy chill fun game really need server imp...,4
1,give little hug player stare back second jump ...,4
2,reccomend much buy,2
3,hug people,0
4,little boy like see wipeout tv always dream ta...,2
5,nice br fun would fall,1
6,funnychaotic madness player bounce collect pus...,2
7,fun short burst tedious long stretch moment mi...,3
8,see game potential good design game good multi...,5
9,great game fun play need get rid cheater asap ...,1


In [130]:
topic_remap = {0:'server issue on launch day', 1:'recommend to play with friend', 2:'similar with other games and tv show', 3:'like some of the stages', 4:'need new map and battle pass', 5:'enjoy the game with friends but server issue'}

In [131]:
df_topic_theme['dominant_topic_theme'] = df_topic_theme['dominant_topic'].map(topic_remap)

In [132]:
df_topic_theme

Unnamed: 0,clean_str,dominant_topic,dominant_topic_theme
0,fall guy chill fun game really need server imp...,4,need new map and battle pass
1,give little hug player stare back second jump ...,4,need new map and battle pass
2,reccomend much buy,2,similar with other games and tv show
3,hug people,0,server issue on launch day
4,little boy like see wipeout tv always dream ta...,2,similar with other games and tv show
...,...,...,...
9779,well make game get enough many way differentia...,2,similar with other games and tv show
9780,fun competitive especially friend recommend bu...,5,enjoy the game with friends but server issue
9781,game extremely fun nice break usual drop loot ...,1,recommend to play with friend
9782,fun play keep work game,2,similar with other games and tv show


In [133]:
df_topic_theme['dominant_topic_theme'].value_counts()

server issue on launch day                      4716
recommend to play with friend                   1198
similar with other games and tv show            1162
need new map and battle pass                     948
enjoy the game with friends but server issue     908
like some of the stages                          852
Name: dominant_topic_theme, dtype: int64

In [134]:
df_result = df_topic_theme['dominant_topic_theme'].value_counts().rename_axis('topic').reset_index(name='counts')

In [135]:
df_result

Unnamed: 0,topic,counts
0,server issue on launch day,4716
1,recommend to play with friend,1198
2,similar with other games and tv show,1162
3,need new map and battle pass,948
4,enjoy the game with friends but server issue,908
5,like some of the stages,852


In [136]:
fig = go.Figure([go.Bar(x=df_result['topic'], y=df_result['counts'])])
fig.update_layout(title=go.layout.Title(text="Topics in Recommended Reviews"))
fig.show()