In [1]:
import pandas as pd
import numpy as np
import gensim
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import wordcloud
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

In [2]:
# Load the review data from clean_text.csv and display it.
# The rendered frame shows the columns used below: `voted_up`,
# `clean_token` and `clean_str`.
df = pd.read_csv('.../clean_text.csv')
df

Unnamed: 0,voted_up,clean_token,clean_str
0,True,"['fall', 'guy', 'chill', 'fun', 'game', 'reall...",fall guy chill fun game really need server imp...
1,True,"['give', 'little', 'hug', 'player', 'stare', '...",give little hug player stare back second jump ...
2,True,"['reccomend', 'much', 'buy']",reccomend much buy
3,True,"['hug', 'people']",hug people
4,True,"['little', 'boy', 'like', 'see', 'wipeout', 't...",little boy like see wipeout tv always dream ta...
...,...,...,...
13089,True,"['well', 'make', 'game', 'get', 'enough', 'man...",well make game get enough many way differentia...
13090,True,"['fun', 'competitive', 'especially', 'friend',...",fun competitive especially friend recommend bu...
13091,True,"['game', 'extremely', 'fun', 'nice', 'break', ...",game extremely fun nice break usual drop loot ...
13092,True,"['fun', 'play', 'keep', 'work', 'game']",fun play keep work game


In [3]:
# Keep only the reviews whose `voted_up` flag is False.
# `.copy()` makes `df_no` an independent frame rather than a slice of `df`,
# so the later column assignment on it does not raise pandas'
# SettingWithCopyWarning or depend on view-vs-copy behaviour.
df_no = df[df['voted_up'] == False].copy()

In [4]:
# Renumber the rows 0..n-1 after filtering (the old index is discarded).
# Rebinding the name instead of using inplace=True avoids mutating a frame
# that was produced by slicing `df`, and leaves `df_no` as a fresh object.
df_no = df_no.reset_index(drop=True)

In [5]:
# Inspect the filtered, re-indexed frame (rows with voted_up == False).
df_no

Unnamed: 0,voted_up,clean_token,clean_str
0,False,"['hacker', 'destroy', 'game', 'use', 'hack', '...",hacker destroy game use hack final matchill ch...
1,False,"['hacking', 'ruin', 'game', 'recommend', 'repo...",hacking ruin game recommend reporting antichea...
2,False,"['update', 'dev', 'address', 'cheat', 'issue',...",update dev address cheat issue finally add eac...
3,False,"['game', 'full', 'hacker', 'need', 'report', '...",game full hacker need report system something
4,False,"['much', 'enjoy', 'game', 'rampant', 'hacker',...",much enjoy game rampant hacker discord remove ...
...,...,...,...
3321,False,"['really', 'fun', 'first', 'time', 'play', 'ge...",really fun first time play get redundant annoy...
3322,False,"['start', 'game']",start game
3323,False,"['buy', 'game', 'day', 'release', 'wait', 'rel...",buy game day release wait release whole hour t...
3324,False,"['really', 'fun', 'start', 'griefer', 'hacker'...",really fun start griefer hacker infest game pl...


In [6]:
# Guarantee that every `clean_str` entry is a real string before vectorising.
# Missing values are replaced by '' first: a bare astype(str) would turn NaN
# into the literal word 'nan'. `.assign` returns a new frame, so nothing is
# written into a slice of `df` (no SettingWithCopyWarning).
df_no = df_no.assign(clean_str=df_no['clean_str'].fillna('').astype(str))

In [17]:
# Bigram count matrix of the `clean_str` column.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    token_pattern='[a-zA-Z0-9]{3,}',  # tokens: 3+ alphanumeric characters
    stop_words=['game'],              # 'game' is dropped before n-grams are built
    ngram_range=(2, 2),               # bigrams only
    min_df=3,                         # term must occur in at least 3 documents
    max_features=10000,               # cap on vocabulary size
)
data_vectorized = vectorizer.fit_transform(df_no['clean_str'])

# Six-topic LDA fitted on the bigram counts; random_state pins the result.
lda_model = LatentDirichletAllocation(
    n_components=6,
    learning_method='online',
    random_state=0,
    n_jobs=-1,
)
lda_output = lda_model.fit_transform(data_vectorized)

In [18]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or, on older scikit-learn
        releases, ``get_feature_names()``.
    lda_model : fitted topic model
        Must expose ``components_``, iterated here one row per topic, each
        row holding one weight per vocabulary term.
    n_words : int, default 10
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic, terms ordered from highest to lowest weight.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so that argsort's ascending order yields the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Ten highest-weighted bigrams for each topic.
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10)

# Topic - Keywords Dataframe: one row per topic, one column per keyword rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    f'Word {rank}' for rank in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = [
    f'Topic {topic_no}' for topic_no in range(df_topic_keywords.shape[0])
]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,many cheater,cheater cheater,fun play,fix server,completely ruin,way many,anti cheat,positive review,last round,please fix
Topic 1,fall guy,battle royale,tail tag,fun play,remove team,play friend,get repetitive,lot fun,want play,get eliminate
Topic 2,feel like,team mode,fall guy,play friend,anti cheat,waste time,time get,fun friend,team base,current state
Topic 3,many hacker,full hacker,hacker every,almost every,would recommend,cheater ruin,ruin fun,current state,chance win,recommend buy
Topic 4,cheater every,waste money,connection issue,get disconnect,fall ball,seem like,report system,hacker ruin,get well,every second
Topic 5,server issue,feel like,really fun,fun first,every time,way report,full cheater,luck base,may well,worth price


In [19]:
# Hand-written theme label for each topic row, listed in row order
# (first entry labels the first row of df_topic_keywords, and so on).
Topics_theme = [
    'cheaters ruin the game',
    'some stages get repetitive',
    'team mode is fun but a waste of time with cheaters',
    'hackers in almost every stages',
    'server connection issues',
    'cheater reporting system',
]
df_topic_keywords['topic_theme'] = Topics_theme

In [20]:
# Use the theme label as the row index. Rebinding the result instead of
# inplace=True avoids mutating the frame displayed by an earlier cell.
df_topic_keywords = df_topic_keywords.set_index('topic_theme')

In [21]:
# Transposed view: one column per topic theme, one row per keyword rank.
df_topic_keywords.T

topic_theme,cheaters ruin the game,some stages get repetitive,team mode is fun but a waste of time with cheaters,hackers in almost every stages,server connection issues,cheater reporting system
Word 0,many cheater,fall guy,feel like,many hacker,cheater every,server issue
Word 1,cheater cheater,battle royale,team mode,full hacker,waste money,feel like
Word 2,fun play,tail tag,fall guy,hacker every,connection issue,really fun
Word 3,fix server,fun play,play friend,almost every,get disconnect,fun first
Word 4,completely ruin,remove team,anti cheat,would recommend,fall ball,every time
Word 5,way many,play friend,waste time,cheater ruin,seem like,way report
Word 6,anti cheat,get repetitive,time get,ruin fun,report system,full cheater
Word 7,positive review,lot fun,fun friend,current state,hacker ruin,luck base
Word 8,last round,want play,team base,chance win,get well,may well
Word 9,please fix,get eliminate,current state,recommend buy,every second,worth price


In [23]:
# Topic weights for every document: one row per review, one column per topic.
lda_output = lda_model.transform(data_vectorized)

# Column labels are taken from the index of df_topic_keywords.
topicnames = df_topic_keywords.T.columns

# Row labels Doc0, Doc1, ... in the same order as df_no.
docnames = ["Doc" + str(i) for i in range(len(df_no))]

# The values are rounded to two decimals for readability only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Choose the dominant topic from the UNROUNDED weights. Taking the argmax of
# the rounded table could create artificial ties (argmax then returns the
# first of them) or flip the winner when two weights differ by < 0.005.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [24]:
# Attach the topic weights to the reviews by row position.
# reset_index(drop=True) replaces the "Doc<i>" labels with 0..n-1 on a
# temporary copy so both frames share the same positional index. Unlike the
# in-place reset followed by drop('index'), this leaves df_document_topic
# untouched, so the cell can be re-run without leaving a stray `level_0`
# column behind.
df_sent_topic = pd.merge(
    df_no,
    df_document_topic.reset_index(drop=True),
    left_index=True,
    right_index=True,
)

In [25]:
# Inspect the reviews together with their topic weights and dominant topic.
df_sent_topic

Unnamed: 0,voted_up,clean_token,clean_str,cheaters ruin the game,some stages get repetitive,team mode is fun but a waste of time with cheaters,hackers in almost every stages,server connection issues,cheater reporting system,dominant_topic
0,False,"['hacker', 'destroy', 'game', 'use', 'hack', '...",hacker destroy game use hack final matchill ch...,0.02,0.02,0.02,0.02,0.02,0.92,5
1,False,"['hacking', 'ruin', 'game', 'recommend', 'repo...",hacking ruin game recommend reporting antichea...,0.03,0.03,0.03,0.03,0.83,0.03,4
2,False,"['update', 'dev', 'address', 'cheat', 'issue',...",update dev address cheat issue finally add eac...,0.03,0.00,0.00,0.00,0.00,0.97,5
3,False,"['game', 'full', 'hacker', 'need', 'report', '...",game full hacker need report system something,0.03,0.03,0.03,0.83,0.03,0.03,3
4,False,"['much', 'enjoy', 'game', 'rampant', 'hacker',...",much enjoy game rampant hacker discord remove ...,0.00,0.00,0.00,0.00,0.98,0.00,4
...,...,...,...,...,...,...,...,...,...,...
3321,False,"['really', 'fun', 'first', 'time', 'play', 'ge...",really fun first time play get redundant annoy...,0.01,0.38,0.17,0.01,0.15,0.29,1
3322,False,"['start', 'game']",start game,0.17,0.17,0.17,0.17,0.17,0.17,0
3323,False,"['buy', 'game', 'day', 'release', 'wait', 'rel...",buy game day release wait release whole hour t...,0.08,0.45,0.14,0.08,0.01,0.25,1
3324,False,"['really', 'fun', 'start', 'griefer', 'hacker'...",really fun start griefer hacker infest game pl...,0.10,0.01,0.08,0.06,0.22,0.52,5


In [26]:
# Keep just the review text and its dominant topic id.
# `.copy()` detaches the selection from df_sent_topic so that adding a column
# to it later does not raise pandas' SettingWithCopyWarning.
df_topic_theme = df_sent_topic[['clean_str', 'dominant_topic']].copy()

In [27]:
# Preview the first ten reviews with their dominant topic id.
df_topic_theme.head(10)

Unnamed: 0,clean_str,dominant_topic
0,hacker destroy game use hack final matchill ch...,5
1,hacking ruin game recommend reporting antichea...,4
2,update dev address cheat issue finally add eac...,5
3,game full hacker need report system something,3
4,much enjoy game rampant hacker discord remove ...,4
5,ill start say far bad game definitely potentia...,1
6,many hacker ruin silly game take one batch rui...,3
7,game appear super fun laugh play friend ton fu...,0
8,get fly hacker casual chill game xd,0
9,fun game enough content start price dollar sta...,1


In [28]:
# Lookup from dominant-topic id (0-5) to its human-readable theme label.
# enumerate() numbers the labels in the order they are listed here.
topic_remap = dict(enumerate((
    'cheaters ruin the game',
    'some stages get repetitive',
    'team mode is fun but a waste of time with cheaters',
    'hackers in almost every stages',
    'server connection issues',
    'cheater reporting system',
)))

In [29]:
# Translate each dominant topic id into its theme label.
# `.assign` returns a new frame instead of writing a column into what may be
# a slice of df_sent_topic, which avoids pandas' SettingWithCopyWarning.
df_topic_theme = df_topic_theme.assign(
    dominant_topic_theme=df_topic_theme['dominant_topic'].map(topic_remap)
)

In [30]:
# Inspect the reviews with both the topic id and its theme label.
df_topic_theme

Unnamed: 0,clean_str,dominant_topic,dominant_topic_theme
0,hacker destroy game use hack final matchill ch...,5,cheater reporting system
1,hacking ruin game recommend reporting antichea...,4,server connection issues
2,update dev address cheat issue finally add eac...,5,cheater reporting system
3,game full hacker need report system something,3,hackers in almost every stages
4,much enjoy game rampant hacker discord remove ...,4,server connection issues
...,...,...,...
3321,really fun first time play get redundant annoy...,1,some stages get repetitive
3322,start game,0,cheaters ruin the game
3323,buy game day release wait release whole hour t...,1,some stages get repetitive
3324,really fun start griefer hacker infest game pl...,5,cheater reporting system


In [31]:
# Number of reviews per dominant theme, most frequent first.
df_topic_theme['dominant_topic_theme'].value_counts()

cheaters ruin the game                                888
some stages get repetitive                            607
hackers in almost every stages                        550
team mode is fun but a waste of time with cheaters    507
server connection issues                              396
cheater reporting system                              378
Name: dominant_topic_theme, dtype: int64

In [32]:
# Tidy two-column table (`topic`, `counts`) of reviews per dominant theme,
# ready to be handed to the plotting cell.
df_result = (
    df_topic_theme['dominant_topic_theme']
    .value_counts()
    .rename_axis('topic')
    .reset_index(name='counts')
)

In [33]:
# Inspect the per-theme review counts used for the bar chart.
df_result

Unnamed: 0,topic,counts
0,cheaters ruin the game,888
1,some stages get repetitive,607
2,hackers in almost every stages,550
3,team mode is fun but a waste of time with chea...,507
4,server connection issues,396
5,cheater reporting system,378


In [34]:
# Bar chart of how many reviews fall under each dominant topic theme.
# df_result is derived from df_no, i.e. reviews with voted_up == False, so the
# title must say "Not Recommended" (it previously read "Recommended").
# Axis titles are added so the figure can be read on its own.
fig = go.Figure([go.Bar(x=df_result['topic'], y=df_result['counts'])])
fig.update_layout(
    title=go.layout.Title(text="Topics in Not Recommended Reviews"),
    xaxis_title="Topic",
    yaxis_title="Number of reviews",
)
fig.show()