In [1]:
import pandas as pd
import numpy as np
import gensim
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import wordcloud
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.read_csv('.../clean_text.csv')
df

Unnamed: 0,voted_up,clean_token,clean_str
0,True,"['fall', 'guy', 'chill', 'fun', 'game', 'reall...",fall guy chill fun game really need server imp...
1,True,"['give', 'little', 'hug', 'player', 'stare', '...",give little hug player stare back second jump ...
2,True,"['reccomend', 'much', 'buy']",reccomend much buy
3,True,"['hug', 'people']",hug people
4,True,"['little', 'boy', 'like', 'see', 'wipeout', 't...",little boy like see wipeout tv always dream ta...
...,...,...,...
13201,True,"['well', 'make', 'game', 'get', 'enough', 'man...",well make game get enough many way differentia...
13202,True,"['fun', 'competitive', 'especially', 'friend',...",fun competitive especially friend recommend bu...
13203,True,"['game', 'extremely', 'fun', 'nice', 'break', ...",game extremely fun nice break usual drop loot ...
13204,True,"['fun', 'play', 'keep', 'work', 'game']",fun play keep work game


In [3]:
df_no = df[df['voted_up'] == False]

In [4]:
df_no.reset_index(inplace = True, drop = True)

In [5]:
df_no

Unnamed: 0,voted_up,clean_token,clean_str
0,False,"['hacker', 'destroy', 'game', 'use', 'hack', '...",hacker destroy game use hack final matchill ch...
1,False,"['hacking', 'ruin', 'game', 'recommend', 'repo...",hacking ruin game recommend reporting antichea...
2,False,"['update', 'dev', 'address', 'cheat', 'issue',...",update dev address cheat issue finally add eac...
3,False,"['game', 'full', 'hacker', 'need', 'report', '...",game full hacker need report system something
4,False,"['much', 'enjoy', 'game', 'rampant', 'hacker',...",much enjoy game rampant hacker discord remove ...
...,...,...,...
3417,False,"['really', 'fun', 'first', 'time', 'play', 'ge...",really fun first time play get redundant annoy...
3418,False,"['start', 'game']",start game
3419,False,"['buy', 'game', 'day', 'release', 'wait', 'rel...",buy game day release wait release whole hour t...
3420,False,"['really', 'fun', 'start', 'griefer', 'hacker'...",really fun start griefer hacker infest game pl...


In [7]:
df_no.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 3422 entries, 0 to 3421
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   voted_up     3422 non-null   bool  
 1   clean_token  3422 non-null   object
 2   clean_str    3408 non-null   object
dtypes: bool(1), object(2)
memory usage: 56.9+ KB


In [8]:
df.isnull().sum()

voted_up         0
clean_token      0
clean_str      121
dtype: int64

In [9]:
df[df.isna().any(axis=1)]

Unnamed: 0,voted_up,clean_token,clean_str
233,True,[],
251,True,[],
371,True,[],
465,True,[],
519,False,[],
...,...,...,...
12966,True,[],
13112,True,[],
13143,True,[],
13147,False,[],


In [12]:
df_no.dropna(inplace= True)

In [13]:
df_no.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 3408 entries, 0 to 3421
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   voted_up     3408 non-null   bool  
 1   clean_token  3408 non-null   object
 2   clean_str    3408 non-null   object
dtypes: bool(1), object(2)
memory usage: 83.2+ KB


In [14]:
df_no.clean_str=df_no.clean_str.astype(str)

In [15]:
vectorizer = CountVectorizer(analyzer='word',  
                             min_df=3,   
                             stop_words=['game'],
                             ngram_range=(2,2),                           
                             lowercase=True,                  
                             token_pattern='[a-zA-Z0-9]{3,}', 
                             max_features=2000,           
                            )

data_vectorized = vectorizer.fit_transform(df_no['clean_str'])

lda_model = LatentDirichletAllocation(n_components= 6,
                                      learning_method='online',
                                      random_state=0,       
                                      n_jobs = -1
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

In [16]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=8):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=8)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7
Topic 0,anti cheat,really fun,way report,fly around,worth price,chance win,full cheater,change review
Topic 1,many cheater,cheater cheater,every match,almost every,report system,even get,positive review,get refund
Topic 2,final round,tail tag,remove team,get rid,fun first,connection issue,seem like,team base
Topic 3,fall guy,feel like,play friend,battle royale,current state,fun play,fun friend,server issue
Topic 4,many hacker,full hacker,hacker every,waste money,cheater every,every single,report hacker,would recommend
Topic 5,server issue,fun get,get repetitive,fall mountain,get boring,play hour,cheater ruin,get eliminate


In [17]:
Topics_theme = ['will change review if there is anti cheat', 'need cheater report system', 'remove team game', 'fun to play but server issue', 'hackers ruin the game', 'stages get repetitive and boring']
df_topic_keywords['topic_theme'] = Topics_theme

In [18]:
df_topic_keywords.set_index('topic_theme', inplace=True)

In [19]:
df_topic_keywords.T

topic_theme,will change review if there is anti cheat,need cheater report system,remove team game,fun to play but server issue,hackers ruin the game,stages get repetitive and boring
Word 0,anti cheat,many cheater,final round,fall guy,many hacker,server issue
Word 1,really fun,cheater cheater,tail tag,feel like,full hacker,fun get
Word 2,way report,every match,remove team,play friend,hacker every,get repetitive
Word 3,fly around,almost every,get rid,battle royale,waste money,fall mountain
Word 4,worth price,report system,fun first,current state,cheater every,get boring
Word 5,chance win,even get,connection issue,fun play,every single,play hour
Word 6,full cheater,positive review,seem like,fun friend,report hacker,cheater ruin
Word 7,change review,get refund,team base,server issue,would recommend,get eliminate


In [20]:
lda_output = lda_model.transform(data_vectorized)

topicnames = df_topic_keywords.T.columns

docnames = ["Doc" + str(i) for i in range(len(df_no))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [21]:
df_document_topic.reset_index(inplace=True)
df_sent_topic= pd.merge(df_no, df_document_topic, left_index=True, right_index=True)
df_sent_topic.drop('index', axis=1, inplace=True)

In [22]:
df_sent_topic

Unnamed: 0,voted_up,clean_token,clean_str,will change review if there is anti cheat,need cheater report system,remove team game,fun to play but server issue,hackers ruin the game,stages get repetitive and boring,dominant_topic
0,False,"['hacker', 'destroy', 'game', 'use', 'hack', '...",hacker destroy game use hack final matchill ch...,0.37,0.02,0.02,0.02,0.54,0.02,4
1,False,"['hacking', 'ruin', 'game', 'recommend', 'repo...",hacking ruin game recommend reporting antichea...,0.04,0.04,0.79,0.04,0.04,0.04,2
2,False,"['update', 'dev', 'address', 'cheat', 'issue',...",update dev address cheat issue finally add eac...,0.78,0.09,0.12,0.00,0.00,0.00,0
3,False,"['game', 'full', 'hacker', 'need', 'report', '...",game full hacker need report system something,0.29,0.29,0.04,0.04,0.29,0.04,0
4,False,"['much', 'enjoy', 'game', 'rampant', 'hacker',...",much enjoy game rampant hacker discord remove ...,0.00,0.00,0.15,0.00,0.83,0.00,4
...,...,...,...,...,...,...,...,...,...,...
3402,False,"['already', 'microtransaction', 'game', 'alrea...",already microtransaction game already pay stil...,0.17,0.17,0.17,0.17,0.17,0.17,0
3404,False,"['fun', 'game', 'short', 'burst', 'right', 'mi...",fun game short burst right minigame selection ...,0.17,0.17,0.17,0.17,0.17,0.17,0
3405,False,"['fun', 'first', 'couple', 'hour', 'little', '...",fun first couple hour little skill involve muc...,0.01,0.05,0.11,0.36,0.11,0.36,3
3406,False,"['go', 'keep', 'disliked', 'til', 'tail', 'gam...",go keep disliked til tail game go,0.35,0.01,0.10,0.01,0.01,0.51,5


In [23]:
df_topic_theme = df_sent_topic[['clean_str', 'dominant_topic']]

In [24]:
df_topic_theme.head(10)

Unnamed: 0,clean_str,dominant_topic
0,hacker destroy game use hack final matchill ch...,4
1,hacking ruin game recommend reporting antichea...,2
2,update dev address cheat issue finally add eac...,0
3,game full hacker need report system something,0
4,much enjoy game rampant hacker discord remove ...,4
5,ill start say far bad game definitely potentia...,2
6,many hacker ruin silly game take one batch rui...,5
7,game appear super fun laugh play friend ton fu...,0
8,get fly hacker casual chill game xd,3
9,fun game enough content start price dollar sta...,5


In [25]:
topic_remap = {0: 'will change review if there is anti cheat', 1: 'need cheater report system', 2: 'remove team game', 3: 'fun to play but server issue', 4: 'hackers ruin the game', 5: 'stages get repetitive and boring'}

In [26]:
df_topic_theme['dominant_topic_theme'] = df_topic_theme['dominant_topic'].map(topic_remap)

In [27]:
df_topic_theme

Unnamed: 0,clean_str,dominant_topic,dominant_topic_theme
0,hacker destroy game use hack final matchill ch...,4,hackers ruin the game
1,hacking ruin game recommend reporting antichea...,2,remove team game
2,update dev address cheat issue finally add eac...,0,will change review if there is anti cheat
3,game full hacker need report system something,0,will change review if there is anti cheat
4,much enjoy game rampant hacker discord remove ...,4,hackers ruin the game
...,...,...,...
3402,already microtransaction game already pay stil...,0,will change review if there is anti cheat
3404,fun game short burst right minigame selection ...,0,will change review if there is anti cheat
3405,fun first couple hour little skill involve muc...,3,fun to play but server issue
3406,go keep disliked til tail game go,5,stages get repetitive and boring


In [28]:
df_topic_theme['dominant_topic_theme'].value_counts()

will change review if there is anti cheat    1091
fun to play but server issue                  556
hackers ruin the game                         468
need cheater report system                    459
remove team game                              426
stages get repetitive and boring              395
Name: dominant_topic_theme, dtype: int64

In [29]:
df_result = df_topic_theme['dominant_topic_theme'].value_counts().rename_axis('topic').reset_index(name='counts')

In [30]:
df_result

Unnamed: 0,topic,counts
0,will change review if there is anti cheat,1091
1,fun to play but server issue,556
2,hackers ruin the game,468
3,need cheater report system,459
4,remove team game,426
5,stages get repetitive and boring,395


In [31]:
fig = go.Figure([go.Bar(x=df_result['topic'], y=df_result['counts'])])
fig.update_layout(title=go.layout.Title(text="Topics in Recommended Reviews"))
fig.show()