In [None]:
!pip install pyldavis pandas==1.5.1 ftfy cleantext wordcloud bertopic umap

In [2]:
import warnings

warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import re
from wordcloud import WordCloud
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora

from PIL import Image
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models
from ftfy import fix_encoding
from cleantext import clean
#from bertopic import BERTopic
#from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
stop_words = stopwords.words('english')
addn_stopwords = ['app','really','snapchat', 'google', 'assistant','please','dark','mode','update','wrong','something',
                  'went','say','ever','android','ios','replika','chatgpt','gpt','snap','filter','also','ai', 'ass']
stop_words.extend(addn_stopwords)

In [4]:
path = "/kaggle/input/ai-companions-app-reviews/reviews.csv"
data = pd.read_csv(path)
data.drop('Unnamed: 0', axis=1, inplace=True)
data['date']  = pd.to_datetime(data['date'])

In [5]:
filtered_data = data[data['rev_length']>=10]
filtered_data['review'] = filtered_data['review'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
filtered_data['review'] = filtered_data['review'].apply(lambda x: x.lower())

In [6]:
positive_revs = filtered_data[filtered_data['feedback']=='positive']
negative_revs = filtered_data[filtered_data['feedback']=='negative']

after_21_revs = filtered_data[filtered_data['date']>'2021']
before_21_revs = filtered_data[filtered_data['date'] <= '2021']

In [7]:
all_words = ' '.join(filtered_data['review'].values)

before_21 = ' '.join(before_21_revs['review'].values)
after_21 = ' '.join(after_21_revs['review'].values)

negative = ' '.join(negative_revs['review'].values)
positive = ' '.join(positive_revs['review'].values)

In [8]:
#LDA
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]

In [9]:
all_words = filtered_data.review.values.tolist()
pos_words = positive_revs.review.values.tolist()
neg_words = negative_revs.review.values.tolist()
after_21_words = after_21_revs.review.values.tolist()
before_21_words = before_21_revs.review.values.tolist()


data_words = remove_stopwords(list(sent_to_words(all_words)))
pos_data_words = remove_stopwords(list(sent_to_words(pos_words)))
neg_data_words = remove_stopwords(list(sent_to_words(neg_words)))
after_21_data_words = remove_stopwords(list(sent_to_words(after_21_words)))
before_21_data_words = remove_stopwords(list(sent_to_words(before_21_words)))

In [10]:
id2_word_all = corpora.Dictionary(data_words)
texts_all = data_words
corpus_all = [id2_word_all.doc2bow(text) for text in texts_all]

id2_word_pos = corpora.Dictionary(pos_data_words)
texts_pos = pos_data_words
corpus_pos = [id2_word_pos.doc2bow(text) for text in texts_pos]

id2_word_neg = corpora.Dictionary(neg_data_words)
texts_neg = neg_data_words
corpus_neg = [id2_word_neg.doc2bow(text) for text in texts_neg]

id2_word_after_21 = corpora.Dictionary(after_21_data_words)
corpus_after_21 = [id2_word_after_21.doc2bow(text) for text in after_21_data_words]

id2_word_before_21 = corpora.Dictionary(before_21_data_words)
corpus_before_21 = [id2_word_before_21.doc2bow(text) for text in before_21_data_words]

In [11]:
def fit_lda(corpus, id2word, num_topics=10):
  lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=num_topics)
  pprint(lda_model.print_topics())
  doc_lda = lda_model[corpus]
  return (lda_model, corpus, id2word)

In [12]:
all_lda = fit_lda(corpus_all, id2_word_all)

[(0,
  '0.031*"like" + 0.016*"voice" + 0.010*"use" + 0.008*"time" + 0.008*"im" + '
  '0.008*"dont" + 0.007*"good" + 0.007*"command" + 0.007*"using" + '
  '0.007*"ask"'),
 (1,
  '0.023*"dont" + 0.016*"know" + 0.015*"hey" + 0.015*"get" + 0.012*"screen" + '
  '0.009*"even" + 0.009*"help" + 0.009*"time" + 0.008*"notifications" + '
  '0.008*"try"'),
 (2,
  '0.016*"voice" + 0.016*"filters" + 0.013*"like" + 0.012*"good" + '
  '0.010*"love" + 0.010*"use" + 0.009*"music" + 0.009*"many" + 0.008*"cant" + '
  '0.007*"still"'),
 (3,
  '0.014*"like" + 0.012*"good" + 0.010*"answer" + 0.010*"great" + 0.009*"time" '
  '+ 0.009*"questions" + 0.009*"would" + 0.009*"helpful" + 0.008*"answers" + '
  '0.008*"ask"'),
 (4,
  '0.029*"phone" + 0.020*"working" + 0.018*"work" + 0.015*"doesnt" + '
  '0.013*"fix" + 0.013*"use" + 0.012*"even" + 0.011*"open" + 0.011*"cant" + '
  '0.009*"works"'),
 (5,
  '0.014*"im" + 0.014*"send" + 0.013*"ive" + 0.011*"problem" + 0.011*"using" + '
  '0.011*"like" + 0.010*"make" + 0.0

In [18]:
pyLDAvis.enable_notebook()
all_viz = pyLDAvis.gensim_models.prepare(all_lda[0], all_lda[1], all_lda[2])



NameError: name 'all_vis' is not defined

In [19]:
pyLDAvis.display(all_viz)

In [14]:
pos_lda = fit_lda(corpus_pos, id2_word_pos)
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(pos_lda[0], pos_lda[1], pos_lda[2])

[(0,
  '0.013*"great" + 0.010*"im" + 0.010*"need" + 0.009*"like" + 0.009*"use" + '
  '0.009*"help" + 0.009*"work" + 0.007*"get" + 0.007*"helpful" + 0.007*"dont"'),
 (1,
  '0.025*"good" + 0.021*"use" + 0.017*"filters" + 0.013*"great" + 0.012*"love" '
  '+ 0.011*"best" + 0.009*"would" + 0.008*"like" + 0.008*"better" + '
  '0.008*"voice"'),
 (2,
  '0.027*"good" + 0.022*"filters" + 0.022*"like" + 0.021*"love" + '
  '0.011*"great" + 0.011*"best" + 0.010*"thank" + 0.009*"nice" + '
  '0.009*"application" + 0.009*"make"'),
 (3,
  '0.019*"like" + 0.017*"great" + 0.016*"add" + 0.015*"would" + 0.015*"good" + '
  '0.014*"filters" + 0.010*"games" + 0.010*"option" + 0.010*"one" + '
  '0.010*"video"'),
 (4,
  '0.018*"snaps" + 0.016*"like" + 0.016*"good" + 0.015*"chat" + 0.014*"voice" '
  '+ 0.011*"cant" + 0.009*"great" + 0.009*"best" + 0.008*"would" + '
  '0.007*"work"'),
 (5,
  '0.047*"like" + 0.021*"love" + 0.018*"friends" + 0.018*"good" + 0.016*"talk" '
  '+ 0.011*"great" + 0.011*"best" + 0.009*"d

In [15]:
neg_lda = fit_lda(corpus_neg, id2_word_neg)
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(neg_lda[0], neg_lda[1], neg_lda[2])

[(0,
  '0.017*"dont" + 0.015*"want" + 0.013*"like" + 0.012*"good" + 0.012*"bitmoji" '
  '+ 0.009*"every" + 0.009*"get" + 0.008*"fix" + 0.008*"im" + '
  '0.008*"annoying"'),
 (1,
  '0.013*"answer" + 0.012*"ask" + 0.012*"like" + 0.010*"language" + '
  '0.010*"get" + 0.009*"im" + 0.009*"questions" + 0.008*"said" + 0.008*"ok" + '
  '0.008*"doesnt"'),
 (2,
  '0.017*"like" + 0.013*"dont" + 0.012*"cant" + 0.011*"use" + 0.010*"back" + '
  '0.010*"new" + 0.010*"camera" + 0.008*"change" + 0.008*"voice" + '
  '0.008*"feature"'),
 (3,
  '0.013*"phone" + 0.011*"time" + 0.009*"data" + 0.007*"tried" + 0.007*"cache" '
  '+ 0.007*"ask" + 0.007*"assistance" + 0.007*"command" + 0.007*"uninstalled" '
  '+ 0.006*"still"'),
 (4,
  '0.014*"voice" + 0.013*"use" + 0.013*"send" + 0.013*"fix" + 0.011*"im" + '
  '0.011*"doesnt" + 0.011*"working" + 0.011*"problem" + 0.010*"phone" + '
  '0.010*"good"'),
 (5,
  '0.011*"dont" + 0.009*"like" + 0.008*"without" + 0.008*"make" + '
  '0.008*"would" + 0.008*"im" + 0.008*"w

In [16]:
after_21_lda = fit_lda(corpus_after_21, id2_word_after_21)
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(after_21_lda[0], after_21_lda[1], after_21_lda[2])

[(0,
  '0.019*"doesnt" + 0.016*"cant" + 0.014*"work" + 0.012*"dont" + 0.011*"text" '
  '+ 0.011*"use" + 0.010*"voice" + 0.009*"like" + 0.008*"works" + 0.008*"get"'),
 (1,
  '0.028*"like" + 0.015*"would" + 0.014*"good" + 0.014*"want" + 0.012*"dont" + '
  '0.012*"talk" + 0.011*"nice" + 0.008*"much" + 0.008*"get" + 0.008*"friend"'),
 (2,
  '0.030*"best" + 0.021*"good" + 0.018*"like" + 0.011*"great" + 0.008*"love" + '
  '0.007*"information" + 0.007*"language" + 0.006*"helpful" + 0.006*"one" + '
  '0.006*"useful"'),
 (3,
  '0.024*"like" + 0.022*"love" + 0.012*"good" + 0.011*"would" + 0.010*"use" + '
  '0.010*"new" + 0.009*"great" + 0.009*"play" + 0.009*"search" + 0.009*"make"'),
 (4,
  '0.030*"phone" + 0.015*"working" + 0.015*"cant" + 0.013*"even" + '
  '0.013*"dont" + 0.011*"work" + 0.010*"like" + 0.010*"use" + 0.009*"open" + '
  '0.008*"get"'),
 (5,
  '0.018*"im" + 0.018*"problem" + 0.014*"fix" + 0.010*"hey" + 0.009*"cant" + '
  '0.009*"help" + 0.009*"issue" + 0.009*"get" + 0.008*"voice" 

In [17]:
before_21_lda = fit_lda(corpus_before_21, id2_word_before_21)
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(before_21_lda[0], before_21_lda[1], before_21_lda[2])

[(0,
  '0.019*"phone" + 0.019*"cant" + 0.013*"fix" + 0.010*"new" + 0.010*"even" + '
  '0.008*"problem" + 0.008*"working" + 0.008*"camera" + 0.008*"ive" + '
  '0.008*"im"'),
 (1,
  '0.030*"like" + 0.016*"love" + 0.015*"good" + 0.014*"best" + 0.011*"would" + '
  '0.011*"great" + 0.010*"get" + 0.009*"time" + 0.009*"one" + 0.008*"friend"'),
 (2,
  '0.026*"filters" + 0.023*"good" + 0.019*"love" + 0.017*"like" + 0.012*"get" '
  '+ 0.011*"great" + 0.009*"friends" + 0.009*"dont" + 0.009*"talk" + '
  '0.008*"use"'),
 (3,
  '0.014*"dont" + 0.012*"like" + 0.010*"one" + 0.010*"good" + 0.010*"time" + '
  '0.009*"friends" + 0.008*"even" + 0.008*"change" + 0.008*"new" + '
  '0.008*"great"'),
 (4,
  '0.015*"send" + 0.015*"doesnt" + 0.014*"fix" + 0.014*"good" + 0.010*"like" + '
  '0.010*"video" + 0.010*"great" + 0.010*"time" + 0.009*"work" + 0.008*"cant"'),
 (5,
  '0.030*"use" + 0.017*"dont" + 0.013*"im" + 0.013*"good" + 0.011*"like" + '
  '0.008*"phone" + 0.008*"work" + 0.007*"love" + 0.007*"great" + 