In [1]:
import pandas as pd
file_path = r'app_reviews_with_time_and_version_clue.csv'
df = pd.read_csv(file_path)

In [2]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,a32b525d-f8d4-4f68-afff-9cdd4d7de8cb,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I can't access analysis to reset my cycle leng...,1,0,164.0,2024-10-20 23:05:36,"Hey, thanks for getting in touch. We’re sorry ...",2024-10-21 07:08:38,164.0
1,99758246-d266-4d43-bd33-9f40a62fb7a7,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,this app is great but it's just really annoyin...,3,0,176.0,2024-10-20 21:54:46,"Hey, thanks for your review. We introduced Clu...",2024-10-21 07:06:28,176.0
2,8b99d9bc-f26c-45e0-933b-0c1918f27050,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,It requires money,1,0,,2024-10-20 18:52:05,"Hey, thanks for your review. We introduced Clu...",2024-10-21 07:06:26,
3,a2cf1bea-3067-4c42-bb8e-9a72ce280af7,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Worst app don't use,1,0,177.1,2024-10-20 14:45:12,"Hey, thanks for your review. We’re sorry to he...",2024-10-21 07:06:57,177.1
4,a4b92428-fc5f-4e6d-be94-32c24c06276d,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"It's a good app, but I really don't like that ...",1,0,,2024-10-20 13:39:17,"Hello, we understand you’re disappointed about...",2024-10-21 07:05:14,


In [3]:
# session 2--- Drop rows with missing 'content' as they are not helpful for review analysis
# preprocess the texts also
df.dropna(subset=['content'], inplace=True)
# Remove unnecessary columns
df.drop(['userImage'], axis=1, inplace=True)

In [4]:
# Standardize text data in 'content'
df['content'] = df['content'].str.lower().str.strip()
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['content'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Remove punctuation
import string
df['content'] = df['content'].str.replace('[{}]'.format(string.punctuation), '')
df['content'] = df['content'].astype(str).str.replace(r"[^\w\s']", '', regex=True)
# Remove numbers
df['content'] = df['content'].str.replace('\d+', '')

In [6]:
# Remove mentions
df['content'] = df['content'].replace(r'@\w+', '', regex=True)

In [7]:
# Remove hashtags
df['content'] = df['content'].replace(r'#\w+', '', regex=True)

In [8]:
#lemmitization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['content'] = df['content'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,a32b525d-f8d4-4f68-afff-9cdd4d7de8cb,A Google user,can't access analysis reset cycle length annoy...,1,0,164.0,2024-10-20 23:05:36,"Hey, thanks for getting in touch. We’re sorry ...",2024-10-21 07:08:38,164.0
1,99758246-d266-4d43-bd33-9f40a62fb7a7,A Google user,app great really annoying half thing app acces...,3,0,176.0,2024-10-20 21:54:46,"Hey, thanks for your review. We introduced Clu...",2024-10-21 07:06:28,176.0
2,8b99d9bc-f26c-45e0-933b-0c1918f27050,A Google user,requires money,1,0,,2024-10-20 18:52:05,"Hey, thanks for your review. We introduced Clu...",2024-10-21 07:06:26,
3,a2cf1bea-3067-4c42-bb8e-9a72ce280af7,A Google user,worst app use,1,0,177.1,2024-10-20 14:45:12,"Hey, thanks for your review. We’re sorry to he...",2024-10-21 07:06:57,177.1
4,a4b92428-fc5f-4e6d-be94-32c24c06276d,A Google user,good app really like every single time opened ...,1,0,,2024-10-20 13:39:17,"Hello, we understand you’re disappointed about...",2024-10-21 07:05:14,


In [10]:
# First count vectorizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words (BoW) model
# we use  it with filtered data with low scores in reviews aka 1 and 2 only
# 1. use text from reviews where score is 1 or 2:
filtered_df = df[df['score'].isin([1, 2])]
count_vectorizer = CountVectorizer(max_df=0.40, min_df=2, stop_words='english')
count_matrix = count_vectorizer.fit_transform(filtered_df['content'])

In [11]:
# Get total counts of each word
word_counts = count_matrix.sum(axis=0).A1
word_features = count_vectorizer.get_feature_names_out()

In [12]:
# Create a DataFrame for visualization
bow_df = pd.DataFrame({'word': word_features, 'count': word_counts}).sort_values(by='count', ascending=False)
print(bow_df.head(20))

          word  count
3746    update   2047
2536    period   1858
3775      used   1716
3952      year   1636
3773       use   1391
877       data   1250
2310       new   1226
883        day   1222
656       clue   1209
3784     using   1187
3583     track   1181
3549      time   1036
864      cycle   1017
3588  tracking   1000
2025      like    985
1347   feature    901
2818    really    823
2412      open    702
2424    option    644
2510       pay    633


In [13]:
# then TF IDF
#### Try changing min_df and max_df
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_df['content'])


In [14]:
# Get mean TF-IDF scores for each word
tfidf_means = tfidf_matrix.mean(axis=0).A1
tfidf_features = tfidf_vectorizer.get_feature_names_out()

In [15]:
# Create a DataFrame for TF-IDF visualization
tfidf_df = pd.DataFrame({'word': tfidf_features, 'tfidf': tfidf_means}).sort_values(by='tfidf', ascending=False)
print(tfidf_df.head(20))

          word     tfidf
271        app  0.068780
3747    update  0.035284
2537    period  0.032009
3776      used  0.028352
3953      year  0.026642
3774       use  0.026293
878       data  0.024252
884        day  0.023656
2311       new  0.023608
2413      open  0.022040
3584     track  0.021719
3785     using  0.020972
657       clue  0.020823
2026      like  0.020086
3550      time  0.019949
865      cycle  0.019143
3589  tracking  0.018383
2819    really  0.017743
1348   feature  0.017001
1534      good  0.016107


In [16]:
!pip uninstall numpy -y
!pip install numpy

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.4


In [17]:
!pip uninstall gensim -y
!pip install gensim

Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy, gensim
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
Successfully installed gensim-4.3.3 numpy-1.26.4


In [18]:
import gensim
from gensim.models import Word2Vec

In [19]:
#embedddings usage instead of count

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

In [20]:
# Assuming filtered_df['content'] contains your text data

tfidf_vectorizer = TfidfVectorizer(max_df=0.50, min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_df['content'])

In [21]:
# 1. Get high-frequency words from TF-IDF:

feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1  # Sum TF-IDF scores for each word

In [22]:
# Create a DataFrame to store words and their TF-IDF scores:
tfidf_df = pd.DataFrame({'word': feature_names, 'tfidf': tfidf_scores})

In [23]:
# Get the top N high-frequency words (adjust N as needed):
top_n = 20
top_words_tfidf = tfidf_df.nlargest(top_n, 'tfidf')['word'].tolist()

print(f"Top {top_n} words (TF-IDF): {top_words_tfidf}")

Top 20 words (TF-IDF): ['update', 'period', 'used', 'year', 'use', 'data', 'day', 'new', 'open', 'track', 'using', 'clue', 'like', 'time', 'cycle', 'tracking', 'really', 'feature', 'good', 'pay']


In [24]:
# 2. Train Word2Vec model:

nltk.download('punkt')  # Download tokenizer data if not already downloaded
nltk.download('punkt_tab')
tokenized_sentences = filtered_df['content'].apply(lambda x: word_tokenize(str(x).lower()))

word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2, workers=4) #adjust parameters as needed.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [25]:
# 3. Get word embeddings for high-frequency words:

word_embeddings = {}
for word in top_words_tfidf:
    try:
        word_embeddings[word] = word2vec_model.wv[word]
    except KeyError:
        print(f"Word '{word}' not found in Word2Vec vocabulary.")

In [26]:
# 4. Example: Find similar words using embeddings
if word_embeddings:
  example_word = top_words_tfidf[18] #Take the first word from the top words
  try:
    similar_words = word2vec_model.wv.most_similar(example_word, topn=5)
    print(f"\nWords similar to '{example_word}': {similar_words}")
  except KeyError:
    print(f"Word '{example_word}' not found in Word2Vec vocabulary.")


Words similar to 'good': [('liked', 0.9940689206123352), ('disappointed', 0.9915490746498108), ('really', 0.990898072719574), ('clean', 0.9905880093574524), ('rendered', 0.9882809519767761)]


In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # For VADER
nltk.download('vader_lexicon')
nltk.download('punkt')
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
def analyze_sentiment_vader(text):
    scores = analyzer.polarity_scores(text)
    return scores  # Returns a dictionary of scores

In [29]:
df['vader_sentiment'] = df['content'].apply(analyze_sentiment_vader)
#Expand the dictionary in separate columns
df = pd.concat([df, df['vader_sentiment'].apply(pd.Series)], axis=1)
df.drop(columns=['vader_sentiment'], inplace=True)

In [30]:
#Interpret the results:
def interpret_sentiment(compound):
    # Convert compound to float to handle potential non-numeric values
    try:
        compound = float(compound)  # Try converting to float
    except (TypeError, ValueError):  # Catch both TypeError and ValueError
        # Handle non-numeric or empty values
        return "Neutral"

    if compound >= 0.05:
        return "Positive"
    elif compound <= -0.05:
        return "Negative"
    else:
        return "Neutral"

In [31]:
# Reset the index before applying the function
df = df.reset_index(drop=True)  # Reset the index and drop the old index

df['vader_interpretation'] = df['compound'].apply(interpret_sentiment)

print("VADER Sentiment Analysis:")
print(df.head())

VADER Sentiment Analysis:
                               reviewId       userName  \
0  a32b525d-f8d4-4f68-afff-9cdd4d7de8cb  A Google user   
1  99758246-d266-4d43-bd33-9f40a62fb7a7  A Google user   
2  8b99d9bc-f26c-45e0-933b-0c1918f27050  A Google user   
3  a2cf1bea-3067-4c42-bb8e-9a72ce280af7  A Google user   
4  a4b92428-fc5f-4e6d-be94-32c24c06276d  A Google user   

                                             content  score  thumbsUpCount  \
0  can't access analysis reset cycle length annoy...      1              0   
1  app great really annoying half thing app acces...      3              0   
2                                     requires money      1              0   
3                                      worst app use      1              0   
4  good app really like every single time opened ...      1              0   

  reviewCreatedVersion                   at  \
0                164.0  2024-10-20 23:05:36   
1                176.0  2024-10-20 21:54:46   
2              

In [33]:
#classifier based sentiment analysis

twitter_df = pd.read_csv("twitter_sentiment.csv", encoding='ISO-8859-1')
twitter_df.head()
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    cleaned_tokens = [token for token in tokens if token.isalnum()]  # Remove punctuation and non-alphanumeric characters
    return cleaned_tokens

In [34]:
# Handle NaN values before applying the function
twitter_df['tokens'] = twitter_df['text'].fillna('').apply(preprocess_text)
twitter_df = twitter_df.rename(columns={'Sentiment': 'sentiment'})
# Combine the dataframes
twitter_df.shape
twitter_df.head()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [35]:
# 3. Feature Extraction (TF-IDF)
tfidf = TfidfVectorizer()
twitter_df['text'] = twitter_df['text'].fillna('')  # Fill NaN values with empty string
X = tfidf.fit_transform(twitter_df['text'])
y = twitter_df['sentiment']

In [38]:
# 4. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# 5. Train the SVM classifier
svm_classifier = SVC(kernel='linear')  # You can experiment with different kernels
svm_classifier.fit(X_train, y_train)

In [41]:
# 6. Make predictions and evaluate
y_pred = svm_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.76      0.61      0.67      1562
     neutral       0.64      0.79      0.71      2230
    positive       0.81      0.70      0.75      1705

    accuracy                           0.71      5497
   macro avg       0.73      0.70      0.71      5497
weighted avg       0.73      0.71      0.71      5497



In [42]:
# 7. Sentiment analysis function (using the trained SVM)
def analyze_sentiment_svm(text):
    text_vectorized = tfidf.transform([text])  # Transform the single text
    prediction = svm_classifier.predict(text_vectorized)[0]  # Get the prediction
    return prediction

In [44]:
# Apply the function to the 'content' column
df['svm_sentiment'] = df['content'].apply(analyze_sentiment_svm)
df.to_excel("sentiment_analysis_results.xlsx", index=False)  # index=False to exclude row numbers

In [45]:
from google.colab import files
files.download("sentiment_analysis_results.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [46]:
####LDA topic modelling
#LDA for each score
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import string
!pip install pyLDAvis
import pyLDAvis
import matplotlib.pyplot as plt
import seaborn as sns

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [47]:
# Load NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# Perform topic modeling for each review score
def perform_lda_for_score(score, num_topics=10, num_words=10):
    # Filter the reviews for the given score
    filtered_reviews = df[df['score'] == score]['content'].tolist()

    # Create a document-term matrix
    vectorizer = CountVectorizer(max_df=0.9, min_df=5, stop_words='english')
    X = vectorizer.fit_transform(filtered_reviews)

    # Create a dictionary and corpus for LDA
    dictionary = corpora.Dictionary([review.split() for review in filtered_reviews])
    corpus = [dictionary.doc2bow(review.split()) for review in filtered_reviews]

    # Fit the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

    # Display the top words for each topic
    print(f"\nTop topics for score {score}:")
    for i, topic in lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False):
        words = ', '.join([word[0] for word in topic])
        print(f"Topic {i + 1}: {words}")

In [49]:
# Perform LDA for each score (1, 2, 3, 4, 5)
for score in range(1, 6):
    perform_lda_for_score(score)


Top topics for score 1:
Topic 1: woman, hate, abortion, people, stupid, pregnancy, star, app, 1, pro
Topic 2: app, time, every, keep, day, crashing, using, get, it, used
Topic 3: option, friendly, user, category, pill, track, lot, pm, it, confusing
Topic 4: app, used, update, new, window, fertile, version, feature, removed, tracking
Topic 5: app, data, update, account, year, lost, new, can't, even, phone
Topic 6: app, pay, free, used, ad, feature, year, subscription, premium, use
Topic 7: bad, right, good, correctly, sends, flo, copy, link, sync, ovulating
Topic 8: app, period, day, cycle, update, even, track, use, can't, like
Topic 9: app, use, period, year, using, feature, like, get, make, data
Topic 10: open, app, crash, work, it, working, even, time, fix, phone

Top topics for score 2:
Topic 1: app, period, day, track, like, option, cycle, really, data, change
Topic 2: came, know, widget, access, yall, period, 20th, said, updated, good
Topic 3: app, year, it, used, phone, update, 

In [50]:
#visualization
import pyLDAvis.gensim_models
filtered_reviews = df[df['score'] == 1]['content'].tolist()
import gensim
import pyLDAvis

In [51]:
# Create a document-term matrix
vectorizer = CountVectorizer(max_df=0.7, min_df=5, stop_words='english')
X = vectorizer.fit_transform(filtered_reviews)

In [52]:
# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary([review.split() for review in filtered_reviews])
corpus = [dictionary.doc2bow(review.split()) for review in filtered_reviews]

In [53]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=10, random_state=100)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary) # Assuming you have lda_model, corpus, dictionary from your LDA analysis
pyLDAvis.display(vis)

In [54]:
#BERT
!pip install bertopic
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

  axis.set_ylabel('$\lambda$ value')
  """Perform robust single linkage clustering from a vector array


In [60]:
# Define a function for BERT-based topic modeling
def bert_topic_modeling(documents):
    # Initialize the BERTopic model
    umap_model = UMAP(n_components=100, n_neighbors=15, min_dist=0.0) #adjust n_components for umap_reduce_dims
    hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom') #adjust min_cluster_size for hdbscan_min_cluster_size

    topic_model = BERTopic(min_topic_size=3, nr_topics='auto', umap_model=umap_model, hdbscan_model=hdbscan_model, verbose=True)
    # Fit the model to the documents
    topics, probs = topic_model.fit_transform(documents)

    # Visualize the topics
    return topic_model, topics, probs

filtered_reviews = df[df['score'] == 1]['content'].tolist()

topic_model, topics, probs = bert_topic_modeling(filtered_reviews)

topic_dict = topic_model.get_topics()

def print_bert_topics(topic_model):
    """Prints the topics and associated words from a trained BERTopic model."""

    topic_info = topic_model.get_topic_info()
    for topic_id, row in topic_info.iterrows():
        if topic_id == -1:  # Ignore outlier topic
            continue
        topic_words = topic_model.get_topic(topic_id)
        if topic_words:  # Check if topic_words is not None or empty
            words = ", ".join([word[0] for word in topic_words])
            print(f"Topic {row['Topic']}: {words}")

2025-03-18 12:07:05,073 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/147 [00:00<?, ?it/s]

2025-03-18 12:08:48,613 - BERTopic - Embedding - Completed ✓
2025-03-18 12:08:48,616 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-18 12:09:41,303 - BERTopic - Dimensionality - Completed ✓
2025-03-18 12:09:41,308 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-18 12:09:43,555 - BERTopic - Cluster - Completed ✓
2025-03-18 12:09:43,556 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-18 12:09:43,701 - BERTopic - Representation - Completed ✓
2025-03-18 12:09:43,702 - BERTopic - Topic reduction - Reducing number of topics
2025-03-18 12:09:43,829 - BERTopic - Topic reduction - Reduced number of topics from 24 to 24


In [61]:
# Print the topics and words
print_bert_topics(topic_model)

Topic -1: data, account, lost, year, update, app, gone, email, log, new
Topic 0: open, crash, crashing, app, working, keep, even, phone, it, try
Topic 1: period, day, date, cycle, start, last, late, length, said, track
Topic 2: window, fertile, ovulation, fertility, removed, day, app, feature, using, used
Topic 3: bad, work, hate, inaccurate, use, friendly, like, hard, terrible, understand
Topic 4: reminder, pill, notification, app, take, update, control, birth, taking, back
Topic 5: update, new, save, track, click, back, old, use, app, version
Topic 6: free, pay, subscription, money, paid, trial, cancel, charged, refund, charge
Topic 7: worst, app, fake, hate, work, ever, good, use, apps, install
Topic 8: la, de, que, para, con, se, mi, no, era, aplicacin
Topic 9: ad, pop, subscription, premium, plus, every, buy, popups, annoying, time
Topic 10: pregnancy, pregnant, period, tracking, track, cycle, mode, app, get, clue
Topic 11: 13, age, period, younger, girl, 11, 12, young, get, old
T