In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings

# Suppress warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# Make sure every NLTK resource the script relies on is installed. Each
# resource is checked on its own so that only the missing ones are fetched.
# 'punkt_tab' is listed in addition to 'punkt' because recent NLTK releases
# load it (rather than the pickled 'punkt' model) inside word_tokenize; with
# only 'punkt' present the check would pass and tokenisation would still fail.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
)
for resource_path, package_id in _NLTK_RESOURCES:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id)

# Read the review export from the working directory and preview the first
# rows as a quick sanity check that the file parsed as expected
data = pd.read_csv("reviews.csv")
print(data.head())

# Keep only the text of low-rated reviews (score of 1 or 2)
is_negative = data['score'].isin([1, 2])
negative_reviews = data.loc[is_negative, 'content']

_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words, loading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Strip stop words and non-alphabetic tokens from a review.

    Args:
        text: The raw review text. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing CSV cell) is treated
            as an empty review instead of being passed to the tokenizer.

    Returns:
        The surviving tokens joined by single spaces, in their original
        order and casing. Returns ``''`` when nothing survives or when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Fetch the cached set rather than rebuilding it from the corpus on
    # every call; this function is mapped over every review.
    stop_words = _english_stop_words()
    return ' '.join(
        token
        for token in word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    )

# Normalise every negative review and keep the cleaned text in a frame
texts = negative_reviews.map(clean_text)
df = pd.DataFrame({'review': texts})

# Turn the cleaned reviews into TF-IDF vectors and group them with k-means
# (five clusters, fixed seed)
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(df['review'])

model = KMeans(n_clusters=5, random_state=500)
labels = model.fit_predict(matrix)
df['category'] = labels

# For each cluster, report the single term with the largest summed TF-IDF
# weight over the reviews assigned to that cluster
terms = vectorizer.get_feature_names_out()


def _top_term(cluster_id):
    """Return the highest-weighted term of one cluster as a result row.

    The 'frequency' value is the TF-IDF weight of that term summed over the
    cluster's rows of ``matrix``, not a raw occurrence count.
    """
    # Row positions in ``matrix`` line up with row positions in ``df``
    member_rows = np.flatnonzero(df['category'].to_numpy() == cluster_id)
    weights = np.asarray(matrix[member_rows].sum(axis=0)).ravel()
    best = weights.argmax()
    return {
        'category': cluster_id,
        'term': terms[best],
        'frequency': weights[best]
    }


top_terms = [_top_term(cluster_id) for cluster_id in range(model.n_clusters)]

result_df = pd.DataFrame(top_terms)
print(result_df)


                                             content  score
0                      I cannot open the app anymore      1
1  I have been begging for a refund from this app...      1
2  Very costly for the premium version (approx In...      1
3  Used to keep me organized, but all the 2020 UP...      1
4                                Dan Birthday Oct 28      1
   category           term   frequency
0         0  notifications   37.691067
1         1            app   68.970459
2         2        premium   48.859837
3         3            app  118.515618
4         4        version   67.390991
