# Topic Modeling using LDA (Latent Dirichlet Allocation)

In [11]:
!pip install langdetect

import pandas as pd
import ast
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv("iPhone reviews data for sentiment analysis.csv")

# Language detection with safe fallback
def safe_detect_lang(text):
    try:
        return detect(str(text))
    except LangDetectException:
        return 'unknown'

df['language'] = df['review'].apply(safe_detect_lang)
df = df[df['language'] == 'en'].reset_index(drop=True)

# Text cleaning setup
stop_words = set(stopwords.words('english'))
custom_words = ['phone', 'iphone', 'device', 'apple'] # Filter common words for better results
stop_words.update(custom_words)
lemmatizer = WordNetLemmatizer()

def clean_tokenize(text):
    tokens = word_tokenize(str(text).lower())
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

# Clean and tokenize reviews
df['clean_tokens'] = df['review'].apply(clean_tokenize)
preprocessed_documents = df['clean_tokens'].tolist()

# Create dictionary and corpus
dictionary = corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

# Assign dominant topic to each document
article_labels = []
for doc in preprocessed_documents:
    bow = dictionary.doc2bow(doc)
    topics = lda_model.get_document_topics(bow)
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    article_labels.append(dominant_topic)

# Save results
df_result = pd.DataFrame({
    'tokenized': preprocessed_documents,
    'topic': article_labels
})
df_result.to_csv("topic_results.csv", index=False)

# Print top keywords for each topic
print("\nTop Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

# Coherence score
coherence_model = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
print(f"\nCoherence Score: {coherence_model.get_coherence():.3f}")



[nltk_data] Downloading package punkt to C:\Users\isham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\isham/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Top Terms for Each Topic:
Topic 0:
- "great" (weight: 0.029)
- "battery" (weight: 0.029)
- "good" (weight: 0.021)
- "quality" (weight: 0.017)
- "screen" (weight: 0.013)
- "life" (weight: 0.012)
- "excellent" (weight: 0.012)
- "condition" (weight: 0.011)
- "use" (weight: 0.010)
- "fast" (weight: 0.009)

Topic 1:
- "screen" (weight: 0.017)
- "condition" (weight: 0.014)
- "good" (weight: 0.013)
- "scratch" (weight: 0.012)
- "battery" (weight: 0.012)
- "camera" (weight: 0.009)
- "time" (weight: 0.008)
- "excellent" (weight: 0.007)
- "got" (weight: 0.007)
- "amazon" (weight: 0.007)

Topic 2:
- "new" (weight: 0.020)
- "like" (weight: 0.013)
- "would" (weight: 0.011)
- "battery" (weight: 0.010)
- "look" (weight: 0.009)
- "work" (weight: 0.008)
- "use" (weight: 0.008)
- "could" (weight: 0.007)
- "get" (weight: 0.007)
- "everything" (weight: 0.007)

Topic 3:
- "service" (weight: 0.009)
- "got" (weight: 0.009)
- "great" (weight: 0.009)
- "work" (weight: 0.008)
- "issue" (weight: 0.007)
- "new" 

# RESULTS
**Topic 0 centers on product satisfaction, highlighting battery life and screen quality.**

**Topic 1 reveals concerns about device condition, especially scratches and camera.**

**Topic 2 captures how customers feel about the phone's appearance and usability.**

**Topic 3 focuses on service-related experiences, including support and delivery.**