In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import nltk
import re

# Download the NLTK "stopwords" corpus; stopwords.words("english") in the
# preprocessing step reads from it.
nltk.download("stopwords")


In [None]:
# Read the sample movie-reviews CSV directly from its raw GitHub URL
url = "https://raw.githubusercontent.com/datasets/movie-reviews-dataset/master/data/movies_reviews.csv"
df = pd.read_csv(url)

# Preview the first rows, then report the frame's shape
print("Dataset Sample:", df.head(), sep="\n")
print("\nDataset Shape:", df.shape)


In [None]:
# Preprocessing function
_ENGLISH_STOPWORDS = None  # lazily-filled cache of NLTK's English stopword list


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, stop_words=None):
    """Lowercase a review, strip everything but letters/whitespace, drop stopwords.

    Args:
        text: Raw review text. ``None`` and NaN (how missing CSV cells arrive)
            are treated as an empty document; any other non-string value is
            converted with ``str()`` first.
        stop_words: Optional iterable of words to remove. When omitted, NLTK's
            English stopword list is used; it is loaded once and cached rather
            than re-read for every word.

    Returns:
        The cleaned text: the surviving words joined by single spaces.
    """
    # Missing values show up as None or float NaN (NaN is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stop_words is None:
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)  # constant-time membership tests

    text = str(text).lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    return " ".join(word for word in text.split() if word not in stop_words)

# Clean every review and store the result alongside the raw text
df['cleaned_text'] = df['review'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first rows
print("\nCleaned Text Sample:", df[['review', 'cleaned_text']].head(), sep="\n")


In [None]:
# Create a document-term matrix
# max_df=0.9 ignores terms found in more than 90% of documents, min_df=10
# ignores terms found in fewer than 10 documents, and stop_words="english"
# applies scikit-learn's built-in English stopword list in addition to the
# NLTK stopword removal already done by preprocess_text.
vectorizer = CountVectorizer(max_df=0.9, min_df=10, stop_words="english")
dtm = vectorizer.fit_transform(df['cleaned_text'])

# Shape is (number of documents, number of retained vocabulary terms)
print("\nDocument-Term Matrix Shape:", dtm.shape)


In [None]:
# Initialize and fit the LDA model
# n_components=5 requests five topics; random_state=42 fixes the seed so that
# re-running the notebook reproduces the same topics.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Display topics and top words
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of each topic in a fitted topic model.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic's per-word weights.
        feature_names: Indexable sequence mapping a column index to its word.
        num_top_words: How many words to list for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"\nTopic #{topic_idx + 1}:")
        # argsort is ascending: flip it, then keep the leading entries.
        ranked_columns = weights.argsort()[::-1][:num_top_words]
        top_words = [feature_names[column] for column in ranked_columns]
        print(", ".join(top_words))

# Display top words for each topic
# The vectorizer's feature names are passed so display_topics can turn
# column indices of lda.components_ back into words; 10 words are shown per topic.
print("\nTop Words per Topic:")
display_topics(lda, vectorizer.get_feature_names_out(), num_top_words=10)


In [None]:
# Score every document against every topic, then keep the best-scoring topic
topic_assignments = lda.transform(dtm)
# +1 so the stored label matches the 1-based "Topic #" numbering printed earlier
df['topic'] = topic_assignments.argmax(axis=1) + 1

# Show the assigned topic next to the original review for the first rows
print("\nDocument Topics Assigned:", df[['review', 'topic']].head(), sep="\n")


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate one word cloud per topic, sizing each word by its weight in that topic
feature_names = vectorizer.get_feature_names_out()  # same for every topic, so fetch once
num_cloud_words = 15  # number of highest-weighted words drawn per topic

for topic_idx, topic in enumerate(lda.components_):
    # Indices of the highest-weighted words, largest weight first
    top_indices = topic.argsort()[:-num_cloud_words - 1:-1]

    # Hand WordCloud the topic-word weights directly. Feeding generate() a
    # space-joined string would count each word exactly once, so word size
    # would not reflect importance, and the text would also pass through
    # WordCloud's own tokenizer and stopword filter.
    word_weights = {str(feature_names[i]): float(topic[i]) for i in top_indices}
    wordcloud = WordCloud(
        width=800, height=400, background_color="white"
    ).generate_from_frequencies(word_weights)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for Topic #{topic_idx + 1}")
    plt.show()
