In [1]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
import matplotlib.pyplot as plt
import pyLDAvis.gensim

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('cleaned_youtube_comments.csv')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocess function: Tokenize, remove stop words, and lemmatize
def preprocess(text):
    # Tokenize
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    # Remove stopwords and non-alphabetic tokens
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    # Lemmatize each word
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

# Apply preprocessing to each comment
df['processed_comments'] = df['comment'].apply(preprocess)

# Preview the processed comments
print(df[['comment', 'processed_comments']].head())

# Join tokens back to strings for vectorization
df['processed_comments_str'] = df['processed_comments'].apply(lambda x: ' '.join(x))

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features
X = vectorizer.fit_transform(df['processed_comments_str'])

# Check the shape of the vectorized data
print(X.shape)

# Create a dictionary (mapping words to IDs)
dictionary = corpora.Dictionary(df['processed_comments'])

# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in df['processed_comments']]

# Apply LDA (find 5 topics)
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Show the topics and top words associated with each topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Optional: Visualize topics using pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

[nltk_data] Downloading package punkt to /Users/psylviana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/psylviana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/psylviana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             comment  \
0  if you enjoy gamespub videos a comment like or...   
1                                obrigado pela ajuda   
2  i am german and i hated franz kafkas books at ...   
3  thank you for your help on the ammo puzzle hon...   
4  whoa whoa whoawhos still listening in 2023 we ...   

                                  processed_comments  
0  [enjoy, gamespub, video, comment, like, sub, w...  
1                            [obrigado, pela, ajuda]  
2  [german, hated, franz, kafka, book, school, ch...  
3  [thank, help, ammo, puzzle, honestly, didnt, k...  
4  [whoa, whoa, whoawhos, still, listening, sick,...  
(32558, 1000)
(0, '0.049*"quietits" + 0.019*"mandalore" + 0.015*"think" + 0.012*"content" + 0.008*"russian"')
(1, '0.173*"game" + 0.106*"mark" + 0.018*"look" + 0.013*"make" + 0.010*"im"')
(2, '0.117*"made" + 0.107*"lying" + 0.058*"im" + 0.044*"never" + 0.040*"end"')
(3, '0.030*"like" + 0.018*"video" + 0.015*"play" + 0.013*"time" + 

In [3]:
import pandas as pd

# Assuming 'lda_model' is your trained LDA model and 'topics' contain the topics
topics = lda_model.print_topics(num_words=5)

# Extract topics and their top words
topic_data = []
for topic_id, topic in enumerate(topics):
    words = [word.split('*')[1].strip().replace('"', '') for word in topic[1].split(' + ')]
    topic_data.append({
        'Topic': f'Topic {topic_id + 1}', 
        'Top Words': ', '.join(words)
    })

# Create DataFrame from the extracted data
df_comparison = pd.DataFrame(topic_data)

# Display the DataFrame as a table (it will automatically display in Jupyter Notebooks)
df_comparison


Unnamed: 0,Topic,Top Words
0,Topic 1,"quietits, mandalore, think, content, russian"
1,Topic 2,"game, mark, look, make, im"
2,Topic 3,"made, lying, im, never, end"
3,Topic 4,"like, video, play, time, back"
4,Topic 5,"elevator, music, one, yes, button"


In [9]:
import pandas as pd
# Set Pandas display options to avoid truncating text in columns
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Avoid truncating text
pd.set_option('display.width', 1000)        # Increase the total display width

# Apply LDA with 10 topics (increase num_topics to 10)
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Assuming 'lda_model' is your trained LDA model and 'topics' contain the topics
topics = lda_model.print_topics(num_topics=10, num_words=8)

# Extract topics and their top words
topic_data = []
for topic_id, topic in enumerate(topics):
    words = [word.split('*')[1].strip().replace('"', '') for word in topic[1].split(' + ')]
    topic_data.append({
        'Topic': f'Topic {topic_id + 1}', 
        'Top Words': ', '.join(words)
    })

# Create DataFrame from the extracted data
df_comparison = pd.DataFrame(topic_data)

# Display the DataFrame as a table (it will automatically display in Jupyter Notebooks)
print(df_comparison)

      Topic                                                  Top Words
0   Topic 1     mark, video, time, stanley, elevator, back, play, love
1   Topic 2      quietits, minute, hard, moment, nice, bit, xd, little
2   Topic 3        bucket, ive, would, one, much, actually, ever, hole
3   Topic 4      game, make, button, pathologic, watch, want, im, shit
4   Topic 5         dont, know, think, good, look, youre, mandalore, u
5   Topic 6        got, played, play, whole, comment, one, seen, start
6   Topic 7        like, jim, ending, feel, song, part, really, review
7   Topic 8               yes, man, lol, hope, take, gon, get, someone
8   Topic 9          never, end, hour, long, work, anyone, said, dance
9  Topic 10  made, lying, im, bucket, watching, waiting, also, dancing
