In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
import matplotlib.pyplot as plt
import pyLDAvis.gensim

# Download the NLTK resources used by the preprocessing step:
#   punkt / punkt_tab -> tokenizer data behind word_tokenize
#   stopwords         -> stopwords.words('english') / stopwords.words('spanish')
#   wordnet           -> WordNetLemmatizer
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working on old and new installs.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


In [4]:
# Configure pandas so long comment text is shown in full instead of being truncated
for option_name, option_value in (
    ('display.max_columns', None),   # show every column
    ('display.max_colwidth', None),  # never shorten the text inside a cell
    ('display.width', 1000),         # widen the total display width
):
    pd.set_option(option_name, option_value)

In [None]:

# Lemmatizer instance used by preprocess() below
lemmatizer = WordNetLemmatizer()

# Read the comments CSV into a DataFrame
df = pd.read_csv('classified_comments.csv')

# Preprocess function: Tokenize, remove stop words, and lemmatize
# Hand-maintained Russian stop words; merged with NLTK's English and Spanish lists.
RUSSIAN_STOPWORDS = frozenset([
    'и','в','во','не','что','он','на','я','с','со','как','а','то',
    'все','она','так','его','но','да','ты','к','у','же','вы','за',
    'бы','по','ее','мне','было','вот','от','меня','еще','нет','о',
    'из','ему','теперь','когда','даже','ну','вдруг','ли','если',
    'уже','или','ни','быть','был','него','до','вас','нибудь',
    'опять','уж','вам','ведь','там','потом','себя','ничего','ей',
    'может','они','тут','где','есть','надо','ней','для','мы',
    'тебя','их','чем','была','сам','чтоб','без','будто','чего',
    'раз','тоже','себе','под','будет','ж','тогда','кто','этот',
    'того','потому','этого','какой','совсем','ним','здесь',
    'этом','один','почти','мой','тем','чтобы','нее','сейчас',
    'были','куда','зачем','всех','никогда','можно','при','наконец',
    'два','об','другой','хоть','после','над','больше','тот',
    'через','эти','нас','про','всего','них','какая','много',
    'разве','три','эту','моя','впрочем','хорошо','свою','этой',
    'перед','иногда','лучше','чуть','том','нельзя','такой',
    'им','более','всегда','конечно','всю','между'
])

# Filled on first use so the NLTK stop-word corpora are read once,
# not once per comment.
_BASE_STOPWORDS_CACHE = {}


def _base_stopwords():
    """Return the combined English, Spanish and Russian stop words (built once)."""
    if 'all' not in _BASE_STOPWORDS_CACHE:
        _BASE_STOPWORDS_CACHE['all'] = (
            frozenset(stopwords.words('english'))
            | frozenset(stopwords.words('spanish'))
            | RUSSIAN_STOPWORDS
        )
    return _BASE_STOPWORDS_CACHE['all']


def preprocess(text, extra_stopwords=None):
    """Tokenize a comment, drop stop words and non-alphabetic tokens, then lemmatize.

    Args:
        text: The raw comment. Anything that is not a string (for example the
            NaN pandas uses for an empty CSV cell) is treated as an empty comment.
        extra_stopwords: Optional iterable of additional words to remove. When
            omitted, a notebook-level ``custom_stopwords`` collection is used if
            one has been defined; otherwise no extra words are removed.

    Returns:
        A list of lemmatized, lower-cased, alphabetic tokens.
    """
    # Non-text values have no .lower(); return an empty token list instead of crashing.
    if not isinstance(text, str):
        return []

    if extra_stopwords is None:
        # `custom_stopwords` is not defined in the visible notebook cells. Use it
        # when another cell provides it, rather than failing with NameError on a
        # fresh kernel.
        extra_stopwords = globals().get('custom_stopwords', ())

    stop_words = _base_stopwords() | frozenset(extra_stopwords)

    # Lower-case before tokenizing so the stop-word lookup is case-insensitive.
    tokens = word_tokenize(text.lower())

    # Filter first, then lemmatize only the tokens that survive.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Tokenize, filter and lemmatize every comment
df['processed_comments'] = df['comment'].apply(preprocess)

# Show the first rows of raw text next to their token lists
print(df.loc[:, ['comment', 'processed_comments']].head())

# TF-IDF expects strings, so glue each token list back together with spaces
df['processed_comments_str'] = df['processed_comments'].apply(' '.join)

# TF-IDF matrix, capped at 1000 features
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['processed_comments_str'])

# Report the dimensions of the TF-IDF matrix
print(X.shape)

# gensim dictionary mapping each distinct token to an integer id
dictionary = corpora.Dictionary(df['processed_comments'])

# Convert every token list to gensim's bag-of-words form
corpus = list(map(dictionary.doc2bow, df['processed_comments']))

                                                                                                                                                                            comment                                                                                                                          processed_comments
0                                if you enjoy gamespub videos a comment like or sub would be highly appreciated it means a lot for us you can also follow us on twitter for updates                                   [enjoy, gamespub, video, sub, would, highly, appreciated, mean, lot, u, also, follow, u, twitter, update]
1                                                                                                                                                               obrigado pela ajuda                                                                                                                     [obrigado, pela, ajuda]
2  i am german and i hated franz kafkas 

In [9]:
# Fit an LDA model with 10 topics, making 15 passes over the corpus.
# LDA training is randomized; pinning random_state makes the topics repeatable
# across kernel restarts.
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=42)

# Print every topic with its five highest-weighted words
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Optional: interactive topic visualization with pyLDAvis.
# NOTE(review): recent pyLDAvis releases expose this helper as
# `pyLDAvis.gensim_models` instead of `pyLDAvis.gensim` — confirm against the
# installed version and adjust the import cell if needed.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

(0, '0.041*"gta" + 0.041*"best" + 0.033*"awesome" + 0.029*"ever" + 0.028*"would"')
(1, '0.092*"look" + 0.034*"please" + 0.026*"well" + 0.023*"take" + 0.020*"thank"')
(2, '0.043*"character" + 0.043*"amazing" + 0.042*"lol" + 0.027*"finally" + 0.023*"nice"')
(3, '0.046*"love" + 0.037*"really" + 0.037*"leon" + 0.026*"make" + 0.024*"world"')
(4, '0.046*"cd" + 0.043*"red" + 0.028*"projekt" + 0.026*"new" + 0.021*"pc"')
(5, '0.054*"trailer" + 0.035*"cyberpunk" + 0.029*"year" + 0.027*"time" + 0.023*"first"')
(6, '0.031*"cant" + 0.023*"wait" + 0.021*"fucking" + 0.020*"good" + 0.017*"thanks"')
(7, '0.053*"gameplay" + 0.038*"made" + 0.029*"city" + 0.023*"full" + 0.018*"это"')
(8, '0.062*"man" + 0.046*"person" + 0.035*"ready" + 0.021*"little" + 0.020*"style"')
(9, '0.044*"dont" + 0.036*"know" + 0.034*"see" + 0.032*"get" + 0.032*"watch"')


In [10]:
# Build a table with the top 15 words of every topic in the trained `lda_model`.
# (pandas is already imported as `pd` in the notebook's import cell.)

# Formatted (topic_id, '0.041*"word" + ...') pairs. num_topics=-1 asks gensim for
# every topic; the default would stop at 20.
topics = lda_model.print_topics(num_topics=-1, num_words=15)

# One row per topic. The label uses the topic id reported by the model (shown
# 1-based) rather than the row position, and the words come from show_topic()
# as (word, weight) pairs instead of being parsed out of the formatted string.
topic_data = [
    {
        'Topic': f'Topic {topic_id + 1}',
        'Top Words': ', '.join(
            word for word, _weight in lda_model.show_topic(topic_id, topn=15)
        ),
    }
    for topic_id, _formatted in topics
]

# Create DataFrame from the extracted data
df_comparison = pd.DataFrame(topic_data)

# Leave the frame as the cell's last expression so Jupyter renders it as a table
df_comparison


Unnamed: 0,Topic,Top Words
0,Topic 1,"gta, best, awesome, ever, would, movie, need, hope, omg, big, fan, else, left, seen, wish"
1,Topic 2,"look, please, well, take, thank, cool, hyped, right, dark, dont, youre, last, view, already, remake"
2,Topic 3,"character, amazing, lol, finally, nice, played, v, sound, dead, main, ex, anyone, believe, single, as"
3,Topic 4,"love, really, leon, make, world, much, want, music, evil, god, zombie, blade, real, runner, claire"
4,Topic 5,"cd, red, projekt, new, pc, looking, watching, ciri, got, original, start, definitely, hey, excited, graphic"
5,Topic 6,"trailer, cyberpunk, year, time, first, shit, fuck, go, still, car, get, witcher, resident, long, holy"
6,Topic 7,"cant, wait, fucking, good, thanks, great, got, gon, show, money, thats, come, going, waiting, think"
7,Topic 8,"gameplay, made, city, full, это, end, film, ghost, shell, lying, might, least, kinda, puzzle, away"
8,Topic 9,"man, person, ready, little, style, next, enough, bit, somebody, high, jorji, sorry, took, especially, gamer"
9,Topic 10,"dont, know, see, get, watch, hype, release, let, damn, never, future, people, many, guy, even"


In [None]:
# Display settings for the plain-text table printed at the end of this cell.
# (pandas is already imported as `pd` in the notebook's import cell.)
pd.set_option('display.max_columns', None)   # Display all columns
pd.set_option('display.max_colwidth', None)  # Avoid truncating text
pd.set_option('display.width', 1000)         # Keep each row on a single line

# Train a 10-topic LDA model (15 passes). LDA training is randomized, so the
# seed is pinned to keep the resulting topics repeatable across runs.
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=42)

# Formatted (topic_id, '0.041*"word" + ...') pairs, eight words per topic
topics = lda_model.print_topics(num_topics=10, num_words=8)

# One row per topic. The label uses the topic id reported by the model (shown
# 1-based) rather than the row position, and the words come from show_topic()
# as (word, weight) pairs instead of being parsed out of the formatted string.
topic_data = [
    {
        'Topic': f'Topic {topic_id + 1}',
        'Top Words': ', '.join(
            word for word, _weight in lda_model.show_topic(topic_id, topn=8)
        ),
    }
    for topic_id, _formatted in topics
]

# Create DataFrame from the extracted data
df_comparison = pd.DataFrame(topic_data)

# Print as plain text; the display options above keep the word lists untruncated
print(df_comparison)