In [20]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import spacy
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import os
from collections import Counter

target_column = 'rawContent'

# Download NLTK data
nltk.download('stopwords')

# Load data
df = pd.read_csv("your_data.csv")
texts = df[target_column].astype(str).tolist()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Initialize spaCy
nlp = spacy.load('en_core_web_sm')
# Cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

cleaned_texts = [clean_text(text) for text in texts]

# Tokenization and Lemmatization
def tokenize_lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.is_alpha and token.lemma_ not in stop_words]

tokenized_texts = [tokenize_lemmatize(text) for text in cleaned_texts]

# Assuming 'tokenized_texts' is your list of tokenized tweets
all_words = [word for text in tokenized_texts for word in text]
word_freq = Counter(all_words)
print(word_freq.most_common(20))

In [9]:
from gensim.models import CoherenceModel
custom_stopwords = {'ubisoft', 'game', 'go', 'get', 'I'}

stop_words = set(stopwords.words('english'))
stop_words.update(custom_stopwords)



# Create Dictionary and Corpus
dictionary = corpora.Dictionary(tokenized_texts)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Build LDA Model
max_topics = 100
step = 10
for num_topics in range(10, max_topics+step, step):
    print(f"Number of Topics: {num_topics}")
    num_topics = 10
    passes = 15
    random_state = 42

    lda_model = models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        update_every=1,
        chunksize=100,
        passes=passes,
        alpha='auto',
        workers=4,
        per_word_topics=True
    )
    
    

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f'Coherence Score: {coherence_score}')
    
    perplexity = lda_model.log_perplexity(corpus)
    print(f'Perplexity: {perplexity}')

    # Print Topics
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic: {idx}\nWords: {topic}\n")

    # Visualize Topics
    lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
    # Uncomment the line below to display the visualization in the notebook
    # pyLDAvis.display(data=lda_vis)
    pyLDAvis.save_html(lda_vis, f'lda_visualization_topic_{num_topics}.html')


Number of Topics: 10
Number of Topics: 20
Number of Topics: 30
Number of Topics: 40
Number of Topics: 50
Number of Topics: 60
Number of Topics: 70
Number of Topics: 80
Number of Topics: 90
Number of Topics: 100


# Experiment with Different Models

In [10]:
from bertopic import BERTopic

# Initialize BERTopic
topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(texts)

# Explore topics
topic_model.get_topic_info()

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,62770,-1_they_but_it_like,"[they, but, it, like, them, their, and, dont, ...","[Just finished Assassin's Creed, and all I can..."
1,0,3214,0_rainbow6uk_titaniumrolo_ubisoftuk_lol,"[rainbow6uk, titaniumrolo, ubisoftuk, lol, pre...",[@TitaniumRolo @Rainbow6_UK @Rainbow6Game @Ubi...
2,1,1370,1_assassinsfr_assassinscreed_assassinsuk_ubiso...,"[assassinsfr, assassinscreed, assassinsuk, ubi...",[@CreedKells @Ubisoft @UbisoftDE @UbisoftFR @U...
3,2,910,2_servers_server_down_fix,"[servers, server, down, fix, ubisoftsupport, u...","[@Ubisoft servers down?, @Ubisoft are the serv..."
4,3,905,3_ac_ac3_ac4_ac2,"[ac, ac3, ac4, ac2, ac1, acu, origins, acs, ac...","[@Ubisoft @assassinscreed Best AC game, @Ubiso..."
...,...,...,...,...,...
2261,2260,10,2260_varsitygamingtv_crackheaded_httpstcofwbpz...,"[varsitygamingtv, crackheaded, httpstcofwbpz2j...",[@VarsityGamingTV Come on. You knew this would...
2262,2261,10,2261_fc5_mzammadkhan_chiranramgobin_thatsgroov...,"[fc5, mzammadkhan, chiranramgobin, thatsgroovy...",[@RustyPennyy What platform do you play FC5 on...
2263,2262,10,2262_promises_modern_return_httptcow6slz9vzw0,"[promises, modern, return, httptcow6slz9vzw0, ...",[Ubisoft Promises Modern Day Story Will Return...
2264,2263,10,2263_httptinyurlcombu64oy_httptcoey34fpn5_http...,"[httptinyurlcombu64oy, httptcoey34fpn5, httpbi...",[@FalseShepard we're working on getting more @...
