<a href="https://colab.research.google.com/github/johnmatsson/MachineLearningNotebooks/blob/master/LDALatent_Dirichlet_allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install gensim nltk




In [2]:
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import gensim

# Download the necessary NLTK data files (run this once)
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Load the Reuters dataset
documents = [reuters.raw(fileid) for fileid in reuters.fileids()]

# Step 2: Preprocess the text
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]  # Lemmatize and remove non-alphabetic tokens
    tokens = [token for token in tokens if token not in stop_words]  # Remove stop words
    return tokens

# Apply preprocessing to all documents
preprocessed_documents = [preprocess(doc) for doc in documents]

# Step 3: Create a Dictionary and Corpus for Gensim LDA
dictionary = Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

# Step 4: Train the LDA Model
num_topics = 10  # You can adjust this number based on your needs
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, passes=10, iterations=100)

# Display the topics
for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=5):
    print(f"Topic {idx + 1}: {topic}")

# Step 5: Assign topics to documents
# Get the topic distribution for a single document
def get_document_topics(doc):
    bow = dictionary.doc2bow(preprocess(doc))
    return lda_model.get_document_topics(bow)

# Example: Display topics for the first document
doc_id = 0
print(f"\nTopics for Document {doc_id + 1}:")
print(get_document_topics(documents[doc_id]))


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Topic 1: 0.044*"said" + 0.032*"lt" + 0.022*"company" + 0.015*"inc" + 0.014*"ha"
Topic 2: 0.032*"said" + 0.017*"trade" + 0.016*"would" + 0.010*"price" + 0.009*"wa"
Topic 3: 0.112*"pct" + 0.047*"january" + 0.047*"february" + 0.026*"price" + 0.024*"rose"
Topic 4: 0.038*"tonne" + 0.028*"said" + 0.023*"mln" + 0.015*"export" + 0.013*"wheat"
Topic 5: 0.041*"ct" + 0.033*"stg" + 0.031*"april" + 0.029*"mln" + 0.028*"dividend"
Topic 6: 0.043*"billion" + 0.032*"pct" + 0.030*"year" + 0.029*"said" + 0.016*"mln"
Topic 7: 0.036*"said" + 0.023*"bank" + 0.017*"rate" + 0.015*"market" + 0.013*"wa"
Topic 8: 0.082*"share" + 0.043*"said" + 0.034*"offer" + 0.033*"stock" + 0.028*"lt"
Topic 9: 0.047*"said" + 0.036*"dlrs" + 0.028*"mln" + 0.022*"company" + 0.020*"oil"
Topic 10: 0.136*"v" + 0.109*"mln" + 0.068*"ct" + 0.067*"net" + 0.054*"loss"

Topics for Document 1:
[(0, 0.105425276), (1, 0.6939672), (5, 0.16130368), (6, 0.024408823), (8, 0.013691739)]


In [9]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [10]:

import pyLDAvis.gensim


# Visualize the LDA topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis
