In [8]:
import pymongo
import math
from gensim import models
from gensim.utils import simple_preprocess

# Define the preprocess_text function if it's not already defined
def preprocess_text(text):
    """Return *text* unchanged.

    Placeholder hook for project-specific cleaning; no preprocessing is
    implemented yet, so the input is handed straight back to the caller.
    """
    # Implement your preprocessing steps here
    cleaned = text
    return cleaned

def _collect_document_text(document):
    """Join the title, general text and content sections of *document*.

    The previous implementation appended the content-section *values* to a
    list of field names and then looked each one up with ``document.get``,
    so the section text was used as a key and never reached the model.
    Here the values themselves are collected.
    """
    parts = [document.get('title'), document.get('general')]
    content_sections = document.get('content_sections')
    if isinstance(content_sections, dict):
        # The dict values are treated as the section text.
        parts.extend(content_sections.values())
    elif isinstance(content_sections, (list, tuple)):
        parts.extend(content_sections)
    elif content_sections is not None:
        parts.append(content_sections)
    # Skip missing fields so the literal word "None" is never tokenized.
    return ' '.join(str(part) for part in parts if part is not None)


def assign_topics_to_documents(documents, output_file, lda_model):
    """Append the LDA topic distribution of each document to *output_file*.

    Args:
        documents: Iterable of mapping-like documents. Each must have an
            ``'_id'`` key; ``'title'``, ``'general'`` and
            ``'content_sections'`` are read when present.
        output_file: Path of the text file to append to. The file is opened
            in append mode, so repeated calls accumulate results.
        lda_model: Object exposing ``id2word.doc2bow(tokens)`` and
            ``get_document_topics(bow)``.

    Each document produces one ``Document ID: ...`` line, one
    ``Topic <id>: <probability>`` line per returned topic, and a blank
    separator line.
    """
    with open(output_file, 'a', encoding='utf-8') as f:
        for document in documents:
            text = _collect_document_text(document)
            processed_text = preprocess_text(text)
            tokens = simple_preprocess(processed_text)  # Tokenize the text
            bow_vector = lda_model.id2word.doc2bow(tokens)
            topics = lda_model.get_document_topics(bow_vector)
            # Write the document ID and corresponding topics to the file
            f.write(f"Document ID: {document['_id']}\n")
            f.writelines(f"Topic {topic}: {prob}\n" for topic, prob in topics)
            f.write("\n")

# Assuming this is your main script
if __name__ == "__main__":
    # Connect to MongoDB and get the collection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["WS_Data_DB"]
    collection = db["LogRhythm7_15Docs"]

    # Load the trained LDA model
    lda_model = models.LdaModel.load('C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\lda_model.pkl')

    # Batch processing of documents
    batch_size = 100
    total_documents = collection.count_documents({})
    total_batches = math.ceil(total_documents / batch_size)
    output_file = 'C:\\Users\\ted59\\Knapp069-Practicum-1-Project\\Processed Data\\topic_assignments.txt'
    for batch_index in range(total_batches):
        skip = batch_index * batch_size
        # Each batch is a separate query, so an explicit sort is required:
        # without it the server may return documents in a different order
        # per query and skip/limit pages could overlap or miss documents.
        documents = (
            collection.find()
            .sort("_id", pymongo.ASCENDING)
            .skip(skip)
            .limit(batch_size)
        )
        # Results are appended to output_file; re-running adds to the file.
        assign_topics_to_documents(documents, output_file, lda_model)
    print("Topic assignment completed. Results saved to topic_assignments.txt.")


Topic assignment completed. Results saved to topic_assignments.txt.
