In [4]:
import os
import pathlib
from data_processor import TopicModelDataPreprocessor
from fastopic import FASTopic
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Combine multiple stop word sources
def get_comprehensive_stopwords():
    nltk_stops = set(stopwords.words('english'))
    sklearn_stops = ENGLISH_STOP_WORDS
    custom_stops = {
        'et', 'al', 'introduction', 'conclusion', 'method', 'methodology', 
        'results', 'discussion', 'references', 'appendix', 'table', 'figure'
    }
    return nltk_stops.union(sklearn_stops).union(custom_stops)

# Advanced text cleaning function
def advanced_preprocess(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Get comprehensive stop words
    stop_words = get_comprehensive_stopwords()
    
    # Advanced filtering and lemmatization
    cleaned_tokens = [
        lemmatizer.lemmatize(token) 
        for token in tokens 
        if (token not in stop_words and 
            len(token) > 2 and  # Remove very short tokens
            not token.isdigit())  # Remove pure digit tokens
    ]
    
    return ' '.join(cleaned_tokens)

# Set the path to your PDF folder
all_pdf_folder_path = '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/huesmann'
path_all_pdf_folder_path = pathlib.Path(all_pdf_folder_path)

# Process the PDF files
topic_model_processor = TopicModelDataPreprocessor(all_pdf_folder_path=path_all_pdf_folder_path)
docs = topic_model_processor.get_and_process_pdf_files(path_all_pdf_folder_path)

# Preprocess documents using advanced preprocessing function
processed_docs_text = [advanced_preprocess(" ".join(doc)) for doc in docs]

# Initialize FastTopic model with a reasonable number of topics
num_topics = 10  # Adjust this number based on your needs and dataset size
model = FASTopic(num_topics=num_topics)

# Fit the model on the preprocessed documents and get topic-word distributions and document-topic distributions
topic_top_words, doc_topic_dist = model.fit_transform(processed_docs_text)

# Print top words for each topic to identify main topics
print("Top words for each topic:")
for topic_idx in range(num_topics):
    top_words = model.get_topic(topic_idx)
    print(f"Topic {topic_idx}: {', '.join([word for word, prob in top_words[:5]])}")

# Print overall topic distribution across all documents to find dominant topics
overall_dist = np.mean(doc_topic_dist, axis=0)
top_topics = np.argsort(overall_dist)[::-1][:7]
print("\nTop 7 topics by overall distribution:")
for topic in top_topics:
    top_words = model.get_topic(topic)
    print(f"Topic {topic} ({overall_dist[topic]:.4f}): {', '.join([word for word, prob in top_words[:5]])}")

2024-12-09 11:11:43,056 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-12-09 11:11:43,058 - INFO - NumExpr defaulting to 8 threads.
2024-12-09 11:12:28,233 - INFO - generated new fontManager


TypeError: TopicModelDataPreprocessor.__init__() got an unexpected keyword argument 'all_pdf_folder_path'