In [1]:
import os
# from nltk.corpus import stopwords
# from gensim import corpora, models
# import pyLDAvis
# import pyLDAvis.gensim

# nltk.download('stopwords')
# stop_words = stopwords.words('english')
# nltk.download('punkt_tab')
#TODO: Make it pick out stop words somehow - not a priority


from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import nltk
import re

def preprocess_with_bigrams(docs, min_word_length=3, remove_stopwords=True, 
                          lemmatize=True, min_count=5, threshold=100):
    """
    Preprocess documents and detect common bigram phrases.
    
    Args:
        docs (list): List of document strings
        min_word_length (int): Minimum word length to keep
        remove_stopwords (bool): Whether to remove stopwords
        lemmatize (bool): Whether to lemmatize words
        min_count (int): Minimum frequency for bigram detection
        threshold (float): Threshold for bigram detection scoring
        
    Returns:
        list: List of processed documents with bigrams
    """
    # Download required NLTK data
    try:
        nltk.download('stopwords', quiet=True)

        nltk.download('wordnet', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('averaged_perceptron_tagger_eng')
        nltk.download('punkt', quiet=True)
        nltk.download('omw-1.4', quiet=True)
    except Exception as e:
        print(f"Warning: Some NLTK downloads failed: {str(e)}")
    
    # Initialize tools
    stop_words = set(stopwords.words('english'))
    custom_stops = {'from', 'subject', 're', 'edu', 'use', 'amanda', 'kowalski', 'nber'}
    stop_words.update(custom_stops)
    lemmatizer = WordNetLemmatizer()
    
    def clean_text(text):
        """Remove URLs, special characters, and extra whitespace."""
        # Remove URLs and email addresses
        text = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+', '', text)
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Remove extra whitespace
        return ' '.join(text.split()).strip()
    
    def get_wordnet_pos(tag):
        """Convert Penn Treebank tags to WordNet POS tags."""
        tag_map = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV
        }
        return tag_map.get(tag[0], wordnet.NOUN)
    
    # First pass: basic preprocessing
    processed_docs = []
    for doc in docs:
        # Clean and tokenize
        clean_doc = clean_text(doc.lower())
        tokens = word_tokenize(clean_doc)
        
        # Process tokens
        if lemmatize:
            # POS tagging for better lemmatization
            pos_tags = pos_tag(tokens)
            processed_tokens = []
            
            for token, pos in pos_tags:
                if (token.isalpha() and 
                    len(token) >= min_word_length and
                    (not remove_stopwords or token not in stop_words)):
                    lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos))
                    processed_tokens.append(lemma)
        else:
            processed_tokens = [
                token for token in tokens
                if token.isalpha() and 
                len(token) >= min_word_length and
                (not remove_stopwords or token not in stop_words)
            ]
        
        processed_docs.append(processed_tokens)
    
    # Second pass: detect and add bigrams
    bigram = Phrases(processed_docs, 
                    min_count=min_count,
                    threshold=threshold)
    bigram_model = Phraser(bigram)
    
    # Apply bigram detection
    docs_with_bigrams = [bigram_model[doc] for doc in processed_docs]
    
    return docs_with_bigrams


ModuleNotFoundError: No module named 'fitz'

In [1]:
import fitz  # PyMuPDF

def create_docs(pdf_files):
    docs = []
    for pdf_file in pdf_files:
        with fitz.open(pdf_file) as pdf:
            text = ""
            for page_num in range(len(pdf)):
                page = pdf.load_page(page_num)
                text += page.get_text()
            docs.append(text)
    return docs

In [3]:
# def preprocess_documents(docs):
#     processed_docs = []
#     for doc in docs:
#         tokens = nltk.word_tokenize(doc.lower())
#         tokens = [word for word in tokens if word.isalpha() and word not in stop_words and len(word) > 2]
#         processed_docs.append(tokens)    
#     return processed_docs

In [3]:
author_folder_path = '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/aekowals/'
if os.path.isdir(author_folder_path):
        pdf_files = [os.path.join(author_folder_path, f) for f in os.listdir(author_folder_path)
            if os.path.isfile(os.path.join(author_folder_path, f)) and f.endswith('.pdf')]
        docs = create_docs(pdf_files)
        # processed_docs = preprocess_documents(docs)
        # print(docs)
        # print(processed_docs)
        processed = preprocess_with_bigrams(
        docs,
        min_count=2,  # Lower threshold for this small example
        threshold=10   # Lower threshold for this small example
        )
        
        # Print a sample of detected bigrams
        print("\nProcessed documents with bigrams:")
        for i, doc in enumerate(processed[:2], 1):
            print(f"\nDoc {i}:", doc)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/hudah/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


: 

In [6]:
import tomotopy as tp

hdp = tp.HDPModel(tw=tp.TermWeight.IDF, min_cf=5, rm_top=7,
                 gamma=1, alpha=0.1, initial_k=10, seed=99999)

# Add docs to train
for vec in processed:
    hdp.add_doc(vec)

# Initiate MCMC burn-in 
hdp.burn_in = 100
hdp.train(0)
print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs, ', Num words:', hdp.num_words)
print('Removed top words:', hdp.removed_top_words)

Num docs: 30 , Vocab size: 5367 , Num words: 149217
Removed top words: ['estimate', 'individual', 'state', 'data', 'reform', 'hospital', 'health_insurance']


  hdp.train(0)


: 

In [5]:
# from gensim.models import Phrases
# from gensim.models.phrases import Phraser

# bigram = Phrases(processed_docs, min_count=5, threshold=100)
# bigram_model = Phraser(bigram)
# word_bigrams = [bigram_model[w_vec] for w_vec in processed_docs]
# word_bigrams[2][:7]

['nber_working',
 'series',
 'behavior',
 'clinical_trial',
 'implications',
 'mammography',
 'guidelines']