In [1]:
"""
This script is developed and tested with Python 3.6.13 environment.

Install the necessary libraries using pip to make sure the Python environment matches the versions

pip install PyPDF2==1.26.0
pip install guidedlda
pip install numpy==1.19.5
pip install nltk==3.5

"""
import os
import pathlib
import PyPDF2
import guidedlda
import numpy as np
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Set the path to your PDF folder
all_pdf_folder_path = '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/huesmann'
path_all_pdf_folder_path = pathlib.Path(all_pdf_folder_path)

def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfFileReader(file, strict=False)
            text = ''
            for page in range(reader.numPages):
                try:
                    text += reader.getPage(page).extractText()
                except Exception as e:
                    print(f"Error extracting text from page {page} of {pdf_path}: {str(e)}")
        return text
    except Exception as e:
        print(f"Error processing {pdf_path}: {str(e)}")
        return ""

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and 4 <= len(token) <= 20]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Extract text from all PDFs
all_texts = []
for pdf_file in path_all_pdf_folder_path.glob('*.pdf'):
    text = extract_text_from_pdf(str(pdf_file))
    if text:
        preprocessed_text = preprocess_text(text)
        all_texts.append(preprocessed_text)

# Create vocabulary
vocab = set()
for text in all_texts:
    words = text.split()
    vocab.update(words)

vocab = list(vocab)
word2id = {w: i for i, w in enumerate(vocab)}

# Create document-term matrix
X = np.zeros((len(all_texts), len(vocab)), dtype=np.int64)
for i, text in enumerate(all_texts):
    word_counts = Counter(text.split())
    for word, count in word_counts.items():
        if word in word2id:
            X[i, word2id[word]] = int(count)

# Define optimized seed topics based on the author list
seed_topics = {
    0: ['survey', 'methodology', 'statistics', 'sampling', 'data'],
    1: ['politics', 'policy', 'government', 'election', 'democracy'],
    2: ['health', 'medicine', 'wellbeing', 'psychology', 'behavior'],
    3: ['economics', 'finance', 'market', 'income', 'wealth'],
    4: ['sociology', 'demography', 'family', 'community', 'social'],
    5: ['education', 'learning', 'school', 'academic', 'student'],
    6: ['environment', 'climate', 'sustainability', 'ecology', 'urban'],
    7: ['communication', 'media', 'technology', 'internet', 'social media'],
    8: ['international', 'global', 'culture', 'migration', 'development']
}

# Create seed_topic_dict
seed_topic_dict = {}
for topic_id, words in seed_topics.items():
    for word in words:
        if word in word2id:
            seed_topic_dict[word2id[word]] = topic_id

# Apply Guided LDA
model = guidedlda.GuidedLDA(n_topics=9, n_iter=100, random_state=7, refresh=20)
model.fit(X, seed_topics=seed_topic_dict, seed_confidence=0.15)

# Print top words for each topic
n_top_words = 8
for i, topic_dist in enumerate(model.topic_word_):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print(f'Topic {i}: {", ".join(topic_words)}')

[nltk_data] Downloading package punkt to /home/sameerc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sameerc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sameerc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Error processing /nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/huesmann/xml.pdf: Could not read malformed PDF file


INFO:guidedlda:n_documents: 60
INFO:guidedlda:vocab_size: 23425
INFO:guidedlda:n_words: 193123
INFO:guidedlda:n_topics: 9
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -2135560
INFO:guidedlda:<20> log likelihood: -1682116
INFO:guidedlda:<40> log likelihood: -1647487
INFO:guidedlda:<60> log likelihood: -1632869
INFO:guidedlda:<80> log likelihood: -1623172
INFO:guidedlda:<99> log likelihood: -1619860


Topic 0: aggressivebehavior, crossref, childdevelopment, figure, table, etal, aggression, xfebruary
Topic 1: arousal, memory, film, clip, recall, subject, model, amnesia
Topic 2: time, item, positive, negative, model, would, size, rate
Topic 3: youth, trait, report, teacher, composite, score, sample, model
Topic 4: aggression, child, parent, factor, participant, huesmann, family, measure
Topic 5: violence, violent, medium, aggressive, behavior, effect, game, study
Topic 6: violence, exposure, author, child, wave, political, manuscript, israeli
Topic 7: data, table, study, problem, result, learning, however, used
Topic 8: aggression, child, belief, social, behavior, huesmann, aggressive, normative
