In [8]:
!pip install pdfminer.six
!pip install gensim
!pip install nltk
!pip install spacy



In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
import pandas as pd
import requests
import os
import re
from pdfminer.high_level import extract_text
import glob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

In [10]:
nlp = spacy.load("en_core_web_sm")

In [16]:
def preprocess_text(text):
    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Lemmatize text using spaCy
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS]
    
    return lemmatized_tokens

In [4]:
def clean_and_preprocess_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Preprocess text
    tokens = preprocess_text(text)
    
    return tokens

In [25]:
def clean_and_preprocess_directory(directory='/kaggle/working/txt_directory'):
    documents = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            tokens = clean_and_preprocess_file(file_path)
            documents[filename] = tokens
    return documents

In [30]:
#Increase the threshold with more documents
def perform_topic_modeling(directory='/kaggle/working/txt_directory', num_topics=5, max_freq_threshold=1000):
    # Clean and preprocess documents
    documents = clean_and_preprocess_directory(directory)
    
    # Count word frequencies across all documents
    all_words = [word for doc in documents.values() for word in doc]
    word_counts = Counter(all_words)
    
    # Identify words to be filtered based on their frequency
    frequent_words = [word for word, freq in word_counts.items() if freq >= max_freq_threshold * len(all_words)]
    
    # Remove frequent words from documents
    cleaned_documents = {file_name: [word for word in doc if word not in frequent_words] for file_name, doc in documents.items()}
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(cleaned_documents.values())
    corpus = [dictionary.doc2bow(doc) for doc in cleaned_documents.values()]
    
    # Build LDA model
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    
    # Get topics for each document
    document_topics = {}
    for file_name, doc_bow in zip(cleaned_documents.keys(), corpus):
        topics = lda_model.get_document_topics(doc_bow)
        document_topics[file_name] = topics
        
    for idx, topic in lda_model.print_topics(-1):
        print(f'Topic {idx}: {topic}\n')
    
    return lda_model, document_topics

In [None]:
glob.glob('/kaggle/working/pdf_directory/*.pdf')

In [None]:
df = pd.read_pickle('/kaggle/input/semantic-scholar-retrieval/retrieved_data')

In [None]:
def extract_pdf_from_response(df_row, pdf_directory='/kaggle/working/pdf_directory'):
    """
    This function takes the structure of the 'openAccessPdf' column of the semantic scholar
    api response dataset downloads the available pdfs in the specified directory, indexed
    with the paperid value.
    
    variables:
    - df_row: And iterable with a 'paperId' key and an 'openAccessPdf' key
    - pdf_directory: The folder in which the files will be stored
    """
    
    if not os.path.exists(pdf_directory):
        os.makedirs(pdf_directory)
    
    if df_row['openAccessPdf']:
        try:
            req = requests.get(df_row['openAccessPdf']['url'])
            if req.status_code == 200:
                with open(pdf_directory + '/' + df_row['paperId'] + '.pdf', 'wb') as f:
                    f.write(req.content)
        except Exception as e:
            pass

In [None]:
def extract_plain_text_from_pdf(pdf_directory='/kaggle/working/pdf_directory', txt_directory='/kaggle/working/txt_directory'):
    all_files = glob.glob(pdf_directory + '/*.pdf')
    
    if not os.path.exists(txt_directory):
        os.makedirs(txt_directory)
        
    for i in all_files:
        file_id = i.split('.')[0].split('/')[-1]
        print(i)
        whole_text = extract_text(i)
        with open(txt_directory + '/' + file_id + '.txt', 'w') as f:
            f.write(whole_text)

In [None]:
df[~df['openAccessPdf'].isna()].head(5).apply(extract_pdf_from_response, axis=1)

In [None]:
extract_plain_text_from_pdf()

In [31]:
lda_model, document_topics = perform_topic_modeling()

Topic 0: 0.010*"channel" + 0.009*"10" + 0.009*"OFDM" + 0.008*"org" + 0.008*"modulation" + 0.008*"2023" + 0.008*"frequency" + 0.007*"signal" + 0.007*"doi" + 0.007*"http"

Topic 1: 0.016*"image" + 0.015*"EC50" + 0.014*"occupancy" + 0.012*"1" + 0.012*"0" + 0.011*"I" + 0.011*"cluster" + 0.011*"voxel" + 0.010*"level" + 0.010*"use"

Topic 2: 0.000*"1" + 0.000*"system" + 0.000*"feature" + 0.000*"use" + 0.000*"5" + 0.000*"2" + 0.000*"0" + 0.000*"4" + 0.000*"I" + 0.000*"10"

Topic 3: 0.000*"feature" + 0.000*"1" + 0.000*"system" + 0.000*"0" + 0.000*"use" + 0.000*"test" + 0.000*"2" + 0.000*"item" + 0.000*"datum" + 0.000*"3"

Topic 4: 0.023*"feature" + 0.013*"system" + 0.012*"test" + 0.011*"use" + 0.011*"component" + 0.011*"item" + 0.010*"1" + 0.010*"code" + 0.009*"case" + 0.009*"function"



In [32]:
document_topics

{'272cc6496c3b3de74090de7451ffcd0d19351b92.txt': [(1, 0.9995926)],
 'fb64d10afa0c5270415abbf2f5ac33618e45e0ad.txt': [(4, 0.99987113)],
 '45272efd2973b6d1f830e6c01daeb9bbbb3769d6.txt': [(1, 0.9998108)],
 'ecdf7d78813ef689fc0238cc9e53e0b1ef686e2f.txt': [(0, 0.99975824)]}