# Topic Modeler
Python scripts for extracting text from PDF and TXT files, followed by text preprocessing using NLP methods and topic modeling of the preprocessed texts. Topic modeling can be used to perform preliminary data analysis to identify key themes and topics present in a corpus of text.
Implementation adapted from a [tutorial](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6) by Maarten Grootendorst.

Source: [Katerina Labrou](https://github.com/klabrou/topic-modeling). MIT License.



In [None]:
# Mount Google Drive into the runtime at /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import streamlit as st
import os
import pandas as pd
import numpy as np
import math
import csv
from PyPDF2 import PdfReader

In [None]:
def _read_pdf_row(pdf_path):
    """Return ``[author, title, subject, content]`` for the PDF at *pdf_path*."""
    with open(pdf_path, "rb") as pdf_file:
        pdf = PdfReader(pdf_file)

        # A PDF without a document-information dictionary has no metadata
        # object at all; fall back to empty fields instead of failing so the
        # page text is still extracted.
        metadata = pdf.metadata
        author = metadata.author if metadata is not None else None
        title = metadata.title if metadata is not None else None
        subject = metadata.subject if metadata is not None else None

        # Pages with no extractable text contribute an empty string.
        content = "".join(page.extract_text() or "" for page in pdf.pages)

    return [author, title, subject, content]


def _read_txt_row(txt_path):
    """Return ``[author, title, subject, content]`` for the text file at *txt_path*."""
    # Read as text, not bytes: csv.writer would otherwise store the bytes
    # repr (b'...') in the Content column. Undecodable bytes are replaced
    # rather than aborting the whole file.
    with open(txt_path, "r", encoding="utf-8", errors="replace") as txt_file:
        content = txt_file.read()

    # Plain text files carry no metadata; the Subject placeholder is a single
    # space, exactly as this script has always written it.
    return ["", "", " ", content]


def process_pdfs(directory, output_csv="pdf_data.csv"):
    """Extract text from the PDF and TXT files in *directory* into a CSV file.

    The CSV has the header ``Author, Title, Subject, Content`` followed by one
    row per successfully read file. Files are visited in sorted name order so
    the row order is reproducible between runs. Extensions are matched
    case-insensitively; every other file is ignored.

    Args:
        directory: Path of the folder to scan (not recursive).
        output_csv: Path of the CSV file to create or overwrite.

    Returns:
        None. Progress and per-file errors are reported with ``print``.
    """
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Author", "Title", "Subject", "Content"])

        for filename in sorted(os.listdir(directory)):
            print(filename)
            lowered = filename.lower()
            if lowered.endswith(".pdf"):
                read_row = _read_pdf_row
            elif lowered.endswith(".txt"):
                read_row = _read_txt_row
            else:
                continue

            file_path = os.path.join(directory, filename)
            try:
                row = read_row(file_path)
            except Exception as e:
                # Best effort: one unreadable or corrupt file must not abort
                # the whole extraction run, so report it and move on.
                print(f"Error processing {filename}: {str(e)}")
                continue

            writer.writerow(row)
            print(f"Processed {filename}")

    print("Extraction complete. CSV file generated.")

# Run the extraction over the current working directory; process_pdfs writes
# its result to pdf_data.csv.
directory_path = "./"
process_pdfs(directory_path)

In [None]:
# Load the extracted rows (columns: Author, Title, Subject, Content) into a DataFrame.
df = pd.read_csv('./pdf_data.csv')

In [None]:
# Text preprocessing: stopword removal followed by lemmatization.
# Approach follows https://medium.com/grabngoinfo/topic-modeling-with-deep-learning-using-python-bertopic-cf91f5676504
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

stopwords = nltk.corpus.stopwords.words('english')
# Set lookup instead of scanning the list once per word.
stopword_set = set(stopwords)

# A row whose Content field is empty in the CSV is loaded by pandas as NaN
# (a float), which has no .split(); treat such rows as empty documents.
content_text = df['Content'].fillna('').astype(str)

# Remove stopwords (case-insensitive match against the lowercase stopword list)
df['Content_without_stopwords'] = content_text.apply(
    lambda text: ' '.join(w for w in text.split() if w.lower() not in stopword_set)
)
# Lemmatization; stopwords were already dropped in the previous step, so no
# second filter is needed here.
df['Content_lemmatized'] = df['Content_without_stopwords'].apply(
    lambda text: ' '.join(wn.lemmatize(w) for w in text.split())
)

In [None]:
# Topic modelling cell: UMAP handles the dimension reduction step and
# BERTopic builds the topics from the lemmatized documents.
from umap import UMAP
from bertopic import BERTopic

# UMAP configuration; random_state is pinned to a fixed value.
umap_model = UMAP(
    n_neighbors=5,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=100,
)

# BERTopic is handed the UMAP instance configured above.
topic_model = BERTopic(
    umap_model=umap_model,
    language="english",
    calculate_probabilities=True,
    min_topic_size=2,
)

# Fit on the lemmatized text column and keep the per-document results.
documents = df['Content_lemmatized']
topics, probs = topic_model.fit_transform(documents)

In [None]:
# Show the fitted model's topic overview as this cell's output.
# NOTE(review): presumably one row per topic, including the outlier topic -- confirm.
topic_model.get_topic_info()

In [None]:
# Print the top terms of up to five fitted topics.
# get_topic() returns False for a topic id the model does not contain, so
# iterate over the ids the model actually reports instead of assuming that
# topics 0..4 exist (small corpora often yield fewer). Topic -1, the outlier
# bucket, is skipped.
# NOTE(review): terms per topic presumably follow BERTopic's top_n_words
# default (10) -- confirm if that setting is changed.
MAX_TOPICS_SHOWN = 5
fitted_topic_ids = sorted(
    topic_id for topic_id in topic_model.get_topics() if topic_id != -1
)
for topic_id in fitted_topic_ids[:MAX_TOPICS_SHOWN]:
    print(topic_model.get_topic(topic_id))