In [None]:
# Install the third-party packages that the cells below import, plus the small
# English spaCy model.
# NOTE(review): versions are not pinned, so a re-run may resolve different releases.
!pip install PyPDF2 sentence-transformers bertopic hdbscan umap-learn spacy plotly
!python -m spacy download en_core_web_sm

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-p

In [None]:
import os
import io
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
from tqdm import tqdm
import PyPDF2
from google.colab import files
import spacy
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
import umap
from sklearn.cluster import KMeans
import re

def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The page texts concatenated in page order, each followed by a
        newline, or ``None`` when the file cannot be opened or parsed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # If a page yields no text (None), count it as an empty page so a
            # single such page does not throw away the rest of the document.
            page_texts = [
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            ]
        # Join once instead of growing a string page by page.
        return "".join(page_texts)
    except Exception as e:
        # Best effort: callers treat None as "document could not be read".
        print(f"Error reading PDF: {e}")
        return None

def preprocess_text(text):
    """Clean and normalise extracted text.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The text lower-cased, with every character other than ASCII letters,
        digits, whitespace, ``.`` and ``-`` replaced by a space, all runs of
        whitespace collapsed to one space, and the ends trimmed. Returns an
        empty string for empty input.
    """
    if not text:
        return ""

    # Keep numbers, '.' and '-' for technical documents; everything else
    # becomes a space.
    text = re.sub(r'[^a-zA-Z0-9\s\.\-]', ' ', text)

    # Collapse whitespace only after the substitution above, because that
    # substitution itself creates runs of spaces (e.g. ", " -> "  ").
    text = re.sub(r'\s+', ' ', text)

    return text.lower().strip()

def split_into_chunks(text, chunk_size=200):
    """Split text into chunks for embedding.

    Sentences (split on runs of ``.``, ``!`` or ``?``) are packed into chunks
    of roughly ``chunk_size`` characters. When that yields fewer than five
    chunks, overlapping windows of ``chunk_size`` words (advancing by half a
    window) are tried instead, and used only if they give more chunks than
    the sentence-based split.

    Args:
        text: Text to split.
        chunk_size: Target size: characters for the sentence-based split,
            words for the overlapping fallback.

    Returns:
        A list of non-empty chunk strings (empty for empty text).
    """
    sentences = re.split(r'[.!?]+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += " " + sentence
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Too few chunks for meaningful analysis: try overlapping word windows.
    if len(chunks) < 5:
        words = text.split()
        # Clamp to 1 so a tiny chunk_size cannot give range() a zero step.
        window = max(1, chunk_size)
        step = max(1, chunk_size // 2)
        overlapping = [
            ' '.join(words[start:start + window])
            for start in range(0, len(words), step)
        ]
        # Keep the sentence-based chunks unless the fallback actually helps;
        # for a short text it can collapse everything into one or two chunks.
        if len(overlapping) > len(chunks):
            chunks = overlapping

    return chunks

def perform_topic_modeling(texts, method='bertopic', num_topics=None):
    """Embed the texts and group them into topics.

    Args:
        texts: List of text chunks to model.
        method: ``'bertopic'`` or ``'kmeans'``. BERTopic falls back to
            k-means when it raises or finds at most one topic row.
        num_topics: Number of k-means clusters; when omitted it is derived
            from the number of texts (one per five texts, at most 10).

    Returns:
        ``(topics, model, embeddings)`` where ``model`` is the fitted
        BERTopic instance or, after a fallback or for ``'kmeans'``, the
        fitted KMeans instance.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Reject unknown methods before the (expensive) embedding step instead of
    # failing afterwards on an unbound result.
    if method not in ('bertopic', 'kmeans'):
        raise ValueError(
            f"Unknown topic modeling method: {method!r} "
            "(expected 'bertopic' or 'kmeans')"
        )

    # Initialize sentence transformer
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Computing document embeddings...")
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    if method == 'bertopic':
        try:
            # More flexible parameters for technical documents
            topic_model = BERTopic(
                n_gram_range=(1, 3),
                min_topic_size=2,  # Reduced for smaller documents
                verbose=True,
                calculate_probabilities=True,
                nr_topics='auto'  # Let BERTopic determine optimal number
            )
            topics, probs = topic_model.fit_transform(texts, embeddings)

            # More than one row of topic info means BERTopic found distinct topics.
            if len(topic_model.get_topic_info()) > 1:
                return topics, topic_model, embeddings
            print("Insufficient distinct topics found. Trying with K-means...")

        except Exception as e:
            print(f"BERTopic modeling failed: {e}")
            print("Falling back to K-means clustering...")

    # K-means: requested directly, or as the BERTopic fallback.
    requested = num_topics or min(10, len(texts) // 5)  # Adaptive number of topics
    # At least two clusters where possible, but never more clusters than texts.
    n_clusters = max(1, min(len(texts), max(2, requested)))
    clustering_model = KMeans(n_clusters=n_clusters, random_state=42)
    topics = clustering_model.fit_predict(embeddings)

    return topics, clustering_model, embeddings

def save_topic_visualizations(topic_model, output_dir, method='bertopic'):
    """Write the topic-model artefacts for ``method`` into ``output_dir``.

    For ``'bertopic'`` this writes ``topic_info.csv`` and then tries each of
    the three HTML figures independently, so one failing figure does not
    prevent the others. For ``'kmeans'`` it writes the cluster centres to
    ``cluster_centers.csv``. Errors are printed, never raised.
    """
    os.makedirs(output_dir, exist_ok=True)

    def target(filename):
        return os.path.join(output_dir, filename)

    # (model method to call, output file, label used in the failure message)
    figure_specs = (
        ("visualize_topics", "topic_clusters.html", "topic clusters"),
        ("visualize_hierarchy", "topic_hierarchy.html", "topic hierarchy"),
        ("visualize_heatmap", "topic_similarity.html", "topic similarity"),
    )

    try:
        if method == 'kmeans':
            # Basic clustering result: one row per cluster centre.
            centers = pd.DataFrame(topic_model.cluster_centers_)
            centers.to_csv(target("cluster_centers.csv"))
        elif method == 'bertopic':
            topic_model.get_topic_info().to_csv(target("topic_info.csv"))

            for builder_name, filename, label in figure_specs:
                try:
                    figure = getattr(topic_model, builder_name)()
                    figure.write_html(target(filename))
                except Exception as e:
                    print(f"Could not generate {label} visualization: {e}")
    except Exception as e:
        print(f"Error saving visualizations: {e}")

def analyze_pdf(file_path, output_dir="topic_analysis", method='bertopic', num_topics=None, chunk_size=200):
    """Run the full pipeline on one PDF: read, clean, chunk, model, save.

    Args:
        file_path: Path of the PDF to analyse.
        output_dir: Directory that receives the CSV/HTML results.
        method: Requested modelling method (``'bertopic'`` or ``'kmeans'``).
        num_topics: Optional number of clusters, forwarded to the modelling step.
        chunk_size: Chunk size forwarded to ``split_into_chunks``.

    Returns:
        ``(topic_model, topics, chunks)``, or ``None`` when the PDF cannot be
        read, holds fewer than 100 words after preprocessing, or produces no
        chunks.
    """
    print("Reading PDF...")
    text = read_pdf(file_path)

    if not text:
        print("Failed to read PDF document")
        return None

    print("Preprocessing text...")
    processed_text = preprocess_text(text)

    if len(processed_text.split()) < 100:
        print("Warning: Document contains very little text for analysis")
        return None

    print("Splitting text into chunks...")
    chunks = split_into_chunks(processed_text, chunk_size)

    if not chunks:
        print("No valid text chunks found")
        return None

    print(f"Found {len(chunks)} text chunks")

    print(f"Performing topic modeling using {method}...")
    topics, topic_model, embeddings = perform_topic_modeling(
        chunks, method=method, num_topics=num_topics
    )

    # The modelling step may fall back from BERTopic to k-means, so pick the
    # saver branch from the model that was actually returned rather than from
    # the method that was requested.
    actual_method = 'bertopic' if hasattr(topic_model, 'get_topic_info') else 'kmeans'

    print("Saving results...")
    save_topic_visualizations(topic_model, output_dir, actual_method)

    print("Analysis complete!")
    return topic_model, topics, chunks

In [None]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
import re

def preprocess_text(text):
    """Clean and normalise the input text.

    Characters other than ASCII letters, digits, whitespace, ``.`` and ``-``
    are replaced by spaces, whitespace runs are collapsed to a single space,
    and the result is lower-cased and trimmed.

    Args:
        text: Raw input text; ``None`` or ``""`` is accepted.

    Returns:
        The cleaned text, or an empty string for empty input.
    """
    if not text:
        return ""

    # Keep numbers and some special characters for technical documents.
    filtered = re.sub(r'[^a-zA-Z0-9\s\.\-]', ' ', text)

    # Whitespace is collapsed after filtering: the filter inserts spaces of
    # its own, which would otherwise remain as double spaces in the result.
    collapsed = re.sub(r'\s+', ' ', filtered)

    return collapsed.lower().strip()

def split_into_chunks(text, chunk_size=200):
    """Split text into chunks for embedding.

    The text is split on runs of ``.``, ``!`` or ``?`` and the sentences are
    packed into chunks of roughly ``chunk_size`` characters. If fewer than
    five chunks result, overlapping windows of ``chunk_size`` words (moving
    forward half a window at a time) are computed and returned only when
    they are more numerous than the sentence-based chunks.

    Args:
        text: Text to split.
        chunk_size: Target size: characters for sentence packing, words for
            the overlapping fallback.

    Returns:
        A list of non-empty chunk strings (empty for empty text).
    """
    chunks = []
    current_chunk = ""

    # Split on sentence boundaries
    for sentence in re.split(r'[.!?]+', text):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += " " + sentence
            continue

        if current_chunk:
            chunks.append(current_chunk.strip())
        current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    if len(chunks) >= 5:
        return chunks

    # Too few chunks: try overlapping word windows. Both sizes are clamped to
    # 1 so that chunk_size < 2 cannot produce a zero step for range().
    words = text.split()
    window = max(1, chunk_size)
    stride = max(1, chunk_size // 2)
    word_chunks = [
        ' '.join(words[start:start + window])
        for start in range(0, len(words), stride)
    ]

    # Only switch when the fallback yields more chunks; on short text it can
    # otherwise merge several sentence chunks into a single one.
    return word_chunks if len(word_chunks) > len(chunks) else chunks

def perform_topic_modeling(texts, embeddings, method='bertopic', num_topics=None):
    """Group pre-embedded texts into topics.

    Args:
        texts: List of text chunks.
        embeddings: Embeddings for ``texts``, in the same order.
        method: ``'bertopic'`` or ``'kmeans'``. BERTopic is only attempted
            with at least 10 texts; with fewer, or when BERTopic raises or
            finds at most one topic row, k-means is used instead.
        num_topics: Number of k-means clusters; defaults to the number of
            texts, capped at 5.

    Returns:
        ``(topics, model, used_method)`` where ``used_method`` is
        ``'bertopic'`` or ``'kmeans'`` depending on which model produced the
        result.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('bertopic', 'kmeans'):
        raise ValueError(
            f"Unknown topic modeling method: {method!r} "
            "(expected 'bertopic' or 'kmeans')"
        )

    if method == 'bertopic':
        if len(texts) >= 10:  # Only use BERTopic if we have enough text
            try:
                topic_model = BERTopic(
                    n_gram_range=(1, 3),
                    min_topic_size=2,
                    verbose=True,
                    calculate_probabilities=True,
                    nr_topics='auto'
                )
                topics, probs = topic_model.fit_transform(texts, embeddings)

                # More than one row of topic info means distinct topics exist.
                if len(topic_model.get_topic_info()) > 1:
                    return topics, topic_model, 'bertopic'
                print("Insufficient distinct topics found. Trying with K-means...")

            except Exception as e:
                print(f"BERTopic modeling failed: {e}")
                print("Falling back to K-means clustering...")
        else:
            # Previously this case matched neither branch and the function
            # returned None, which the caller then failed to unpack.
            print("Too few text chunks for BERTopic. Using K-means instead...")

    # K-means: requested directly, as fallback, or for small datasets.
    requested = num_topics or min(5, len(texts))  # Reduced number for small datasets
    # At least two clusters where possible, but never more clusters than texts.
    n_clusters = max(1, min(len(texts), max(2, requested)))
    clustering_model = KMeans(n_clusters=n_clusters, random_state=42)
    topics = clustering_model.fit_predict(embeddings)
    return topics, clustering_model, 'kmeans'

def create_kmeans_visualizations(embeddings, topics, output_dir):
    """Plot the k-means clusters in two dimensions and save plot and data.

    The embeddings are projected to two components with PCA, drawn as a
    scatter plot coloured by cluster into ``cluster_visualization.html``,
    and the plotted table is written to ``cluster_data.csv``. ``output_dir``
    must already exist.
    """
    # Project to 2-D purely for display.
    projected = PCA(n_components=2).fit_transform(embeddings)
    cluster_labels = [f'Cluster {label}' for label in topics]

    plot_frame = pd.DataFrame({
        'PC1': projected[:, 0],
        'PC2': projected[:, 1],
        'Cluster': cluster_labels,
    })

    scatter = px.scatter(
        plot_frame,
        x='PC1',
        y='PC2',
        color='Cluster',
        title='Document Clusters Visualization',
    )
    html_path = os.path.join(output_dir, "cluster_visualization.html")
    scatter.write_html(html_path)

    # Keep the plotted points alongside the figure.
    csv_path = os.path.join(output_dir, "cluster_data.csv")
    plot_frame.to_csv(csv_path, index=False)

def save_topic_visualizations(topic_model, topics, embeddings, output_dir, method):
    """Save the result files that match the model actually used.

    ``'bertopic'`` writes ``topic_info.csv`` plus up to three HTML figures,
    each attempted on its own so one failure does not block the rest.
    ``'kmeans'`` delegates to ``create_kmeans_visualizations``. All errors
    are printed instead of raised.
    """
    os.makedirs(output_dir, exist_ok=True)

    def export_figure(make_figure, filename, label):
        # make_figure is called inside the try so that a missing or failing
        # visualisation method is reported per figure.
        try:
            make_figure().write_html(os.path.join(output_dir, filename))
        except Exception as e:
            print(f"Could not generate {label} visualization: {e}")

    try:
        if method == 'bertopic':
            info_path = os.path.join(output_dir, "topic_info.csv")
            topic_model.get_topic_info().to_csv(info_path)

            export_figure(lambda: topic_model.visualize_topics(),
                          "topic_clusters.html", "topic clusters")
            export_figure(lambda: topic_model.visualize_hierarchy(),
                          "topic_hierarchy.html", "topic hierarchy")
            export_figure(lambda: topic_model.visualize_heatmap(),
                          "topic_similarity.html", "topic similarity")
            return

        if method == 'kmeans':
            create_kmeans_visualizations(embeddings, topics, output_dir)

    except Exception as e:
        print(f"Error saving visualizations: {e}")

def analyze_text(text, output_dir="topic_analysis_results", method='bertopic', num_topics=None, chunk_size=200):
    """Run the pipeline on raw text: clean, chunk, embed, model, save.

    Args:
        text: Raw text to analyse.
        output_dir: Directory that receives the CSV/HTML results.
        method: Requested modelling method (``'bertopic'`` or ``'kmeans'``).
        num_topics: Optional number of clusters, forwarded to the modelling step.
        chunk_size: Chunk size forwarded to ``split_into_chunks``.

    Returns:
        ``(model, topics, chunks, final_method)`` where ``final_method`` is
        the method reported by the modelling step, or ``None`` when the text
        has fewer than 50 words after preprocessing, yields no chunks, or
        the modelling step produces no result.
    """
    print("Preprocessing text...")
    processed_text = preprocess_text(text)

    if len(processed_text.split()) < 50:  # Reduced minimum word count
        print("Warning: Text contains very few words for analysis")
        return None

    print("Splitting text into chunks...")
    chunks = split_into_chunks(processed_text, chunk_size)

    if not chunks:
        print("No valid text chunks found")
        return None

    print(f"Found {len(chunks)} text chunks")

    # Initialize sentence transformer
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Computing document embeddings...")
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    print(f"Performing topic modeling using {method}...")
    modeling_result = perform_topic_modeling(chunks, embeddings, method, num_topics)

    # Report a missing result the same way as the other early exits instead
    # of failing with a TypeError while unpacking None.
    if modeling_result is None:
        print("Topic modeling did not produce a result")
        return None

    topics, model, final_method = modeling_result

    print("Saving results...")
    save_topic_visualizations(model, topics, embeddings, output_dir, final_method)

    return model, topics, chunks, final_method

def main():
    """Read pasted text from standard input and run the topic analysis.

    Lines are collected until end-of-input, analysed with BERTopic as the
    requested method, and a short summary plus the list of saved CSV/HTML
    files is printed.
    """
    print("Please paste your text below (press Ctrl+D or Ctrl+Z when finished):")

    # Gather lines until input() signals end-of-input.
    pasted_lines = []
    while True:
        try:
            pasted_lines.append(input())
        except EOFError:
            break
    text = '\n'.join(pasted_lines)

    if not text.strip():
        print("No text was provided")
        return

    output_dir = "topic_analysis_results"
    os.makedirs(output_dir, exist_ok=True)

    results = analyze_text(
        text,
        output_dir=output_dir,
        method='bertopic',
        chunk_size=200
    )
    if not results:
        return

    model, topics, chunks, method = results
    if method != 'bertopic':
        print("\nClustering completed using K-means")
        print(f"Number of clusters: {len(set(topics))}")
    else:
        print("\nTop topics and their keywords:")
        print(model.get_topic_info().head(10))

    # Report which result files are present.
    print("\nAnalysis results saved to directory:", output_dir)
    for entry in os.listdir(output_dir):
        if entry.endswith(('.csv', '.html')):
            print(f"- {entry}")

# Entry point: start the interactive text analysis when this code is executed
# as the main module rather than imported.
if __name__ == "__main__":
    main()

Please paste your text below (press Ctrl+D or Ctrl+Z when finished):
Artificial intelligence is the science and engineering of making intelligent machines, especially intelligent computer programs.1 Within the broader field of artificial intelligence, machine learning is the study of computer algorithms that improve automatically through experience.2 Genomics and machine learning have a shared history dating back nearly a quarter century, with the first applications of machine learning methods on DNA sequence data being reported soon after the beginning of the Human Genome Project. Nowadays, genomics is inherently a data-intensive field of research; in fact, since the advent of next-generation DNA-sequencing methods, truly massive volumes of exome, genome, and transcriptome sequencing data have been generated, often with rich and complex metadata annotations. This rich data landscape, which includes not just sequencing data but additional layers of information such as functional genomi

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Performing topic modeling using bertopic...


TypeError: cannot unpack non-iterable NoneType object

In [None]:
def main():
    """Upload a PDF through Colab, analyse it, and download the results.

    Prompts for an upload, runs ``analyze_pdf`` on the first uploaded file
    with BERTopic as the requested method, prints a short summary, and
    downloads every CSV/HTML file found in the output directory. Returns
    early, with a message, when nothing was uploaded or the analysis
    produced no result.
    """
    print("Please upload a PDF file...")
    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded")
        return

    # Only the first uploaded file is analysed.
    file_name = next(iter(uploaded))
    output_dir = "topic_analysis_results"
    os.makedirs(output_dir, exist_ok=True)

    result = analyze_pdf(
        file_name,
        output_dir=output_dir,
        method='bertopic',
        chunk_size=200  # Reduced chunk size for more granular analysis
    )

    # analyze_pdf returns None for unreadable or too-short documents;
    # unpacking that directly would raise a TypeError.
    if result is None:
        print("Analysis could not be completed; no results to download")
        return

    topic_model, topics, chunks = result

    # A BERTopic model exposes get_topic_info; the k-means fallback does not.
    if hasattr(topic_model, 'get_topic_info'):
        print("\nTop topics and their keywords:")
        topic_info = topic_model.get_topic_info()
        print(topic_info.head(10))
    else:
        print("\nClustering completed using K-means")
        print(f"Number of clusters: {len(set(topics))}")

    # Download available results
    print("\nDownloading analysis results...")
    for file in os.listdir(output_dir):
        if file.endswith(('.csv', '.html')):
            files.download(os.path.join(output_dir, file))

# Entry point: start the PDF upload-and-analyse flow when this code is
# executed as the main module rather than imported.
if __name__ == "__main__":
    main()

Please upload a PDF file...


Saving Long-Term-Follow-Up-After-Admin-Human-GT-Products_Jan_2020.pdf to Long-Term-Follow-Up-After-Admin-Human-GT-Products_Jan_2020 (1).pdf
Reading PDF...
Preprocessing text...
Splitting text into chunks...
Found 486 text chunks
Performing topic modeling using bertopic...
Computing document embeddings...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2024-12-09 17:04:58,284 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-09 17:04:59,561 - BERTopic - Dimensionality - Completed ✓
2024-12-09 17:04:59,563 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-09 17:04:59,709 - BERTopic - Cluster - Completed ✓
2024-12-09 17:04:59,711 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-09 17:04:59,977 - BERTopic - Representation - Completed ✓
2024-12-09 17:04:59,981 - BERTopic - Topic reduction - Reducing number of topics
2024-12-09 17:05:00,338 - BERTopic - Topic reduction - Reduced number of topics from 65 to 43


Saving results...
Analysis complete!

Top topics and their keywords:
   Topic  Count                                               Name  \
0     -1    123                                   -1_the_to_and_of   
1      0    116                                  0_the_ltfu_of_for   
2      1     29  1_retroviral_vectors_retroviral vectors_retrov...   
3      2     15          2_fda_october_guidance_advisory committee   
4      3     13                3_genome_editing_genome editing_the   
5      4     12                   4_new_to_incidence_new incidence   
6      5     12                5_vector_of vector_sites_pattern of   
7      6     10                             6_be_or_studies_target   
8      7     10               7_vector sequences_vector_assay_qpcr   
9      8      9                          8_combined_linked_scid_et   

                                      Representation  \
0  [the, to, and, of, for, in, or, product, that,...   
1  [the, ltfu, of, for, product, to, gt, and, ob

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>