In [2]:
!pip install Pypdf tools

Collecting Pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tools
  Downloading tools-0.1.9.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytils (from tools)
  Downloading pytils-0.4.3.tar.gz (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.4/101.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: tools, pytils
  Building wheel for tools (setup.py) ... [?25l[?25hdone
  Created wheel for tools: filename=tools-0.1.9-py3-none-any.whl size=46730 sha256=717a6d24669aa035d3422d21dee109da0a4681d186a50d5251fab1f0c23b84b4
 

In [17]:
import os
import logging
import re
import string
import random
from collections import Counter
from typing import List, Dict, Tuple, Any, Optional
import traceback

# --- PDF Handling Library ---
try:
    from pypdf import PdfReader
    logging.info("Successfully imported pypdf.")
except ImportError:
    logging.error("Failed to import 'pypdf'. Please install it: pip install pypdf")
    # Define a dummy class if import fails to allow script structure check
    class PdfReader:
        def __init__(self, *args): raise ImportError("pypdf not found")
        pages = []
        metadata = None
        is_encrypted = False
    # exit() # Use exit() in a real script

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# --- PDF Handling Functions (Independent) ---

def get_pdf_file(pdf_folder: str, mode: str = "latest") -> str:
    """
    Selects a PDF file path from a specified folder based on the mode.
    (Standalone version)
    """
    if not os.path.exists(pdf_folder):
        try:
            os.makedirs(pdf_folder, exist_ok=True)
            logging.warning(f"PDF folder '{pdf_folder}' did not exist and was created.")
            raise FileNotFoundError(f"PDF folder '{pdf_folder}' was created, but no PDFs found.")
        except OSError as e:
            logging.error(f"Failed to create PDF folder '{pdf_folder}': {e}")
            raise FileNotFoundError(f"PDF folder '{pdf_folder}' does not exist and could not be created.")

    try:
        pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_folder, f))]
    except OSError as e:
        logging.error(f"Error listing files in folder '{pdf_folder}': {e}")
        raise FileNotFoundError(f"Could not access files in the PDF folder '{pdf_folder}'.")

    if not pdf_files:
        logging.warning(f"No PDF files found in folder: {pdf_folder}")
        raise FileNotFoundError(f"No PDF files found in {pdf_folder}")

    logging.info(f"Found {len(pdf_files)} PDF file(s) in '{pdf_folder}'.")
    selected_file_path = None

    if mode == "latest":
        try:
            latest_file = max(pdf_files, key=lambda f: os.path.getmtime(os.path.join(pdf_folder, f)))
            selected_file_path = os.path.join(pdf_folder, latest_file)
            logging.info(f"Selected latest PDF: {latest_file}")
        except Exception as e:
            logging.error(f"Error determining latest file: {e}")
            raise
    elif mode == "random":
        random_file = random.choice(pdf_files)
        selected_file_path = os.path.join(pdf_folder, random_file)
        logging.info(f"Selected random PDF: {random_file}")
    elif mode == "interactive":
        # (Interactive code omitted for brevity, can be added back if needed)
        logging.warning("Interactive mode selection not fully implemented in this snippet.")
        # Fallback to random if interactive part is omitted
        random_file = random.choice(pdf_files)
        selected_file_path = os.path.join(pdf_folder, random_file)
        logging.info(f"Selected random PDF (fallback): {random_file}")
    else:
        raise ValueError(f"Invalid mode: '{mode}'. Must be 'latest', 'random', or 'interactive'")

    if selected_file_path is None: raise RuntimeError("Failed to select a PDF file.")
    return selected_file_path


def extract_full_text_metadata_pypdf(pdf_path: str) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
    """
    Extracts the full text content and metadata from a PDF using pypdf.
    (Standalone version)
    """
    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found: {pdf_path}")
        return None, None
    if not os.path.isfile(pdf_path):
        logging.error(f"Path exists but is not a file: {pdf_path}")
        return None, None

    try:
        logging.info(f"Opening PDF: {os.path.basename(pdf_path)} using pypdf")
        reader = PdfReader(pdf_path)
        metadata = reader.metadata
        num_pages = len(reader.pages)

        extracted_metadata = {
            "title": getattr(metadata, 'title', os.path.basename(pdf_path)),
            "author": getattr(metadata, 'author', "Unknown"),
            "subject": getattr(metadata, 'subject', ""),
            "creator": getattr(metadata, 'creator', ""),
            "producer": getattr(metadata, 'producer', ""),
            "page_count": num_pages,
            "is_encrypted": reader.is_encrypted,
            "file_name": os.path.basename(pdf_path),
            "file_path": pdf_path
        }

        logging.info(f"Extracting text from {num_pages} page(s)...")
        full_text = ""
        for page_num, page in enumerate(reader.pages):
            try:
                 text = page.extract_text()
                 if text:
                     full_text += text + "\n" # Add single newline between pages
                 else:
                      logging.warning(f"No text extracted from page {page_num + 1}.")
            except Exception as page_error:
                 logging.warning(f"Could not extract text from page {page_num + 1}: {page_error}")

        logging.info(f"Successfully extracted {len(full_text)} characters from PDF.")
        full_text = full_text.replace('\x00', '').strip() # Remove null chars and trim ends

        # Simple preprocessing: fix hyphenation and normalize whitespace
        full_text = re.sub(r'(\w)-\n(\w)', r'\1\2', full_text)
        full_text = re.sub(r'\s+', ' ', full_text) # Normalize all whitespace to single space

        return full_text, extracted_metadata

    except ImportError:
         logging.error("pypdf library is required but not installed.")
         return None, None
    except Exception as e:
        logging.error(f"Error extracting text from PDF '{os.path.basename(pdf_path)}' using pypdf: {e}")
        logging.error(traceback.format_exc())
        return None, None


# --- EDA Functions (Operating on Full Text) ---

STOP_WORDS = set([
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
    'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
    'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
    "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
    'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such',
    'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
    'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
    # Domain specific
    'et', 'al', 'nber', 'working', 'paper', 'series', 'figure', 'table', 'http', 'org', 'www', 'abstract', 'doi', 'appendix',
    'university', 'research', 'data', 'results', 'analysis', 'study', 'based', 'using', 'also', 'however', 'within', 'whether'
])

def calculate_basic_stats(full_text: str) -> Dict[str, Any]:
    """Calculates basic statistics from the full text."""
    stats = {}
    if not full_text: return {"error": "Input text is empty"}
    try:
        words = full_text.split()
        stats['word_count'] = len(words)
        # Simple sentence split - may be inaccurate with abbreviations etc.
        sentences = re.split(r'[.?!]\s+', full_text) # Split on .?! followed by space
        stats['sentence_count'] = len([s for s in sentences if s.strip()])
        if stats['sentence_count'] > 0:
            stats['avg_sentence_length_words'] = round(stats['word_count'] / stats['sentence_count'], 2)
        else:
            stats['avg_sentence_length_words'] = stats['word_count']
        stats['character_count'] = len(full_text)
        logging.info("Calculated basic text statistics.")
        return stats
    except Exception as e:
        logging.error(f"Error calculating basic stats: {e}")
        return {"error": f"Failed to calculate stats: {e}"}

def get_word_frequency(full_text: str, num_words: int = 25) -> List[Tuple[str, int]]:
    """Calculates frequency of significant words in the full text."""
    if not full_text: return []
    try:
        text = full_text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\d+', '', text) # Remove numbers
        words = text.split()
        filtered_words = [word for word in words if word not in STOP_WORDS and len(word) > 2]
        word_counts = Counter(filtered_words)
        most_common = word_counts.most_common(num_words)
        logging.info(f"Calculated word frequencies, found top {len(most_common)} words.")
        return most_common
    except Exception as e:
        logging.error(f"Error calculating word frequency: {e}")
        return []

def find_potential_proper_nouns(full_text: str, min_freq: int = 3) -> List[Tuple[str, int]]:
    """Finds potential proper nouns using capitalization heuristic."""
    if not full_text: return []
    potential_nouns = Counter()
    try:
        # Use regex to find capitalized words that are not at the start of a line
        # This is still a very rough heuristic.
        # Matches words starting with an uppercase letter, followed by lowercase,
        # NOT preceded by sentence-ending punctuation and space, or start of text.
        pattern = r"(?<![\.\?!]\s)(?<!^)\b([A-Z][a-z]+(?:[-'][A-Z][a-z]+)*)\b"
        matches = re.findall(pattern, full_text)
        potential_nouns.update(matches)

        frequent_nouns = [(noun, freq) for noun, freq in potential_nouns.items() if freq >= min_freq and noun.lower() not in STOP_WORDS]
        frequent_nouns.sort(key=lambda x: x[1], reverse=True)
        logging.info(f"Found {len(frequent_nouns)} potential proper nouns (heuristic) with min frequency {min_freq}.")
        return frequent_nouns
    except Exception as e:
        logging.error(f"Error finding potential proper nouns: {e}")
        return []

def find_common_headers(full_text: str) -> Dict[str, int]:
    """Counts occurrences of common academic paper headers."""
    if not full_text: return {}
    headers = {
        "Abstract": 0, "Introduction": 0, "Method": 0, "Methodology": 0,
        "Data": 0, "Results": 0, "Discussion": 0, "Conclusion": 0,
        "References": 0, "Appendix": 0
    }
    found_headers = {}
    try:
        # Case-insensitive search for headers at the start of a line (potentially with numbers/whitespace)
        for header in headers.keys():
             # Regex: start of line, optional whitespace/numbering, header text, optional colon, whitespace/newline end
            pattern = re.compile(r"^\s*(?:\d+\.?\s*)?" + re.escape(header) + r"\s*:?\s*$", re.IGNORECASE | re.MULTILINE)
            matches = pattern.findall(full_text)
            count = len(matches)
            if count > 0:
                found_headers[header] = count
        logging.info(f"Checked for common headers. Found: {found_headers}")
        return found_headers
    except Exception as e:
        logging.error(f"Error finding common headers: {e}")
        return {}

# --- Main EDA Execution Block ---
if __name__ == "__main__":
    logging.info("Standalone EDA Script execution started.")

    # <<< --- CONFIGURATION --- >>>
    PDF_FOLDER = "my_pdfs"  # IMPORTANT: Change this
    SELECTION_MODE = "random" # "latest", "random" (interactive needs more code)
    TOP_N_WORDS = 30
    MIN_PROPER_NOUN_FREQ = 4
    # <<< --- END CONFIGURATION --- >>>

    try:
        # Step 1: Select PDF
        pdf_path = get_pdf_file(PDF_FOLDER, mode=SELECTION_MODE)

        # Step 2: Extract Full Text & Metadata
        full_text, metadata = extract_full_text_metadata_pypdf(pdf_path)

        if full_text and metadata:
            print("\n" + "=" * 60)
            print(f"Running Standalone EDA for: {metadata.get('file_name', 'N/A')}")
            print("=" * 60)

            # Step 3: Basic Stats Analysis
            print("\n--- Basic Text Statistics ---")
            stats = calculate_basic_stats(full_text)
            if 'error' in stats: print(f"  Error: {stats['error']}")
            else:
                for key, value in stats.items(): print(f"  {key.replace('_', ' ').capitalize():<30}: {value}")
            print("-" * 60)

            # Step 4: Word Frequency (Potential Topics)
            print(f"\n--- Top {TOP_N_WORDS} Frequent Words (Potential Topics) ---")
            top_words = get_word_frequency(full_text, num_words=TOP_N_WORDS)
            if top_words:
                col_width = 20
                for i in range(0, len(top_words), 2):
                     word1, freq1 = top_words[i]
                     entry1 = f"{word1}: {freq1}"
                     entry2 = ""
                     if i + 1 < len(top_words):
                         word2, freq2 = top_words[i+1]
                         entry2 = f"{word2}: {freq2}"
                     print(f"  {entry1:<{col_width}} {entry2:<{col_width}}")
            else: print("  Could not calculate word frequencies.")
            print("-" * 60)

            # Step 5: Potential Proper Nouns (Characters/Entities - HEURISTIC)
            print(f"\n--- Potential Proper Nouns (Frequency >= {MIN_PROPER_NOUN_FREQ}) ---")
            print("  (Warning: Basic heuristic, may include errors. Use NER for accuracy)")
            potential_names = find_potential_proper_nouns(full_text, min_freq=MIN_PROPER_NOUN_FREQ)
            if potential_names:
                col_width = 25
                for i in range(0, len(potential_names), 2):
                     name1, freq1 = potential_names[i]
                     entry1 = f"{name1}: {freq1}"
                     entry2 = ""
                     if i + 1 < len(potential_names):
                         name2, freq2 = potential_names[i+1]
                         entry2 = f"{name2}: {freq2}"
                     print(f"  {entry1:<{col_width}} {entry2:<{col_width}}")
            else: print(f"  No potential proper nouns found with frequency >= {MIN_PROPER_NOUN_FREQ}.")
            print("-" * 60)

            # Step 6: Common Header Check
            print("\n--- Common Header Check ---")
            found_headers = find_common_headers(full_text)
            if found_headers:
                 for header, count in found_headers.items(): print(f"  Found '{header}': {count} time(s)")
            else: print("  No common headers (Abstract, Introduction, etc.) found matching patterns.")
            print("-" * 60)

        else:
            print("\n" + "=" * 60)
            print("EDA Failed: Could not extract text from the selected PDF.")
            print("=" * 60)

    except FileNotFoundError as e:
        logging.error(f"File/Folder Error: {e}")
        print(f"\nERROR: {e}")
    except ValueError as e:
        logging.error(f"Configuration or Input Error: {e}")
        print(f"\nERROR: {e}")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)
        print(f"\nUNEXPECTED ERROR: {e}")

    logging.info("Standalone EDA Script execution finished.")


Running Standalone EDA for: w27392.pdf

--- Basic Text Statistics ---
  Word count                    : 9930
  Sentence count                : 399
  Avg sentence length words     : 24.89
  Character count               : 62068
------------------------------------------------------------

--- Top 30 Frequent Words (Potential Topics) ---
  covid: 135           students: 117       
  eﬀects: 63           outcomes: 55        
  ∗∗∗: 55              health: 51          
  treatment: 48        economic: 37        
  online: 35           due: 34             
  graduation: 34       job: 34             
  student: 31          likely: 31          
  income: 28           proxies: 27         
  survey: 26           eﬀect: 25           
  honors: 25           average: 25         
  pandemic: 24         expected: 24        
  lost: 24             state: 22           
  academic: 22         sample: 22          
  expectations: 20     shocks: 20          
  major: 20            gpa: 20             
-