#### Building a app that can help with reading research papers effectively, sometimes researchers have so much to do and have to create time to read research paper, i am creating a tts to help hear reserach paper as audio, like an audio

In [1]:
!pip install pymupdf pdfplumber requests numpy soundfile arxiv kokoro-onnx

Collecting pymupdf

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.11 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.4.2 which is incompatible.
numba 0.62.1 requires numpy<2.4,>=1.22, but you have numpy 2.4.2 which is incompatible.
scipy 1.14.1 requires numpy<2.3,>=1.23.5, but you have numpy 2.4.2 which is incompatible.
unstructured 0.16.11 requires numpy<2, but you have numpy 2.4.2 which is incompatible.

[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading pymupdf-1.27.1-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting arxiv
  Downloading arxiv-2.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting kokoro-onnx
  Downloading kokoro_onnx-0.5.0-py3-none-any.whl.metadata (3.5 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.5.0-py3-none-win_amd64.whl.metadata (68 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting espeakng-loader>=0.2.4 (from kokoro-onnx)
  Downloading espeakng_loader-0.2.4-py3-none-win_amd64.whl.metadata (1.3 kB)
Collecting numpy
  Downloading numpy-2.4.2-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting onnxruntime>=1.20.1 (from kokoro-onnx)
  Downloading onnxruntime-1.24.2-cp311-cp311-win_amd64

In [2]:
import fitz  # This is PyMuPDF - after installing PyMuPDF, import works as 'fitz'
import re
from typing import Dict, List, Tuple
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PaperStructureAnalyzer:
    """Analyzes research paper structure and extracts different elements"""
    
    def __init__(self, config: Dict = None):
        self.config = config or {}
        self.logger = logger
        
    def analyze_paper(self, pdf_path: str) -> Dict:
        """
        Analyze paper and identify sections, equations, tables, figures
        """
        try:
            doc = fitz.open(pdf_path)
            structure = {
                'metadata': self._extract_metadata(doc),
                'sections': [],
                'equations': [],
                'tables': [],
                'figures': [],
                'page_count': len(doc)
            }
            
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()
                
                # Extract equations
                equations = self._extract_equations(text, page_num)
                structure['equations'].extend(equations)
                
                # Extract figures
                figures = self._extract_figures(text, page_num)
                structure['figures'].extend(figures)
                
                self.logger.info(f"Processed page {page_num + 1}/{len(doc)}")
            
            doc.close()
            return structure
            
        except Exception as e:
            self.logger.error(f"Error analyzing paper: {e}")
            raise
    
    def _extract_metadata(self, doc) -> Dict:
        """Extract paper metadata"""
        metadata = doc.metadata
        
        # Try to extract title from first page
        first_page = doc[0].get_text()
        lines = first_page.split('\n')
        title = lines[0] if lines else "Unknown Title"
        
        return {
            'title': title,
            'author': metadata.get('author', 'Unknown'),
            'subject': metadata.get('subject', ''),
            'keywords': metadata.get('keywords', ''),
            'creator': metadata.get('creator', '')
        }
    
    def _extract_equations(self, text: str, page_num: int) -> List[Dict]:
        """Extract mathematical equations"""
        equations = []
        
        # Pattern for LaTeX equations
        patterns = [
            (r'\$\$(.*?)\$\$', 'display'),  # Display math
            (r'\$(.*?)\$', 'inline'),       # Inline math
            (r'\\\[(.*?)\\\]', 'display'),  # LaTeX display
            (r'\\\((.*?)\\\)', 'inline')    # LaTeX inline
        ]
        
        for pattern, eq_type in patterns:
            matches = re.finditer(pattern, text, re.DOTALL)
            for match in matches:
                equations.append({
                    'latex': match.group(1).strip(),
                    'type': eq_type,
                    'page': page_num + 1,
                    'position': match.span()
                })
        
        return equations
    
    def _extract_figures(self, text: str, page_num: int) -> List[Dict]:
        """Extract figure captions"""
        figures = []
        
        # Look for figure captions
        patterns = [
            r'Figure\s+(\d+)[:.]\s*(.*?)(?=Figure\s+\d+|Table\s+\d+|\Z)',
            r'Fig\.?\s*(\d+)[:.]\s*(.*?)(?=Fig\.?|Figure|Table|\Z)'
        ]
        
        for pattern in patterns:
            matches = re.finditer(pattern, text, re.DOTALL | re.IGNORECASE)
            for match in matches:
                figures.append({
                    'number': match.group(1),
                    'caption': match.group(2).strip(),
                    'page': page_num + 1
                })
        
        return figures

# Test the analyzer
if __name__ == "__main__":
    import sys
    
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Test with a sample PDF or prompt user
        pdf_path = input("Enter path to PDF file: ").strip()
    
    try:
        analyzer = PaperStructureAnalyzer()
        result = analyzer.analyze_paper(pdf_path)
        
        print("\n=== Analysis Results ===")
        print(f"Title: {result['metadata']['title']}")
        print(f"Author: {result['metadata']['author']}")
        print(f"Pages: {result['page_count']}")
        print(f"Equations found: {len(result['equations'])}")
        print(f"Figures found: {len(result['figures'])}")
        
        if result['equations']:
            print("\n=== Equations ===")
            for eq in result['equations'][:5]:  # Show first 5
                print(f"Page {eq['page']} [{eq['type']}]: {eq['latex'][:50]}...")
        
        if result['figures']:
            print("\n=== Figures ===")
            for fig in result['figures'][:5]:  # Show first 5
                print(f"Page {fig['page']}: Figure {fig['number']} - {fig['caption'][:50]}...")
                
    except FileNotFoundError:
        print(f"Error: File '{pdf_path}' not found")
    except Exception as e:
        print(f"Error: {e}")

ERROR:__main__:Error analyzing paper: module 'fitz' has no attribute 'open'


Error: module 'fitz' has no attribute 'open'


In [3]:
import sys
!{sys.executable} -m pip list | findstr fitz
!{sys.executable} -m pip list | findstr PyMuPDF

PyMuPDF                   1.23.8
PyMuPDFb                  1.23.7


In [4]:
import sys
import os

print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")
print(f"Current working directory: {os.getcwd()}")
print(f"Script location: {__file__ if '__file__' in dir() else 'Running interactively'}")

print("\n=== Checking PyMuPDF installation ===")

# Check if fitz can be imported
try:
    import fitz
    print(f"✓ fitz imported successfully")
    print(f"  fitz location: {fitz.__file__}")
    print(f"  fitz version: {fitz.version}")
    print(f"  Has 'open' attribute: {'open' in dir(fitz)}")
except ImportError as e:
    print(f"✗ Cannot import fitz: {e}")

# Check all installed packages
import subprocess
result = subprocess.run([sys.executable, "-m", "pip", "list"], 
                       capture_output=True, text=True)
print("\n=== Installed packages with 'fitz' or 'pdf' ===")
for line in result.stdout.split('\n'):
    if any(x in line.lower() for x in ['fitz', 'pymupdf', 'pdf']):
        print(line)

Python executable: c:\Users\user\AppData\Local\Programs\Python\Python311\python.exe
Python version: 3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]
Current working directory: c:\Users\user\Downloads\Project\aipdf
Script location: Running interactively

=== Checking PyMuPDF installation ===
✓ fitz imported successfully
  fitz location: None


AttributeError: module 'fitz' has no attribute 'version'

In [5]:
# In Jupyter Notebook, run this cell
import sys
print(sys.executable)

# Check if fitz works in Jupyter
import fitz
print(f"fitz version: {fitz.version}")
print(f"fitz location: {fitz.__file__}")

# Test open method
doc = fitz.open()
print("✓ fitz.open() works")
doc.close()

c:\Users\user\AppData\Local\Programs\Python\Python311\python.exe


AttributeError: module 'fitz' has no attribute 'version'

In [6]:
import sys
import os

# Print all paths where Python looks for modules
print("Python path:")
for path in sys.path:
    print(f"  {path}")

# Check if fitz exists in any of these paths
import site
print(f"\nSite packages: {site.getsitepackages()}")

# Try to find fitz module location
import importlib.util
spec = importlib.util.find_spec('fitz')
if spec:
    print(f"\nfitz found at: {spec.origin}")
else:
    print("\nfitz not found in Python path")

Python path:
  c:\Users\user\AppData\Local\Programs\Python\Python311\python311.zip
  c:\Users\user\AppData\Local\Programs\Python\Python311\DLLs
  c:\Users\user\AppData\Local\Programs\Python\Python311\Lib
  c:\Users\user\AppData\Local\Programs\Python\Python311
  
  C:\Users\user\AppData\Roaming\Python\Python311\site-packages
  C:\Users\user\AppData\Roaming\Python\Python311\site-packages\win32
  C:\Users\user\AppData\Roaming\Python\Python311\site-packages\win32\lib
  C:\Users\user\AppData\Roaming\Python\Python311\site-packages\Pythonwin
  c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages

Site packages: ['c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311', 'c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages']

fitz found at: None
