# In [None]:
import hashlib
import json
import logging
import os
import re
import threading
import unicodedata
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import fitz  # PyMuPDF

# In [None]:

# Configure the root logger (timestamp - level - message) and create the
# module-level logger used by the rest of this file.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class PDFProcessor:
    """Convert a directory of PDFs into cleaned, de-duplicated text files.

    For every ``*.pdf`` in ``input_dir`` the processor extracts the page text
    with PyMuPDF, cleans it line by line, rejects documents that are too short
    or whose cleaned text was already seen, and writes the rest to
    ``output_dir/<stem>.txt``. Counters describing the run are kept in
    ``self.stats`` and written to ``stats_file`` by :meth:`save_stats`.

    PDFs are processed by a thread pool, so every update of shared state
    (``stats``, ``seen_hashes``) is guarded by an internal lock.
    """

    # Thresholds of the cleaning heuristics; override on a subclass or an
    # instance to tune them.
    MIN_LINE_LENGTH = 10          # cleaned lines shorter than this are dropped
    MIN_PAGE_CHARS = 100          # pages with less stripped text are skipped
    MIN_DOC_CHARS = 500           # cleaned documents shorter than this are rejected
    MAX_SPECIAL_CHAR_RATIO = 0.3  # max share of "unusual" characters in a line

    # Characters that do not count as "special" in the ratio check.
    _PLAIN_PUNCTUATION = frozenset(' .,!?;:()[]{}"-')

    # Typographic characters that NFKD leaves untouched, mapped to ASCII.
    # (Ellipsis and non-breaking space are already decomposed by NFKD.)
    _TYPOGRAPHIC_MAP = str.maketrans({
        '\u2013': '-',   # en dash
        '\u2014': '--',  # em dash
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
    })

    _WHITESPACE_RE = re.compile(r'\s+')
    _NUMERIC_LINE_RE = re.compile(r'^[\d\s\-\.,]+$')

    def __init__(self, input_dir="data", output_dir="txt_data", stats_file="processing_stats.json"):
        """Set up directories, statistics and the compiled cleanup patterns.

        Args:
            input_dir: Directory that is searched (non-recursively) for PDFs.
            output_dir: Directory for the ``.txt`` results; created together
                with any missing parent directories.
            stats_file: Path of the JSON file written by :meth:`save_stats`.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.stats_file = stats_file
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Statistics tracking
        self.stats = {
            'total_files': 0,
            'processed_successfully': 0,
            'failed_files': [],
            'total_chars_before': 0,
            'total_chars_after': 0,
            'duplicates_removed': 0,
            'avg_doc_length': 0
        }

        # For deduplication
        self.seen_hashes = set()

        # Worker threads share stats/seen_hashes; this lock serialises updates.
        self._lock = threading.Lock()
        # Characters actually written to disk (basis for avg_doc_length).
        self._written_chars = 0

        # Patterns describing a whole line; a matching line is dropped.
        # NOTE(review): the header/footer keyword patterns match the keyword
        # anywhere in the line and are therefore quite aggressive.
        self.line_patterns = [
            # Page numbers (various formats)
            r'^\s*\d+\s*$',
            r'^\s*-\s*\d+\s*-\s*$',
            r'^\s*Page\s+\d+\s*$',
            r'^\s*\d+\s+of\s+\d+\s*$',

            # Common headers/footers
            r'^.*(?:proceedings|conference|journal|volume|issue).*$',
            r'^.*(?:copyright|©|\(c\)).*$',
            r'^.*(?:doi:|arxiv:|isbn:).*$',

            # Figure/table references
            r'^\s*(?:figure|fig|table|tab)\.?\s*\d+.*$',
            r'^\s*(?:equation|eq)\.?\s*\d+.*$',

            # Common boilerplate
            r'^\s*(?:abstract|introduction|conclusion|references|bibliography)\s*$',
            r'^\s*(?:this paper|in this work|we present|we propose)\s+.*$',
        ]

        # Patterns describing a span inside a line; only the span is removed.
        self.inline_patterns = [
            # URLs and emails
            r'https?://\S+',
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',

            # Citation patterns
            r'\[[0-9,\s\-]+\]',
            r'\([A-Za-z]+\s+et\s+al\.?,?\s+\d{4}\)',
            r'\([A-Za-z]+\s+\d{4}\)',
        ]

        # Combined views kept for callers that inspect these attributes.
        # Excessive whitespace is collapsed in clean_text rather than matched.
        self.cleanup_patterns = self.line_patterns + self.inline_patterns + [r'\s{3,}']

        flags = re.IGNORECASE | re.MULTILINE
        self.compiled_patterns = [re.compile(pattern, flags)
                                  for pattern in self.cleanup_patterns]
        self._line_res = [re.compile(pattern, flags) for pattern in self.line_patterns]
        # Also swallow whitespace in front of a removed span so that
        # "text [3]." becomes "text." instead of "text .".
        self._inline_res = [re.compile(r'\s*(?:' + pattern + r')', flags)
                            for pattern in self.inline_patterns]

    def normalize_unicode(self, text):
        """Return *text* reduced to ASCII plus non-ASCII letters and digits.

        The text is NFKD-normalised (ligatures, ellipses and non-breaking
        spaces decompose to plain characters), typographic dashes and quotes
        are mapped to ASCII, and any remaining non-ASCII character that is not
        alphanumeric is removed. Combining marks fall into that last group, so
        accented letters lose their accents.
        """
        text = unicodedata.normalize('NFKD', text)
        text = text.translate(self._TYPOGRAPHIC_MAP)
        return ''.join(char for char in text if ord(char) < 128 or char.isalnum())

    def _clean_line(self, line):
        """Return the cleaned form of one line, or ``None`` if it is dropped."""
        line = line.strip()
        if not line:
            return None

        # Whole-line artifacts: page numbers, headers/footers, captions, ...
        if any(pattern.search(line) for pattern in self._line_res):
            return None

        # Remove URLs, e-mail addresses and citation markers but keep the
        # surrounding sentence.
        for pattern in self._inline_res:
            line = pattern.sub('', line)

        line = self._WHITESPACE_RE.sub(' ', line).strip()

        # Very short lines are likely artifacts; the emptiness test also
        # protects the ratio below from dividing by zero.
        if not line or len(line) < self.MIN_LINE_LENGTH:
            return None

        # Lines dominated by unusual characters are likely formatting debris.
        special_count = sum(
            1 for c in line
            if not c.isalnum() and c not in self._PLAIN_PUNCTUATION
        )
        if special_count / len(line) > self.MAX_SPECIAL_CHAR_RATIO:
            return None

        # Lines made only of digits and separators (page numbers, table rows).
        if self._NUMERIC_LINE_RE.match(line):
            return None

        return line

    def clean_text(self, text):
        """Clean *text* line by line and return the surviving lines.

        Lines are stripped, whole-line artifacts are dropped, inline URLs,
        e-mail addresses and citation markers are removed, whitespace is
        collapsed, and lines that end up too short, too noisy or purely
        numeric are discarded. Adds the input and output lengths to
        ``stats['total_chars_before']`` / ``stats['total_chars_after']``.

        Returns:
            The cleaned lines joined by single newlines (possibly ``''``).
        """
        original_length = len(text)

        text = self.normalize_unicode(text)

        cleaned_lines = []
        for raw_line in text.split('\n'):
            line = self._clean_line(raw_line)
            if line is not None:
                cleaned_lines.append(line)

        cleaned_text = '\n'.join(cleaned_lines)

        with self._lock:
            self.stats['total_chars_before'] += original_length
            self.stats['total_chars_after'] += len(cleaned_text)

        return cleaned_text

    def is_duplicate(self, text):
        """Return True if an identical *text* was seen before.

        The MD5 digest of the text is remembered in ``seen_hashes``; a repeat
        increments ``stats['duplicates_removed']``. Check and insert happen
        under the lock so two threads cannot both register the same text as
        new.
        """
        text_hash = hashlib.md5(text.encode()).hexdigest()

        with self._lock:
            if text_hash in self.seen_hashes:
                self.stats['duplicates_removed'] += 1
                return True
            self.seen_hashes.add(text_hash)
            return False

    def _record_failure(self, name):
        """Append *name* to ``stats['failed_files']`` under the lock."""
        with self._lock:
            self.stats['failed_files'].append(str(name))

    def extract_text_from_pdf(self, pdf_path):
        """Extract and clean the text of one PDF.

        Args:
            pdf_path: ``pathlib.Path`` of the PDF to read.

        Returns:
            The cleaned text, or ``None`` when the document is too short
            after cleaning, duplicates an earlier document, or could not be
            processed (in which case the file name is recorded in
            ``stats['failed_files']``).
        """
        try:
            with fitz.open(pdf_path) as doc:
                page_texts = (page.get_text() for page in doc)
                # Skip pages with very little text (covers, purely visual pages).
                text_blocks = [
                    page_text for page_text in page_texts
                    if len(page_text.strip()) >= self.MIN_PAGE_CHARS
                ]

            cleaned_text = self.clean_text('\n'.join(text_blocks))

            # Quality check first, so that empty or tiny documents are not
            # registered as (or mistaken for) duplicates of each other.
            if len(cleaned_text) < self.MIN_DOC_CHARS:
                logger.warning(f"Document too short after cleaning: {pdf_path.name}")
                return None

            if self.is_duplicate(cleaned_text):
                logger.warning(f"Duplicate content detected in {pdf_path.name}")
                return None

            return cleaned_text

        except Exception as e:
            logger.error(f"Error processing {pdf_path.name}: {e}")
            self._record_failure(pdf_path.name)
            return None

    def process_single_pdf(self, pdf_path):
        """Extract one PDF and write its cleaned text to ``output_dir``.

        Returns:
            True if ``<stem>.txt`` was written, False if the document was
            rejected or an error occurred.
        """
        try:
            logger.info(f"Processing: {pdf_path.name}")

            cleaned_text = self.extract_text_from_pdf(pdf_path)
            if cleaned_text is None:
                return False

            output_filename = pdf_path.stem + '.txt'
            output_path = self.output_dir / output_filename
            output_path.write_text(cleaned_text, encoding='utf-8')

            logger.info(f"Successfully processed: {pdf_path.name} -> {output_filename}")
            with self._lock:
                self.stats['processed_successfully'] += 1
                self._written_chars += len(cleaned_text)

            return True

        except Exception as e:
            logger.error(f"Failed to process {pdf_path.name}: {e}")
            self._record_failure(pdf_path.name)
            return False

    def process_all_pdfs(self, max_workers=4):
        """Process every ``*.pdf`` in ``input_dir`` using a thread pool.

        Args:
            max_workers: Number of worker threads.
        """
        pdf_files = sorted(self.input_dir.glob('*.pdf'))
        self.stats['total_files'] = len(pdf_files)

        if not pdf_files:
            logger.warning(f"No PDF files found in {self.input_dir}")
            return

        logger.info(f"Found {len(pdf_files)} PDF files to process")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_pdf = {executor.submit(self.process_single_pdf, pdf_path): pdf_path
                             for pdf_path in pdf_files}

            for future in as_completed(future_to_pdf):
                pdf_path = future_to_pdf[future]
                try:
                    future.result()
                except Exception as e:
                    # Safety net: process_single_pdf handles its own errors.
                    logger.error(f"Error processing {pdf_path.name}: {e}")
                    self._record_failure(pdf_path.name)

    def save_stats(self):
        """Compute the average document length, write the stats JSON, log a summary.

        ``avg_doc_length`` is the mean length of the documents actually
        written; it stays 0 when nothing was written.
        """
        written_docs = self.stats['processed_successfully']
        if written_docs > 0:
            self.stats['avg_doc_length'] = self._written_chars / written_docs

        with open(self.stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2)

        logger.info(f"Processing complete! Stats saved to {self.stats_file}")
        logger.info(f"Successfully processed: {self.stats['processed_successfully']}/{self.stats['total_files']}")
        logger.info(f"Failed files: {len(self.stats['failed_files'])}")
        logger.info(f"Duplicates removed: {self.stats['duplicates_removed']}")
        logger.info(f"Text reduction: {self.stats['total_chars_before']} -> {self.stats['total_chars_after']} chars")
        logger.info(f"Average document length: {self.stats['avg_doc_length']:.0f} characters")



# In [None]:
def main():
    """Run the PDF-to-text pipeline with this script's default settings.

    Reads PDFs from ``papers``, writes cleaned text to ``txt_data`` using
    eight worker threads, then saves the run statistics.
    """
    input_dir, output_dir = "papers", "txt_data"
    worker_count = 8  # Adjust based on your system

    pipeline = PDFProcessor(input_dir, output_dir)
    pipeline.process_all_pdfs(max_workers=worker_count)
    pipeline.save_stats()


if __name__ == "__main__":
    main()