In [1]:
# Install required packages
!pip install numpy==1.24.3
!pip install --upgrade pip

# Document processing packages
!pip install PyPDF2 python-docx pytesseract pillow PyMuPDF

# NLP and ML packages
!pip install spacy transformers sentence-transformers
!pip install nltk textstat

# Web framework
!pip install streamlit pandas

# Download spaCy language model
!python -m spacy download en_core_web_sm

print("✅ All packages installed successfully!")

Collecting numpy==1.24.3
  Using cached numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (5.6 kB)
Using cached numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.24.3
Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (63 kB)
Using cached numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.3 MB)
Installing collected packag

In [2]:
import os
import json
import csv
import datetime
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Document processing
import PyPDF2
import fitz  # PyMuPDF
from docx import Document
import pytesseract
from PIL import Image

# NLP and ML
import spacy
import nltk
from nltk.corpus import stopwords
from transformers import pipeline
import textstat

# Data processing
import pandas as pd
import numpy as np

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [3]:
class MetadataGenerator:
    def __init__(self, quick_start=False):
        """
        Initialize the Metadata Generator
        
        Args:
            quick_start (bool): If True, skip heavy ML model loading
        """
        self.quick_start = quick_start
        self.nlp = None
        self.summarizer = None
        
        if not quick_start:
            self._load_models()
        
        print(f"✅ MetadataGenerator initialized ({'Quick Start' if quick_start else 'Full System'} mode)")
    
    def _load_models(self):
        """Load NLP models with error handling"""
        try:
            # Load spaCy model
            self.nlp = spacy.load("en_core_web_sm")
            print("✅ spaCy model loaded")
        except Exception as e:
            print(f"⚠️ Could not load spaCy model: {e}")
            self.nlp = None
        
        try:
            # Load summarization pipeline
            self.summarizer = pipeline("summarization", 
                                     model="facebook/bart-large-cnn",
                                     max_length=150, 
                                     min_length=50)
            print("✅ Summarization model loaded")
        except Exception as e:
            print(f"⚠️ Could not load summarization model: {e}")
            self.summarizer = None
    
    def extract_text_from_pdf(self, file_path):
        """Extract text from PDF with OCR fallback"""
        text = ""
        
        try:
            # Try PyMuPDF first
            doc = fitz.open(file_path)
            for page in doc:
                text += page.get_text()
            doc.close()
            
            if text.strip():
                return text
        except Exception as e:
            print(f"PyMuPDF extraction failed: {e}")
        
        try:
            # Fallback to PyPDF2
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text()
            
            if text.strip():
                return text
        except Exception as e:
            print(f"PyPDF2 extraction failed: {e}")
        
        # OCR fallback for image-based PDFs
        try:
            doc = fitz.open(file_path)
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text += pytesseract.image_to_string(img)
            doc.close()
            print("✅ OCR extraction completed")
        except Exception as e:
            print(f"⚠️ OCR extraction failed: {e}")
        
        return text
    
    def extract_text_from_docx(self, file_path):
        """Extract text from DOCX file"""
        try:
            doc = Document(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text
        except Exception as e:
            print(f"⚠️ DOCX extraction failed: {e}")
            return ""
    
    def extract_text_from_file(self, file_path):
        """Extract text from various file formats"""
        file_extension = Path(file_path).suffix.lower()
        
        if file_extension == '.pdf':
            return self.extract_text_from_pdf(file_path)
        elif file_extension == '.docx':
            return self.extract_text_from_docx(file_path)
        elif file_extension in ['.txt', '.md']:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

In [6]:
def classify_document_type(self, text):
        """Classify document type based on content"""
        text_lower = text.lower()
        
        # Simple rule-based classification
        if any(word in text_lower for word in ['agreement', 'contract', 'terms', 'legal']):
            return 'Legal'
        elif any(word in text_lower for word in ['report', 'analysis', 'findings', 'results']):
            return 'Report'
        elif any(word in text_lower for word in ['manual', 'guide', 'instructions', 'how to']):
            return 'Manual'
        elif any(word in text_lower for word in ['proposal', 'request', 'rfp', 'bid']):
            return 'Proposal'
        elif any(word in text_lower for word in ['research', 'study', 'methodology', 'hypothesis']):
            return 'Research'
        else:
            return 'Document'
    
def extract_entities(self, text):
        """Extract named entities from text"""
        if not self.nlp:
            return {}
        
        try:
            doc = self.nlp(text[:1000000])  # Limit text size for performance
            entities = {}
            
            for ent in doc.ents:
                if ent.label_ not in entities:
                    entities[ent.label_] = []
                if ent.text not in entities[ent.label_]:
                    entities[ent.label_].append(ent.text)
            
            return entities
        except Exception as e:
            print(f"⚠️ Entity extraction failed: {e}")
            return {}
    
def generate_summary(self, text):
        """Generate document summary"""
        if not self.summarizer or len(text) < 100:
            # Fallback: return first few sentences
            sentences = text.split('.')[:3]
            return '. '.join(sentences) + '.'
        
        try:
            # Chunk text if too long
            max_chunk_length = 1024
            if len(text) > max_chunk_length:
                text = text[:max_chunk_length]
            
            summary = self.summarizer(text)[0]['summary_text']
            return summary
        except Exception as e:
            print(f"⚠️ AI summarization failed: {e}")
            # Fallback summary
            sentences = text.split('.')[:3]
            return '. '.join(sentences) + '.'
    
def extract_key_topics(self, text):
        """Extract key topics and themes"""
        try:
            # Simple keyword extraction
            words = text.lower().split()
            stop_words = set(stopwords.words('english'))
            
            # Filter out stop words and short words
            filtered_words = [word for word in words 
                            if word not in stop_words and len(word) > 3]
            
            # Count word frequency
            word_freq = {}
            for word in filtered_words:
                word_freq[word] = word_freq.get(word, 0) + 1
            
            # Get top 10 most frequent words as topics
            top_topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
            return [topic[0] for topic in top_topics]
        
        except Exception as e:
            print(f"⚠️ Topic extraction failed: {e}")
            return []

# Add the methods to the MetadataGenerator class
MetadataGenerator.classify_document_type = classify_document_type
MetadataGenerator.extract_entities = extract_entities
MetadataGenerator.generate_summary = generate_summary
MetadataGenerator.extract_key_topics = extract_key_topics

print("✅ Advanced NLP methods added to MetadataGenerator class")

✅ Advanced NLP methods added to MetadataGenerator class


In [7]:
def generate_metadata(self, file_path_or_text, is_text=False):
    """
    Generate comprehensive metadata for a document
    
    Args:
        file_path_or_text (str): File path or raw text
        is_text (bool): If True, treat input as raw text
    
    Returns:
        dict: Comprehensive metadata
    """
    start_time = datetime.datetime.now()
    
    try:
        # Extract text
        if is_text:
            text = file_path_or_text
            filename = "direct_text_input"
            file_size = len(text.encode('utf-8'))
            file_type = "text"
        else:
            text = self.extract_text_from_file(file_path_or_text)
            file_path = Path(file_path_or_text)
            filename = file_path.name
            file_size = file_path.stat().st_size
            file_type = file_path.suffix
        
        if not text.strip():
            raise ValueError("No text could be extracted from the document")
        
        # Basic content analysis
        word_count = len(text.split())
        char_count = len(text)
        
        # Readability analysis
        try:
            readability_score = textstat.flesch_reading_ease(text)
            reading_level = textstat.flesch_kincaid_grade(text)
        except:
            readability_score = 0
            reading_level = 0
        
        # Advanced analysis (if not in quick start mode)
        if not self.quick_start:
            document_type = self.classify_document_type(text)
            summary = self.generate_summary(text)
            entities = self.extract_entities(text)
            key_topics = self.extract_key_topics(text)
        else:
            document_type = "Document"
            summary = text[:200] + "..." if len(text) > 200 else text
            entities = {}
            key_topics = []
        
        # Calculate processing time
        processing_time = (datetime.datetime.now() - start_time).total_seconds()
        
        # Calculate confidence score
        confidence = self._calculate_confidence_score(text, word_count)
        
        # Compile metadata
        metadata = {
            "basic_info": {
                "filename": filename,
                "file_type": file_type,
                "file_size": file_size,
                "processing_date": datetime.datetime.now().isoformat(),
                "processing_time_seconds": round(processing_time, 2)
            },
            "content_analysis": {
                "document_type": document_type,
                "word_count": word_count,
                "character_count": char_count,
                "readability_score": round(readability_score, 2),
                "reading_level": round(reading_level, 2),
                "estimated_reading_time_minutes": round(word_count / 200, 1)  # Average reading speed
            },
            "semantic_data": {
                "summary": summary,
                "key_topics": key_topics,
                "entities": entities,
                "language": "en"  # Could be enhanced with language detection
            },
            "technical_metadata": {
                "extraction_method": "automated",
                "confidence_score": confidence,
                "processing_mode": "quick_start" if self.quick_start else "full_system"
            }
        }
        
        return metadata
        
    except Exception as e:
        print(f"❌ Error processing document: {e}")
        return {
            "error": str(e),
            "processing_date": datetime.datetime.now().isoformat()
        }

def _calculate_confidence_score(self, text, word_count):
    """Calculate confidence score based on text quality"""
    score = 0.5  # Base score
    
    # Adjust based on text length
    if word_count > 100:
        score += 0.2
    if word_count > 500:
        score += 0.1
    
    # Adjust based on text quality indicators
    if len(text.split('.')) > 5:  # Multiple sentences
        score += 0.1
    
    # Penalize for potential OCR issues
    special_char_ratio = sum(1 for c in text if not c.isalnum() and c not in ' .,!?;:') / len(text)
    if special_char_ratio > 0.1:
        score -= 0.2
    
    return max(0.1, min(1.0, score))  # Clamp between 0.1 and 1.0

# Add methods to the class
MetadataGenerator.generate_metadata = generate_metadata
MetadataGenerator._calculate_confidence_score = _calculate_confidence_score

print("✅ Main metadata generation method added")

✅ Main metadata generation method added


In [8]:
def save_metadata_to_json(metadata, output_path):
    """Save metadata to JSON file"""
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    print(f"✅ Metadata saved to {output_path}")

def save_metadata_to_csv(metadata_list, output_path):
    """Save multiple metadata records to CSV"""
    if not metadata_list:
        print("⚠️ No metadata to save")
        return
    
    # Flatten metadata for CSV
    flattened_data = []
    for metadata in metadata_list:
        if 'error' in metadata:
            continue
            
        flat_record = {
            'filename': metadata['basic_info']['filename'],
            'file_type': metadata['basic_info']['file_type'],
            'file_size': metadata['basic_info']['file_size'],
            'processing_date': metadata['basic_info']['processing_date'],
            'document_type': metadata['content_analysis']['document_type'],
            'word_count': metadata['content_analysis']['word_count'],
            'character_count': metadata['content_analysis']['character_count'],
            'readability_score': metadata['content_analysis']['readability_score'],
            'reading_level': metadata['content_analysis']['reading_level'],
            'summary': metadata['semantic_data']['summary'][:100] + '...',  # Truncate for CSV
            'key_topics': ', '.join(metadata['semantic_data']['key_topics'][:5]),  # Top 5 topics
            'confidence_score': metadata['technical_metadata']['confidence_score']
        }
        flattened_data.append(flat_record)
    
    df = pd.DataFrame(flattened_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Metadata CSV saved to {output_path}")

def process_single_file(generator, file_path):
    """Process a single file and return metadata"""
    print(f"📄 Processing: {file_path}")
    metadata = generator.generate_metadata(file_path)
    
    if 'error' not in metadata:
        print(f"✅ Successfully processed: {file_path}")
        print(f"   Document Type: {metadata['content_analysis']['document_type']}")
        print(f"   Word Count: {metadata['content_analysis']['word_count']}")
        print(f"   Confidence Score: {metadata['technical_metadata']['confidence_score']}")
    else:
        print(f"❌ Failed to process: {file_path}")
    
    return metadata

def process_batch_files(generator, folder_path):
    """Process all supported files in a folder"""
    supported_extensions = ['.pdf', '.docx', '.txt', '.md']
    folder = Path(folder_path)
    
    if not folder.exists():
        print(f"❌ Folder not found: {folder_path}")
        return []
    
    files = [f for f in folder.iterdir() 
             if f.is_file() and f.suffix.lower() in supported_extensions]
    
    if not files:
        print(f"⚠️ No supported files found in {folder_path}")
        return []
    
    print(f"📁 Found {len(files)} files to process")
    
    metadata_list = []
    for file_path in files:
        metadata = process_single_file(generator, file_path)
        metadata_list.append(metadata)
    
    return metadata_list

print("✅ Utility functions defined")

✅ Utility functions defined


In [9]:
def run_interactive_cli():
    """Run the interactive command line interface"""
    print("🚀 Automated Metadata Generation System")
    print("=" * 50)
    
    # Choose system mode
    while True:
        print("\n📋 Choose System Mode:")
        print("1. Full System (with AI models)")
        print("2. Quick Start (basic processing)")
        
        mode_choice = input("Enter your choice (1 or 2): ").strip()
        
        if mode_choice == "1":
            quick_start = False
            break
        elif mode_choice == "2":
            quick_start = True
            break
        else:
            print("❌ Invalid choice. Please enter 1 or 2.")
    
    # Initialize generator
    print(f"\n🔧 Initializing system in {'Full' if not quick_start else 'Quick Start'} mode...")
    generator = MetadataGenerator(quick_start=quick_start)
    
    while True:
        print("\n" + "=" * 50)
        print("📋 Choose Processing Mode:")
        print("1. Process Single File")
        print("2. Process Batch Files")
        print("3. Process Direct Text")
        print("4. Exit")
        
        choice = input("Enter your choice (1-4): ").strip()
        
        if choice == "1":
            # Single file processing
            file_path = input("Enter file path: ").strip().strip('"\'')
            
            if not os.path.exists(file_path):
                print("❌ File not found!")
                continue
            
            metadata = process_single_file(generator, file_path)
            
            # Display results
            if 'error' not in metadata:
                print("\n📊 Metadata Summary:")
                print(f"   📄 File: {metadata['basic_info']['filename']}")
                print(f"   📝 Type: {metadata['content_analysis']['document_type']}")
                print(f"   📏 Words: {metadata['content_analysis']['word_count']}")
                print(f"   📖 Summary: {metadata['semantic_data']['summary'][:100]}...")
                
                # Save option
                save_choice = input("\n💾 Save metadata? (y/n): ").strip().lower()
                if save_choice == 'y':
                    output_path = f"metadata_{Path(file_path).stem}.json"
                    save_metadata_to_json(metadata, output_path)
        
        elif choice == "2":
            # Batch processing
            folder_path = input("Enter folder path: ").strip().strip('"\'')
            
            if not os.path.exists(folder_path):
                print("❌ Folder not found!")
                continue
            
            metadata_list = process_batch_files(generator, folder_path)
            
            if metadata_list:
                successful = len([m for m in metadata_list if 'error' not in m])
                print(f"\n📊 Batch Processing Complete!")
                print(f"   ✅ Successfully processed: {successful}/{len(metadata_list)} files")
                
                # Save options
                save_choice = input("\n💾 Save all metadata? (y/n): ").strip().lower()
                if save_choice == 'y':
                    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                    
                    # Save JSON
                    json_path = f"batch_metadata_{timestamp}.json"
                    with open(json_path, 'w', encoding='utf-8') as f:
                        json.dump(metadata_list, f, indent=2, ensure_ascii=False)
                    
                    # Save CSV
                    csv_path = f"batch_metadata_{timestamp}.csv"
                    save_metadata_to_csv(metadata_list, csv_path)
        
        elif choice == "3":
            # Direct text processing
            print("Enter your text (press Enter twice to finish):")
            lines = []
            while True:
                line = input()
                if line == "" and lines and lines[-1] == "":
                    break
                lines.append(line)
            
            text = '\n'.join(lines[:-1])  # Remove the last empty line
            
            if text.strip():
                metadata = generator.generate_metadata(text, is_text=True)
                
                if 'error' not in metadata:
                    print("\n📊 Metadata Summary:")
                    print(f"   📝 Type: {metadata['content_analysis']['document_type']}")
                    print(f"   📏 Words: {metadata['content_analysis']['word_count']}")
                    print(f"   📖 Summary: {metadata['semantic_data']['summary']}")
                    
                    # Save option
                    save_choice = input("\n💾 Save metadata? (y/n): ").strip().lower()
                    if save_choice == 'y':
                        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                        output_path = f"text_metadata_{timestamp}.json"
                        save_metadata_to_json(metadata, output_path)
            else:
                print("❌ No text entered!")
        
        elif choice == "4":
            print("👋 Goodbye!")
            break
        
        else:
            print("❌ Invalid choice. Please enter 1-4.")

# Run the interactive CLI
run_interactive_cli()

🚀 Automated Metadata Generation System

📋 Choose System Mode:
1. Full System (with AI models)
2. Quick Start (basic processing)


Enter your choice (1 or 2):  1



🔧 Initializing system in Full mode...
✅ spaCy model loaded


Device set to use cpu


✅ Summarization model loaded
✅ MetadataGenerator initialized (Full System mode)

📋 Choose Processing Mode:
1. Process Single File
2. Process Batch Files
3. Process Direct Text
4. Exit


Enter your choice (1-4):  1
Enter file path:  /home/navya/Downloads/resume_f.pdf


📄 Processing: /home/navya/Downloads/resume_f.pdf
PyMuPDF extraction failed: module 'fitz' has no attribute 'open'
✅ Successfully processed: /home/navya/Downloads/resume_f.pdf
   Document Type: Manual
   Word Count: 376
   Confidence Score: 0.7999999999999999

📊 Metadata Summary:
   📄 File: resume_f.pdf
   📝 Type: Manual
   📏 Words: 376
   📖 Summary: Cadet | National Cadet Corps, IIT Roorkee. Participated in the Guard of Honor held on Republic Day 2...



💾 Save metadata? (y/n):  n



📋 Choose Processing Mode:
1. Process Single File
2. Process Batch Files
3. Process Direct Text
4. Exit


Enter your choice (1-4):  4


👋 Goodbye!


In [10]:
# Demo with sample text
def run_demo():
    """Run a demonstration with sample text"""
    print("🎯 Running Demo with Sample Text...")
    
    sample_text = """
    Artificial Intelligence in Document Processing: A Comprehensive Analysis
    
    This research paper examines the transformative impact of artificial intelligence 
    technologies on automated document processing workflows. The study analyzes various 
    machine learning approaches including natural language processing, optical character 
    recognition, and semantic analysis for enhancing document digitization and metadata 
    extraction processes.
    
    Our methodology involved testing multiple AI models across diverse document types 
    including legal contracts, research papers, technical manuals, and business reports. 
    The findings demonstrate significant improvements in processing accuracy and efficiency 
    when compared to traditional manual methods.
    
    Key findings include a 75% reduction in processing time and 90% improvement in metadata 
    accuracy. The implementation of transformer-based models showed particular promise for 
    complex document understanding tasks.
    
    This research was conducted by Dr. John Smith from Stanford University in collaboration 
    with the MIT AI Lab. The study period spanned from January 2024 to March 2025.
    """
    
    # Test both modes
    for quick_start in [True, False]:
        print(f"\n{'='*60}")
        print(f"Testing {'Quick Start' if quick_start else 'Full System'} Mode")
        print(f"{'='*60}")
        
        generator = MetadataGenerator(quick_start=quick_start)
        metadata = generator.generate_metadata(sample_text, is_text=True)
        
        if 'error' not in metadata:
            print("✅ Demo successful!")
            print(f"📝 Document Type: {metadata['content_analysis']['document_type']}")
            print(f"📏 Word Count: {metadata['content_analysis']['word_count']}")
            print(f"📊 Readability Score: {metadata['content_analysis']['readability_score']}")
            print(f"🎯 Confidence Score: {metadata['technical_metadata']['confidence_score']}")
            print(f"📖 Summary: {metadata['semantic_data']['summary'][:150]}...")
            
            if metadata['semantic_data']['key_topics']:
                print(f"🏷️ Key Topics: {', '.join(metadata['semantic_data']['key_topics'][:5])}")
            
            if metadata['semantic_data']['entities']:
                print("👥 Entities found:")
                for entity_type, entities in metadata['semantic_data']['entities'].items():
                    if entities:
                        print(f"   {entity_type}: {', '.join(entities[:3])}")
        else:
            print(f"❌ Demo failed: {metadata['error']}")

# Run the demo
run_demo()

🎯 Running Demo with Sample Text...

Testing Quick Start Mode
✅ MetadataGenerator initialized (Quick Start mode)
✅ Demo successful!
📝 Document Type: Document
📏 Word Count: 142
📊 Readability Score: -2.81
🎯 Confidence Score: 0.7999999999999999
📖 Summary: 
    Artificial Intelligence in Document Processing: A Comprehensive Analysis
    
    This research paper examines the transformative impact of artif...

Testing Full System Mode
✅ spaCy model loaded


Device set to use cpu


✅ Summarization model loaded
✅ MetadataGenerator initialized (Full System mode)
✅ Demo successful!
📝 Document Type: Legal
📏 Word Count: 142
📊 Readability Score: -2.81
🎯 Confidence Score: 0.7999999999999999
📖 Summary: The study analyzes various machine learning approaches including natural language processing, optical character recognition, and semantic analysis for...
🏷️ Key Topics: document, research, processing, artificial, intelligence
👥 Entities found:
   ORG: Artificial Intelligence in Document Processing, AI, Stanford University
   PERCENT: 75%, 90%
   PERSON: John Smith
   DATE: January 2024 to March 2025
