# 📊 Document Analyzer

Comprehensive document analysis tool! Features:
- **Metadata extraction:** Author, creation date, file size, page count
- **Content analysis:** Text statistics, word frequency, readability scores
- **Security analysis:** Encryption status, digital signatures, permissions
- **Format detection:** File type validation and format-specific analysis
- **Batch processing:** Analyze multiple documents at once

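**Supported formats:** PDF, DOCX, and plain-text files (TXT, Markdown, RTF). HTML and ODT files are picked up when scanning but are currently reported as unsupported.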


## 🚀 How to Use

- **Option 1:** Set `path_arg` to the path of a file, a folder, or a ZIP archive.
- **Option 2:** Leave `path_arg` as `None` and upload files directly when prompted (Colab only).

### Examples:
```python
# Single document
path_arg = "document.pdf"

# Folder with documents  
path_arg = "/path/to/documents/"

# ZIP file
path_arg = "documents.zip"
```


In [None]:
# Import libraries (standard library)
import hashlib
import json
import os
import re
import shutil
import subprocess
import sys
import uuid
import zipfile
from collections import Counter
from datetime import datetime
from pathlib import Path

# Check if running in Google Colab
try:
    from google.colab import files
    IS_COLAB = True
    print("🔧 Running in Google Colab")
except ImportError:
    IS_COLAB = False
    print("🔧 Running locally")

# Install required packages if needed.
# textstat is imported inside this guard (rather than unconditionally above) so
# that a missing textstat triggers the installer instead of aborting the cell.
try:
    from PyPDF2 import PdfReader
    import docx
    from PIL import Image
    import magic
    import textstat
    print("✅ Required packages available")
except ImportError:
    print("📦 Installing required packages...")
    # Run pip through the current interpreter so the packages land in the
    # environment this notebook is actually executing in.
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "PyPDF2", "python-docx", "Pillow", "python-magic", "textstat",
    ])
    from PyPDF2 import PdfReader
    import docx
    from PIL import Image
    import magic
    import textstat
    print("✅ Packages installed successfully")


In [None]:
def get_documents(path):
    """Collect the document files found at ``path``.

    ``path`` may be a folder (scanned recursively), a single document, or a
    ZIP archive. An archive is extracted into a fresh ``ext_<hex>`` folder in
    the current working directory and that folder is scanned recursively.

    Args:
        path: Path to a file, folder, or ZIP archive.

    Returns:
        A ``(docs, base, tmp_dir)`` tuple: the list of document paths, a base
        name derived from ``path``, and the extraction folder that was created
        (``None`` when nothing was extracted). The extraction folder is not
        removed here; that is left to the caller.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ Path not found: {path}")

    # Supported document extensions
    doc_extensions = ('.pdf', '.docx', '.doc', '.txt', '.rtf', '.html', '.htm', '.md', '.odt')

    def scan(folder):
        # Recursive, case-insensitive match on the extension list above.
        return [os.path.join(root, name)
                for root, _, names in os.walk(folder)
                for name in names
                if name.lower().endswith(doc_extensions)]

    if os.path.isdir(path):
        print(f"📁 Scanning folder: {os.path.basename(path)}")
        base = os.path.basename(os.path.normpath(path))
        return scan(path), base, None

    # DOCX and ODT documents are themselves ZIP containers, so a file with a
    # known document extension must be treated as a document, not unpacked.
    is_document = path.lower().endswith(doc_extensions)

    if not is_document and zipfile.is_zipfile(path):
        print(f"📦 Extracting ZIP: {os.path.basename(path)}")
        base = os.path.splitext(os.path.basename(path))[0]
        tmp_dir = f"ext_{uuid.uuid4().hex[:6]}"
        os.makedirs(tmp_dir, exist_ok=True)
        try:
            # Context manager closes the archive handle once extraction is done.
            with zipfile.ZipFile(path) as archive:
                archive.extractall(tmp_dir)
        except Exception:
            # Do not leave a half-extracted folder behind on failure.
            shutil.rmtree(tmp_dir, ignore_errors=True)
            raise
        return scan(tmp_dir), base, tmp_dir

    print(f"📄 Processing single document: {os.path.basename(path)}")
    return [path], os.path.splitext(os.path.basename(path))[0], None

def calculate_file_hash(file_path):
    """Return the MD5 and SHA-256 hex digests of a file's contents.

    The file is read once, in 4096-byte chunks, and every chunk is fed to
    both digests.

    Args:
        file_path: Path of the file to hash.

    Returns:
        A dict with the keys ``'md5'`` and ``'sha256'`` mapping to hex strings.
    """
    digests = {'md5': hashlib.md5(), 'sha256': hashlib.sha256()}

    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks end of file.
                break
            for digest in digests.values():
                digest.update(block)

    return {name: digest.hexdigest() for name, digest in digests.items()}

def analyze_text_content(text):
    """Compute basic statistics and readability scores for a piece of text.

    Args:
        text: The text to analyze. ``None``, empty, or whitespace-only input
            produces an all-zero result.

    Returns:
        A dict with ``word_count``, ``character_count``, ``sentence_count``,
        ``paragraph_count``, ``readability_score`` (Flesch reading ease),
        ``grade_level`` (Flesch-Kincaid grade), ``top_words`` (up to ten
        ``(word, count)`` pairs), ``average_word_length`` and
        ``average_sentence_length``. The all-zero result additionally carries
        ``'language': 'unknown'``.
    """
    if not text or not text.strip():
        # Same numeric keys as the normal result so callers can read any of
        # them without checking which branch produced the dict.
        return {
            'word_count': 0,
            'character_count': 0,
            'sentence_count': 0,
            'paragraph_count': 0,
            'readability_score': 0,
            'grade_level': 0,
            'top_words': [],
            'average_word_length': 0,
            'average_sentence_length': 0,
            'language': 'unknown'
        }

    # Basic statistics
    words = re.findall(r'\b\w+\b', text.lower())
    # Sentences end at runs of '.', '!' or '?'; blank fragments are dropped.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    # Paragraphs are separated by one or more blank lines.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]

    # Word frequency
    top_words = Counter(words).most_common(10)

    # Readability scores are best-effort: any failure inside textstat falls
    # back to zero rather than aborting the whole analysis.
    try:
        flesch_score = textstat.flesch_reading_ease(text)
        flesch_kincaid = textstat.flesch_kincaid_grade(text)
    except Exception:
        flesch_score = 0
        flesch_kincaid = 0

    word_count = len(words)
    sentence_count = len(sentences)

    return {
        'word_count': word_count,
        'character_count': len(text),
        'sentence_count': sentence_count,
        'paragraph_count': len(paragraphs),
        'readability_score': flesch_score,
        'grade_level': flesch_kincaid,
        'top_words': top_words,
        'average_word_length': sum(len(word) for word in words) / word_count if word_count else 0,
        # Guard on the filtered count: text made only of punctuation has no sentences.
        'average_sentence_length': word_count / sentence_count if sentence_count else 0
    }


In [None]:
def analyze_pdf(file_path):
    """Analyze a PDF document.

    Args:
        file_path: Path of the PDF file.

    Returns:
        On success, a dict with ``type`` (``'PDF'``), ``pages``, ``metadata``,
        ``security`` (``encrypted`` flag plus a ``permissions`` dict) and
        ``text_analysis``. If anything fails, ``{'type': 'PDF', 'error': <message>}``
        is returned instead of raising.
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)

            # Extract text: one newline-terminated chunk per page. The
            # ``or ""`` guards against a page whose extraction yields None.
            text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

            # Get metadata
            metadata = {}
            info = reader.metadata
            if info:
                metadata = {
                    'title': str(info.get('/Title', '')),
                    'author': str(info.get('/Author', '')),
                    'subject': str(info.get('/Subject', '')),
                    'creator': str(info.get('/Creator', '')),
                    'producer': str(info.get('/Producer', '')),
                    'creation_date': str(info.get('/CreationDate', '')),
                    'modification_date': str(info.get('/ModDate', ''))
                }

            # Security analysis
            security = {
                'encrypted': reader.is_encrypted,
                'permissions': {}
            }

            if reader.is_encrypted:
                # Best-effort: a failure here leaves ``permissions`` empty.
                # NOTE(review): get_fields() looks like it returns form fields
                # rather than permission flags — confirm these keys ever exist.
                try:
                    fields = reader.get_fields()
                    security['permissions'] = {
                        'print': fields.get('/Print', 'Unknown') if fields else 'Unknown',
                        'modify': fields.get('/Modify', 'Unknown') if fields else 'Unknown',
                        'copy': fields.get('/Copy', 'Unknown') if fields else 'Unknown'
                    }
                except Exception:
                    pass

            return {
                'type': 'PDF',
                'pages': len(reader.pages),
                'metadata': metadata,
                'security': security,
                'text_analysis': analyze_text_content(text)
            }

    except Exception as e:
        return {'type': 'PDF', 'error': str(e)}

def analyze_docx(file_path):
    """Analyze a DOCX document.

    Args:
        file_path: Path of the document to open with ``docx.Document``.

    Returns:
        On success, a dict with ``type`` (``'DOCX'``), ``metadata`` (core
        properties), ``element_counts`` (paragraphs, tables, images, sections)
        and ``text_analysis``. If anything fails,
        ``{'type': 'DOCX', 'error': <message>}`` is returned instead of raising.
    """
    try:
        doc = docx.Document(file_path)

        # Extract text: one line per paragraph, joined with real newlines.
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)

        # Get metadata
        props = doc.core_properties
        metadata = {
            'title': props.title or '',
            'author': props.author or '',
            'subject': props.subject or '',
            'keywords': props.keywords or '',
            'created': str(props.created) if props.created else '',
            'modified': str(props.modified) if props.modified else '',
            'last_modified_by': props.last_modified_by or ''
        }

        # Count elements
        element_counts = {
            'paragraphs': len(doc.paragraphs),
            'tables': len(doc.tables),
            'images': len(doc.inline_shapes),
            'sections': len(doc.sections)
        }

        return {
            'type': 'DOCX',
            'metadata': metadata,
            'element_counts': element_counts,
            'text_analysis': analyze_text_content(text)
        }

    except Exception as e:
        return {'type': 'DOCX', 'error': str(e)}

def analyze_text_file(file_path):
    """Analyze a plain text file.

    The file is decoded with the first encoding in a fixed list that does not
    raise ``UnicodeDecodeError``; if none succeeds, the raw bytes are decoded
    as UTF-8 with undecodable bytes dropped.

    Args:
        file_path: Path of the text file.

    Returns:
        On success, a dict with ``type`` (``'TEXT'``), ``encoding`` (the
        encoding that worked, or ``'binary_fallback'``) and ``text_analysis``.
        If anything else fails, ``{'type': 'TEXT', 'error': <message>}`` is
        returned instead of raising.
    """
    try:
        # Try different encodings.
        # NOTE: latin-1 maps every byte value, so the entries listed after it
        # are never reached in practice.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        text = None  # None means "nothing decoded yet"; "" is a valid empty file
        encoding_used = "unknown"

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    text = f.read()
                encoding_used = encoding
                break
            except UnicodeDecodeError:
                continue

        if text is None:
            # If all encodings fail, read as binary and decode with errors='ignore'
            with open(file_path, 'rb') as f:
                text = f.read().decode('utf-8', errors='ignore')
            encoding_used = "binary_fallback"

        return {
            'type': 'TEXT',
            'encoding': encoding_used,
            'text_analysis': analyze_text_content(text)
        }

    except Exception as e:
        return {'type': 'TEXT', 'error': str(e)}


In [None]:
def analyze_document(file_path):
    """Build the analysis record for one document.

    The record starts with file-level facts (name, size, extension,
    modification time, hashes) and is then extended with the output of the
    analyzer chosen by the lower-cased file extension. Extensions without an
    analyzer get a ``'UNKNOWN'`` type and an error message.

    Args:
        file_path: Path of the document.

    Returns:
        A single dict holding the file-level keys followed by the analyzer's keys.
    """
    extension = os.path.splitext(file_path)[1].lower()
    size_bytes = os.path.getsize(file_path)
    digests = calculate_file_hash(file_path)
    modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))

    # File-level facts come first so they lead the merged record.
    report = {
        'filename': os.path.basename(file_path),
        'file_size': size_bytes,
        'file_size_mb': round(size_bytes / (1024 * 1024), 2),
        'file_extension': extension,
        'modified_date': modified_at.isoformat(),
        'hashes': digests
    }

    # Extension -> analyzer lookup
    analyzers = {
        '.pdf': analyze_pdf,
        '.docx': analyze_docx,
        '.doc': analyze_docx,
        '.txt': analyze_text_file,
        '.md': analyze_text_file,
        '.rtf': analyze_text_file,
    }
    analyzer = analyzers.get(extension)

    if analyzer is None:
        details = {'type': 'UNKNOWN', 'error': f'Unsupported file type: {extension}'}
    else:
        details = analyzer(file_path)

    # Analyzer keys are applied last, so they win on any key collision.
    report.update(details)
    return report

def _write_text_report(report_path, summary_stats, results):
    """Write the plain-text summary and per-document details to ``report_path``."""
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("=== DOCUMENT ANALYSIS REPORT ===\n\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"Total documents: {summary_stats['total_files']}\n")
        f.write(f"Total size: {summary_stats['total_size_mb']:.2f} MB\n")
        f.write(f"Total words: {summary_stats['total_words']:,}\n")
        f.write(f"Total pages: {summary_stats['total_pages']}\n")
        f.write(f"Encrypted files: {summary_stats['encrypted_files']}\n\n")

        f.write("File type distribution:\n")
        for file_type, count in summary_stats['file_types'].items():
            f.write(f"  {file_type}: {count}\n")

        f.write("\n=== DETAILED ANALYSIS ===\n\n")

        for result in results:
            f.write(f"📄 {result.get('filename', 'Unknown')}\n")
            f.write(f"Type: {result.get('type', 'Unknown')}\n")
            f.write(f"Size: {result.get('file_size_mb', 0)} MB\n")

            if 'error' in result:
                f.write(f"Error: {result['error']}\n")
            else:
                if result.get('metadata'):
                    f.write("Metadata:\n")
                    for key, value in result['metadata'].items():
                        if value:
                            f.write(f"  {key}: {value}\n")

                if 'text_analysis' in result:
                    ta = result['text_analysis']
                    f.write("Text Analysis:\n")
                    f.write(f"  Words: {ta.get('word_count', 0):,}\n")
                    f.write(f"  Characters: {ta.get('character_count', 0):,}\n")
                    f.write(f"  Sentences: {ta.get('sentence_count', 0)}\n")
                    f.write(f"  Paragraphs: {ta.get('paragraph_count', 0)}\n")
                    f.write(f"  Readability Score: {ta.get('readability_score', 0):.1f}\n")
                    f.write(f"  Grade Level: {ta.get('grade_level', 0):.1f}\n")

                    if ta.get('top_words'):
                        # Only the five most frequent words go into the text report.
                        top = ', '.join(f'{word}({count})' for word, count in ta['top_words'][:5])
                        f.write(f"  Top Words: {top}\n")

                if 'security' in result and result['security'].get('encrypted'):
                    f.write("Security: ENCRYPTED\n")

            f.write("\n" + "-" * 50 + "\n\n")


def process(path, include_hashes=True, detailed_analysis=True):
    """Analyze every document found at ``path`` and produce a report.

    A text report and a JSON report are written to a scratch folder. When
    exactly one document was processed, the text report is moved to
    ``<base>_analysis.txt``; otherwise both reports are zipped into
    ``<base>_analysis.zip``. In Colab the final file is offered for download;
    elsewhere its absolute path is printed. Errors are printed, not raised,
    and the scratch and extraction folders are always removed.

    Args:
        path: File, folder, or ZIP archive to analyze.
        include_hashes: When False, the ``hashes`` entry is dropped from each
            document's record. The hashes are still computed by
            ``analyze_document``; this flag only controls what is reported.
        detailed_analysis: Accepted for interface compatibility; not
            currently consulted.

    Returns:
        None.
    """
    # Initialised up front so the ``finally`` clause is safe even when
    # nothing was created (no documents found, or get_documents raised).
    out_dir = None
    tmp = None
    try:
        docs, base, tmp = get_documents(path)
        if not docs:
            print("❌ No valid documents found.")
            return

        print(f"📊 Found {len(docs)} document(s)")

        out_dir = f"analysis_{uuid.uuid4().hex[:6]}"
        os.makedirs(out_dir, exist_ok=True)

        results = []
        summary_stats = {
            'total_files': len(docs),
            'total_size_mb': 0,
            'file_types': Counter(),
            'total_words': 0,
            'total_pages': 0,
            'encrypted_files': 0
        }

        for i, doc in enumerate(docs, 1):
            print(f"🔄 Analyzing {i}/{len(docs)}: {os.path.basename(doc)}")

            try:
                result = analyze_document(doc)
                if not include_hashes:
                    result.pop('hashes', None)
                results.append(result)

                # Update summary statistics
                summary_stats['total_size_mb'] += result.get('file_size_mb', 0)
                summary_stats['file_types'][result.get('type', 'UNKNOWN')] += 1

                if 'text_analysis' in result:
                    summary_stats['total_words'] += result['text_analysis'].get('word_count', 0)

                if 'pages' in result:
                    summary_stats['total_pages'] += result.get('pages', 0)

                if result.get('security', {}).get('encrypted', False):
                    summary_stats['encrypted_files'] += 1

                print(f"  ✅ {result.get('type', 'UNKNOWN')} - {result.get('file_size_mb', 0)} MB")

            except Exception as e:
                # One bad document must not stop the batch; record and move on.
                print(f"  ❌ Error analyzing {os.path.basename(doc)}: {e}")
                results.append({
                    'filename': os.path.basename(doc),
                    'error': str(e)
                })

        # Generate detailed report
        report_path = os.path.join(out_dir, "analysis_report.txt")
        _write_text_report(report_path, summary_stats, results)

        # Save JSON report
        json_path = os.path.join(out_dir, "analysis_report.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump({
                'summary': summary_stats,
                'documents': results,
                'generated': datetime.now().isoformat()
            }, f, ensure_ascii=False, indent=2)

        # Create final output
        if len(results) == 1:
            final_file = f"{base}_analysis.txt"
            shutil.move(report_path, final_file)
            print(f"✅ Analysis report: {final_file}")
        else:
            final_file = f"{base}_analysis.zip"
            with zipfile.ZipFile(final_file, 'w', zipfile.ZIP_DEFLATED) as z:
                # Loop names deliberately avoid ``files``: binding it here would
                # make it local to this function and hide the Colab ``files``
                # module used for the download below.
                for root, _subdirs, names in os.walk(out_dir):
                    for name in names:
                        z.write(os.path.join(root, name), name)
            print(f"✅ Analysis package: {final_file}")

        if IS_COLAB:
            files.download(final_file)
            print("📥 Download started!")
        else:
            print(f"📁 Output: {os.path.abspath(final_file)}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)
        if tmp:
            shutil.rmtree(tmp, ignore_errors=True)


## ⚙️ Configuration

Set your file path and analysis options here:


In [None]:
# Configuration
# These three values are read by the run cell and forwarded to process().
# path_arg: path to a single document, a folder, or a ZIP archive. Leave it as
# None to use the Colab upload prompt instead (outside Colab the run cell then
# only prints a reminder).
path_arg = None  # Set your file/folder path here
# NOTE(review): nothing in view skips the hash computation itself when this is
# False, so do not count on a speed-up — confirm against process().
include_hashes = True  # Forwarded to process() as include_hashes
# NOTE(review): no code in view changes behavior based on this flag — confirm.
detailed_analysis = True  # Forwarded to process() as detailed_analysis


## 🎯 Run Document Analysis

Execute the document analysis process:


In [None]:
# Entry point: analyze the configured path, or fall back to an upload prompt
# when running in Colab with no path set.
if path_arg:
    print(f"🚀 Processing: {path_arg}")
    process(path_arg, include_hashes, detailed_analysis)
elif IS_COLAB:
    print("📤 Upload your document files...")
    uploaded = files.upload()
    # Keep only uploads whose extension is on the accepted list.
    upload_extensions = ('.pdf', '.docx', '.doc', '.txt', '.rtf', '.html', '.htm', '.md')
    doc_files = [fname for fname in uploaded
                 if fname.lower().endswith(upload_extensions)]

    if doc_files:
        print(f"🚀 Processing {len(doc_files)} document(s)")

        # Create temporary directory for uploaded files so they can be
        # analyzed together as one folder.
        temp_dir = f"temp_{uuid.uuid4().hex[:6]}"
        os.makedirs(temp_dir, exist_ok=True)

        try:
            # Uploaded names are used as relative paths here — presumably the
            # upload lands in the working directory; verify in Colab.
            for fname in doc_files:
                shutil.move(fname, os.path.join(temp_dir, fname))

            process(temp_dir, include_hashes, detailed_analysis)
        finally:
            # Remove the staging folder even if moving or processing fails.
            shutil.rmtree(temp_dir, ignore_errors=True)
    else:
        print("❌ No supported document files found in upload")
else:
    print("❗ Please set path_arg or upload in Colab.")
