# Module 6: Document Forgery Detection

This notebook covers:
- OCR text extraction from license images
- Format validation using regex
- Image quality analysis for forgery indicators
- CNN-based classification (transfer learning)

In [None]:
import datetime
import re
from pathlib import Path

import numpy as np

# Check available libraries.
# Each probe sets a module-level *_AVAILABLE flag and prints a status line.
# Only the import itself sits inside `try`, so an unrelated failure in the
# success branch is never misreported as "not installed".
try:
    import cv2
except ImportError:
    CV2_AVAILABLE = False
    print("✗ OpenCV not installed")
else:
    CV2_AVAILABLE = True
    print("✓ OpenCV available")

try:
    import pytesseract
except ImportError:
    TESSERACT_AVAILABLE = False
    print("✗ Tesseract not installed")
else:
    # pytesseract is only a Python wrapper: importing it succeeds even when
    # the Tesseract OCR executable is missing, and OCR calls would then fail
    # at run time. Probe the executable before reporting it as available.
    try:
        pytesseract.get_tesseract_version()
    except pytesseract.TesseractNotFoundError:
        TESSERACT_AVAILABLE = False
        print("✗ pytesseract installed, but the Tesseract binary was not found")
    else:
        TESSERACT_AVAILABLE = True
        print("✓ Tesseract available")

try:
    import tensorflow as tf
except ImportError:
    TF_AVAILABLE = False
    print("✗ TensorFlow not installed")
else:
    TF_AVAILABLE = True
    print(f"✓ TensorFlow {tf.__version__} available")

In [None]:
# License Validator
class LicenseValidator:
    """Regex-based format checks for text extracted from a license document.

    All methods are static; the class only groups the patterns with the
    helpers that apply them.
    """

    # The digit lookarounds keep a pattern from matching inside a longer digit
    # run (a 7-digit serial, a 5-digit year, a 3-digit day), which would
    # otherwise let a malformed value pass as well-formed.
    PATTERNS = {
        'license_no': r'UAE-[A-Z]{3}-\d{6}(?!\d)',
        'date': r'(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)',
    }

    @staticmethod
    def validate_license_number(text):
        """Find license numbers of the form ``UAE-XXX-NNNNNN`` in ``text``.

        Args:
            text: String to search (e.g. OCR output).

        Returns:
            Tuple ``(found, matches)``: ``matches`` is the list of matching
            substrings and ``found`` is True when that list is non-empty.
        """
        matches = re.findall(LicenseValidator.PATTERNS['license_no'], text)
        return len(matches) > 0, matches

    @staticmethod
    def validate_dates(text):
        """Find ``YYYY-MM-DD`` dates in ``text`` that exist on the calendar.

        Candidates such as ``2023-02-31`` or ``2023-13-01`` match the pattern
        but are not real dates and are dropped.

        Args:
            text: String to search (e.g. OCR output).

        Returns:
            Tuple ``(found, valid)``: ``valid`` is the list of date strings
            that are real calendar dates, in order of appearance, and
            ``found`` is True when at least one such date exists.
        """
        valid = []
        for candidate in re.findall(LicenseValidator.PATTERNS['date'], text):
            try:
                year, month, day = map(int, candidate.split('-'))
                # datetime.date raises ValueError for out-of-range fields,
                # including day-of-month overflow and non-leap Feb 29.
                datetime.date(year, month, day)
            except ValueError:
                continue
            valid.append(candidate)
        return len(valid) > 0, valid

    @staticmethod
    def analyze(text):
        """Run every format check on ``text`` and collect the results.

        Args:
            text: String to validate (e.g. OCR output).

        Returns:
            Dict with keys ``'valid'`` (True when no issue was recorded),
            ``'issues'`` (list of issue messages), ``'licenses'`` (matched
            license numbers) and ``'dates'`` (valid date strings).
        """
        license_ok, licenses = LicenseValidator.validate_license_number(text)
        dates_ok, dates = LicenseValidator.validate_dates(text)

        issues = []
        if not license_ok:
            issues.append("Invalid license number")
        if not dates_ok:
            issues.append("Invalid dates")

        return {'valid': len(issues) == 0, 'issues': issues, 'licenses': licenses, 'dates': dates}

# Smoke test: run the validator on one well-formed sample record.
sample = "\n".join([
    "License No: UAE-DXB-123456",
    "DOB: 1985-03-15",
    "Expiry: 2026-03-15",
])
print(f"Sample validation: {LicenseValidator.analyze(sample)}")

In [None]:
# CNN model for forgery detection; built only when TensorFlow imported successfully.
if not TF_AVAILABLE:
    print("TensorFlow not available - skipping CNN model")
else:
    from tensorflow.keras import layers, models
    from tensorflow.keras.applications import MobileNetV2

    def create_forgery_model():
        """Build and compile a binary image classifier on a frozen MobileNetV2.

        The backbone is MobileNetV2 with ImageNet weights and no top layers,
        taking 224x224x3 inputs. On top of it sit global average pooling, a
        128-unit ReLU layer, 50% dropout and a single sigmoid output unit.

        Returns:
            The Sequential model, compiled with the Adam optimizer, binary
            cross-entropy loss and an accuracy metric.
        """
        backbone = MobileNetV2(
            weights='imagenet',
            include_top=False,
            input_shape=(224, 224, 3),
        )
        # Freeze the pretrained weights so only the new head is trainable.
        backbone.trainable = False

        classifier = models.Sequential([
            backbone,
            layers.GlobalAveragePooling2D(),
            layers.Dense(128, activation='relu'),
            layers.Dropout(0.5),
            layers.Dense(1, activation='sigmoid'),
        ])
        classifier.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )
        return classifier

    print("Creating forgery detection model...")
    forgery_model = create_forgery_model()
    print(f"Model created with {forgery_model.count_params():,} parameters")

## Summary

The forgery detection pipeline includes:

1. **OCR Extraction**: Uses Tesseract to extract text from documents
2. **Format Validation**: Regex patterns check license numbers and dates
3. **Image Quality**: OpenCV checks for blur, resolution, edge anomalies
4. **CNN Classification**: MobileNetV2 transfer learning for image classification

For production use:
- Collect a labeled dataset of genuine vs. forged documents
- Fine-tune the CNN model
- Implement human-in-the-loop review for edge cases