In [None]:
# %% [markdown]
# # OCR Processor Notebook
# ## 1. Import Dependencies

# %%
import csv
import io
from pathlib import Path
import pytesseract
from PIL import Image, ImageOps
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2 import service_account

In [None]:
# %% [markdown]
# ## 2. Configuration

# %%
# Configuration constants shared by the functions below.

# OAuth scopes requested when building service-account credentials.
SCOPES = [
    "https://www.googleapis.com/auth/drive.readonly",
]

# File-name suffixes treated as images. This stays a tuple because it is
# passed directly to str.endswith(), which accepts a tuple but not a list.
IMAGE_EXTENSIONS = (
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".bmp",
    ".tiff",
)

In [None]:
# %% [markdown]
# ## 3. Core Functions

# %%
def authenticate_google_drive(credential_file):
    """Create a Google Drive v3 API client from a service-account key file.

    Args:
        credential_file: Path to the service-account JSON key file.

    Returns:
        The client object returned by ``build('drive', 'v3', ...)``,
        authorised with the module-level ``SCOPES``.
    """
    load_credentials = service_account.Credentials.from_service_account_file
    drive_credentials = load_credentials(credential_file, scopes=SCOPES)
    drive_client = build('drive', 'v3', credentials=drive_credentials)
    return drive_client

def list_files_in_folder(service, folder_id):
    """List the image files directly inside a Google Drive folder.

    The Drive ``files.list`` endpoint returns results one page at a time, so
    this follows ``nextPageToken`` until the listing is exhausted; a single
    call would silently drop everything after the first page.

    Args:
        service: Drive v3 API client (as returned by
            ``authenticate_google_drive``).
        folder_id: ID of the Drive folder to list.

    Returns:
        A list of file resource dicts (``id``, ``name``, ``mimeType``) whose
        name ends, case-insensitively, with one of ``IMAGE_EXTENSIONS``.
        Empty when the folder has no matching files.
    """
    query = f"'{folder_id}' in parents and trashed = false"
    image_files = []
    page_token = None  # None requests the first page

    while True:
        response = service.files().list(
            q=query,
            # nextPageToken must be requested explicitly once `fields` is set.
            fields="nextPageToken, files(id, name, mimeType)",
            pageToken=page_token,
        ).execute()

        image_files.extend(
            entry for entry in response.get('files', [])
            if entry['name'].lower().endswith(IMAGE_EXTENSIONS)
        )

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return image_files

def download_file(service, file_id, filename, destination_folder):
    """Fetch a Drive file into ``destination_folder`` unless it is already there.

    Args:
        service: Drive v3 API client.
        file_id: ID of the Drive file to download.
        filename: Name to store the file under inside ``destination_folder``.
        destination_folder: Target directory; created (with parents) if missing.

    Returns:
        None. When the target path already exists nothing is requested from
        Drive and the existing file is left untouched.
    """
    target = Path(destination_folder) / filename
    if not target.exists():
        target.parent.mkdir(parents=True, exist_ok=True)
        media_request = service.files().get_media(fileId=file_id)

        # The whole file is buffered in memory and written only after the
        # final chunk has arrived.
        with io.BytesIO() as buffer:
            fetcher = MediaIoBaseDownload(buffer, media_request)
            while True:
                _progress, finished = fetcher.next_chunk()
                if finished:
                    break

            with open(target, 'wb') as out_file:
                out_file.write(buffer.getbuffer())

def process_image(img):
    """Prepare a PIL image for OCR: flatten transparency, binarise, invert.

    Steps:
        1. Composite any transparency onto a white background.
        2. Convert to 8-bit grayscale ('L').
        3. Apply a gamma curve with exponent 3.
        4. Threshold at 128 so every pixel becomes 0 or 255.
        5. Invert the result.

    Args:
        img: A ``PIL.Image.Image``.

    Returns:
        A new 'L'-mode image whose pixels are all 0 or 255.
    """
    # Flatten every mode that can carry transparency, not only RGBA: without
    # this, 'LA' and palette images with a transparency entry would go through
    # convert('L') with their alpha simply discarded.
    has_alpha = img.mode in ('RGBA', 'LA') or (
        img.mode == 'P' and 'transparency' in img.info
    )
    if has_alpha:
        rgba = img.convert('RGBA')
        background = Image.new('RGB', rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[-1])  # last band is alpha
        img = background

    gray = img.convert('L')
    # Lookup tables for 'L' images must hold integers; round explicitly rather
    # than relying on the installed Pillow version to coerce floats.
    processed = gray.point(lambda x: round(((x / 255) ** 3) * 255))  # Gamma
    processed = processed.point(lambda x: 255 if x > 128 else 0)  # Threshold
    return ImageOps.invert(processed)

def ocr_image(image):
    """Run Tesseract on an image and report the text with a mean confidence.

    Args:
        image: Image object passed unchanged to ``pytesseract``.

    Returns:
        A ``(text, avg_confidence)`` tuple. ``text`` is the output of
        ``image_to_string``. ``avg_confidence`` is the mean, rounded to two
        decimals, of the confidence values from ``image_to_data`` for entries
        whose text is non-blank and whose confidence is not negative; it is 0
        when no entry qualifies.
    """
    word_data = pytesseract.image_to_data(
        image, output_type=pytesseract.Output.DICT
    )
    recognized_text = pytesseract.image_to_string(image)

    scores = []
    for raw_conf, word in zip(word_data['conf'], word_data['text']):
        if not word.strip():
            continue  # blank entries do not count towards the average
        score = float(raw_conf)
        if score >= 0:
            scores.append(score)

    if not scores:
        return recognized_text, 0
    return recognized_text, round(sum(scores) / len(scores), 2)

In [None]:
# %% [markdown]
# ## 4. Processing Functions

# %%
def process_files(file_list, service=None, force=False):
    """OCR each entry of ``file_list`` and write one transcript per image.

    Transcripts are written to ``transcripts/<image stem>.txt``. A failure on
    one entry is reported and does not stop the remaining entries.

    Args:
        file_list: Iterable of either Drive file dicts (with ``'name'`` and
            ``'id'``), which are first downloaded into ``downloaded_images/``,
            or local image paths (``pathlib.Path`` or ``str``).
        service: Drive v3 API client, passed to ``download_file`` for Drive
            entries.
        force: When True, re-run OCR even if the transcript already exists.

    Returns:
        None.
    """
    transcripts_dir = Path('transcripts')
    transcripts_dir.mkdir(exist_ok=True)

    for file_info in file_list:
        is_drive_entry = isinstance(file_info, dict)

        # Bind a label before entering the try block: the error handler prints
        # it, and without this an early failure would either raise NameError
        # or report the previous iteration's file name.
        if is_drive_entry:
            filename = file_info.get('name') or '<unnamed Drive file>'
        else:
            filename = str(file_info)

        try:
            if is_drive_entry:  # Google Drive file
                filename = file_info['name']
                source_path = Path('downloaded_images') / filename
                download_file(service, file_info['id'], filename, 'downloaded_images')
            else:  # Local file (Path or plain string)
                source_path = Path(file_info)
                filename = source_path.name

            transcript_path = transcripts_dir / f"{source_path.stem}.txt"
            if not force and transcript_path.exists():
                print(f"Skipping {filename} - transcript exists")
                continue

            print(f"\nProcessing {filename}")

            with Image.open(source_path) as img:
                processed_img = process_image(img)
                text, confidence = ocr_image(processed_img)

                print(f"OCR Confidence: {confidence}%")
                with open(transcript_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"Saved transcript to {transcript_path}")

        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"Error processing {filename}: {str(e)}")

def export_csv(output_file):
    """Export every transcript in ``transcripts/`` to a CSV file.

    Each ``*.txt`` transcript becomes one row with the columns
    ``original_file_name``, ``file_path`` and ``transcript_text``. The source
    image is looked up in ``downloaded_images/`` by matching the transcript's
    stem against files whose extension is, case-insensitively, one of
    ``IMAGE_EXTENSIONS``. When several images share a stem, the one whose
    extension comes first in ``IMAGE_EXTENSIONS`` wins. Transcripts with no
    matching image are exported with ``'Unknown'`` / ``'Not found'``.

    Args:
        output_file: Path of the CSV file to write (overwritten if present).

    Returns:
        None.
    """
    transcript_dir = Path('transcripts')
    downloaded_dir = Path('downloaded_images')

    # Index the image folder once. Matching on the lowercased suffix finds
    # files such as "scan.PNG", which a literal "<stem>.png" lookup misses on
    # case-sensitive filesystems.
    extension_rank = {ext: rank for rank, ext in enumerate(IMAGE_EXTENSIONS)}
    images_by_stem = {}
    if downloaded_dir.is_dir():
        for candidate in sorted(downloaded_dir.iterdir()):
            rank = extension_rank.get(candidate.suffix.lower())
            if rank is None or not candidate.is_file():
                continue
            best = images_by_stem.get(candidate.stem)
            if best is None or rank < best[0]:
                images_by_stem[candidate.stem] = (rank, candidate)

    records = []

    # Sorted so the CSV row order is the same on every run.
    for transcript_path in sorted(transcript_dir.glob('*.txt')):
        try:
            match = images_by_stem.get(transcript_path.stem)
            image_path = match[1] if match else None

            with open(transcript_path, 'r', encoding='utf-8') as f:
                transcript_text = f.read()

            records.append({
                'original_file_name': image_path.name if image_path else 'Unknown',
                'file_path': str(image_path) if image_path else 'Not found',
                'transcript_text': transcript_text
            })

        except Exception as e:
            # Best-effort: an unreadable transcript is reported and skipped.
            print(f"Error processing {transcript_path.name}: {str(e)}")

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['original_file_name', 'file_path', 'transcript_text'])
        writer.writeheader()
        writer.writerows(records)

    print(f"Exported {len(records)} transcripts to {output_file}")

In [None]:
# %% [markdown]
# ## 5. Execution Control
# 
# Choose your operation mode:

# %%
# Run settings - edit these values before running the cell below.

# Operation to perform. Choose from: 'drive', 'local', 'export'
MODE = "export"

# Set to True to overwrite existing transcripts.
FORCE_REPROCESS = False

# Drive mode only: folder to read images from and the service-account key file.
GOOGLE_FOLDER_ID = "18RaLotdAd5ggl9g2MDXfsnVSjJxOWmw5"
CREDENTIALS_FILE = "credentials/testing-451622-d2bb9ea8367e.json"

# Export mode only: where the CSV is written.
CSV_OUTPUT_FILE = "transcripts.csv"

In [None]:
# %% [markdown]
# ## 6. Run Selected Operation

# %%
# Dispatch on MODE. The completion message is printed only when a valid mode
# actually ran, so an invalid setting is not reported as a successful run.
if MODE not in ('drive', 'local', 'export'):
    print("Invalid mode selected. Please choose from: 'drive', 'local', 'export'")
else:
    if MODE == 'drive':
        service = authenticate_google_drive(CREDENTIALS_FILE)
        files = list_files_in_folder(service, GOOGLE_FOLDER_ID)
        print(f"Found {len(files)} images in Google Drive")
        process_files(files, service=service, force=FORCE_REPROCESS)

    elif MODE == 'local':
        # Match extensions case-insensitively, like the Drive listing does, so
        # files such as "scan.PNG" are picked up on case-sensitive filesystems.
        # Sorted for a stable processing order; a missing folder yields no
        # files rather than an error.
        local_dir = Path('downloaded_images')
        image_files = sorted(
            path for path in local_dir.iterdir()
            if path.is_file() and path.suffix.lower() in IMAGE_EXTENSIONS
        ) if local_dir.is_dir() else []
        print(f"Found {len(image_files)} local images")
        process_files(image_files, force=FORCE_REPROCESS)

    else:  # MODE == 'export'
        export_csv(CSV_OUTPUT_FILE)

    print("\nProcessing complete!")