In [None]:
# Imports
import json
import logging
import os
import re
from html import escape
from typing import Dict, Optional

import pandas as pd
from bson import ObjectId
from IPython.display import HTML, display
from pymongo import MongoClient

# Configure the root logger at DEBUG level. basicConfig() only takes effect
# when the root logger has no handlers yet, so this must run before any other
# logging configuration to have an effect.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# MongoDB connection
def get_db():
    """Return a handle to the 'chelle' database.

    The connection string is read from the MONGODB_URI environment variable
    and falls back to 'mongodb://db:27017/' when that variable is unset.
    A new MongoClient is created on every call.
    """
    mongo_uri = os.environ.get('MONGODB_URI', 'mongodb://db:27017/')
    connection = MongoClient(mongo_uri)
    return connection['chelle']

def _split_table_row(line):
    r"""Split one markdown table row into a list of stripped cell strings.

    Pipes escaped as ``\|`` stay inside their cell and are unescaped in the
    result. The outer border pipes are optional, so ``a | b`` and
    ``| a | b |`` both yield ``['a', 'b']``.
    """
    pieces = re.split(r'(?<!\\)\|', line.strip())
    # An empty first/last piece means the row began/ended with a pipe; that
    # pipe is a border, not a separator between two cells.
    if pieces and pieces[0] == '':
        pieces = pieces[1:]
    if pieces and pieces[-1] == '':
        pieces = pieces[:-1]
    return [piece.strip().replace('\\|', '|') for piece in pieces]


def _column_alignment(spec):
    """Map one delimiter-row cell (e.g. ':---:') to a CSS text-align value."""
    spec = spec.strip()
    if spec.startswith(':') and spec.endswith(':'):
        return 'center'
    if spec.endswith(':'):
        return 'right'
    return 'left'


def convert_markdown_table_to_html(table_str: str, table_index: int, *,
                                   escape_html: bool = True) -> Optional[str]:
    """Convert a markdown table string to a styled HTML fragment.

    The first line is treated as the header row, the second as the
    delimiter/alignment row, and every following non-blank line as a data
    row.

    Args:
        table_str: The markdown table text, one row per line.
        table_index: Position of the table in its document. Currently unused;
            kept so existing callers keep working.
        escape_html: When true (the default), header and cell text is
            HTML-escaped so markup characters in the source cannot inject
            tags into the output. Pass ``False`` to let inline HTML in cells
            (for example ``<br>``) through unchanged.

    Returns:
        The HTML fragment (a ``<div>`` wrapping a ``<style>`` block and the
        ``<table>``), or ``None`` when ``table_str`` has fewer than two lines.
    """
    lines = table_str.strip().split('\n')
    if len(lines) < 2:
        return None

    headers = _split_table_row(lines[0])
    alignments = [_column_alignment(spec) for spec in _split_table_row(lines[1])]

    rows = []
    for line in lines[2:]:
        if not line.strip():
            continue
        cells = _split_table_row(line)
        # Pad short rows so every row spans all header columns. Rows that are
        # longer than the header are kept as-is rather than dropping data.
        cells.extend([''] * (len(headers) - len(cells)))
        rows.append(cells)

    def cell_text(raw):
        return escape(raw) if escape_html else raw

    def align_attr(column):
        # Columns beyond the delimiter row get no inline alignment and fall
        # back to the stylesheet default.
        if column < len(alignments):
            return f' style="text-align: {alignments[column]}"'
        return ''

    out = [
        '<div style="overflow-x: auto;">',
        '<style>',
        'table { border-collapse: collapse; width: 100%; margin: 1em 0; }',
        'th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }',
        'th { background-color: #f5f5f5; font-weight: bold; }',
        'tr:nth-child(even) { background-color: #f9f9f9; }',
        'tr:hover { background-color: #f5f5f5; }',
        '</style>',
        '<table>',
        '<thead>',
        '<tr>',
    ]
    out.extend(
        f'<th{align_attr(i)}>{cell_text(title)}</th>'
        for i, title in enumerate(headers)
    )
    out += ['</tr>', '</thead>', '<tbody>']

    for row in rows:
        out.append('<tr>')
        out.extend(
            f'<td{align_attr(i)}>{cell_text(cell)}</td>'
            for i, cell in enumerate(row)
        )
        out.append('</tr>')

    out += ['</tbody>', '</table>', '</div>']
    return '\n'.join(out)

def extract_tables_from_markdown(markdown_content: str) -> Dict[str, str]:
    """Extract pipe-delimited tables from markdown and convert them to HTML.

    A table is a header row, a delimiter row and zero or more data rows, each
    starting and ending with ``|``, located either at the very start of the
    content or directly after a blank line.

    Args:
        markdown_content: The markdown text to scan.

    Returns:
        A dict mapping ``'table_<i>'`` to the table's HTML, where ``<i>`` is
        the zero-based position of the match in the document. Matches that
        the converter rejects are left out, so keys may skip numbers.
    """
    # The blank-line requirement is a lookbehind rather than a consumed
    # '\n\n': each match swallows the newline that ends its last row, so a
    # consuming prefix could never match a second table that is separated
    # from the previous one by a single blank line.
    table_pattern = r'(?:(?<=\n\n)|\A)(\|[^\n]*\|\n\|[-:| ]*\|\n(?:\|[^\n]*\|\n?)*)'
    tables = {}

    for i, match in enumerate(re.finditer(table_pattern, markdown_content)):
        table_html = convert_markdown_table_to_html(match.group(1), i)
        if table_html:
            tables[f'table_{i}'] = table_html

    return tables

# Test functions
def test_with_sample_markdown():
    """Run table extraction on a built-in sample and render the results.

    The sample holds two tables: a plain one and one whose delimiter row
    specifies left, center and right alignment. The number of tables found
    is printed, then each table is printed by name and displayed as HTML.
    """
    sample_markdown = """
Here's a simple table:

| Name | Age | City |
|------|-----|------|
| John | 30  | NYC  |
| Jane | 25  | LA   |
| Bob  | 35  | CHI  |

And here's another table with alignment:

| Item | Price | Quantity |
|:-----|:-----:|--------:|
| Apple | $1.00 | 10 |
| Orange | $0.75 | 15 |
| Banana | $0.50 | 20 |
"""

    tables = extract_tables_from_markdown(sample_markdown)
    print(f"Found {len(tables)} tables")

    # Walk the result in insertion order, i.e. document order.
    for table_name in tables:
        print(f"\nTable: {table_name}")
        rendered = HTML(tables[table_name])
        display(rendered)
        
def test_with_file(file_id: str):
    """Run table extraction on the markdown of a stored asset and render it.

    Looks up the document with the given id in the ``raw_assets`` collection,
    reads the file referenced by its ``processed_paths.markdown`` field, then
    prints the number of tables found and displays each one as HTML. When the
    asset or its markdown path is missing, a message is printed and the
    function returns without doing anything else.

    Args:
        file_id: String form of the asset's ``_id``; it is converted with
            ``ObjectId``, which raises on a malformed id.
    """
    db = get_db()
    asset = db['raw_assets'].find_one({'_id': ObjectId(file_id)})

    if not asset:
        print(f"File not found: {file_id}")
        return

    # `or {}` also covers a document that stores processed_paths as None.
    markdown_path = (asset.get('processed_paths') or {}).get('markdown')
    if not markdown_path:
        print("No markdown content found for this file")
        return

    # Read markdown content. The encoding is explicit so decoding does not
    # depend on the platform default.
    # NOTE(review): assumes the processed markdown is written as UTF-8 - confirm.
    with open(markdown_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    # Extract tables
    tables = extract_tables_from_markdown(markdown_content)
    print(f"Found {len(tables)} tables in file {asset['original_name']}")

    for table_name, table_html in tables.items():
        print(f"\nTable: {table_name}")
        display(HTML(table_html))
        
def analyze_table_structure(markdown_content: str):
    """Analyze the structure of tables in markdown content"""
    table_pattern = r'(?:\n\n|\A)(\|[^\n]*\|\n\|[-:| ]*\|\n(?:\|[^\n]*\|\n?)*)'
    matches = re.finditer(table_pattern, markdown_content)
    
    for i, match in enumerate(matches):
        table_str = match.group(1)
        lines = table_str.strip().split('\n')
        
        print(f"\nTable {i + 1} Analysis:")
        print(f"Total rows: {len(lines)}")
        print(f"Header: {lines[0]}")
        print(f"Alignment row: {lines[1]}")
        
        # Analyze column counts
        header_cols = len([col for col in lines[0].split('|') if col.strip()])
        data_cols = [len([col for col in line.split('|') if col.strip()]) for line in lines[2:]]
        
        print(f"Header columns: {header_cols}")
        print(f"Data row column counts: {data_cols}")
        
        if len(set(data_cols)) > 1:
            print("WARNING: Inconsistent column counts in d