In [None]:
import re
    

Text cleaning: normalize whitespace, lowercase the text, and strip leading and trailing dollar signs.

In [None]:
def clean_text(text):
    """Normalize a raw table-cell string so it can be compared and parsed.

    Whitespace runs (including non-breaking spaces, newlines and tabs) are
    collapsed to a single space, the result is trimmed and lowercased, and a
    leading or trailing dollar sign is removed.

    Args:
        text: Raw cell text; may be empty or None.

    Returns:
        The cleaned string. Empty or None input yields "" (never None), so
        callers can join, search, or lowercase the result without a guard.
    """
    if not text:
        # Always return a string: callers in this file join cleaned cells and
        # run regexes on them, which fails on None.
        return ""
    # For str patterns, \s already covers '\xa0', '\n' and '\t'.
    text = re.sub(r'\s+', ' ', text).strip().lower()
    text = re.sub(r'^\$\s*', '', text)      # leading "$" and any space after it
    text = re.sub(r'\s*\$\s*$', '', text)   # trailing "$" and surrounding spaces
    return text


In [None]:

def extract_numeric(text):
    """Extract the first number found in a piece of text.

    Thousands separators and dollar signs are removed before parsing. When the
    text contains both an opening and a closing parenthesis, each "(" is
    treated as a minus sign (accounting notation for negatives); whitespace
    between the "(" and the digits is tolerated, so "( 1,234 )" parses as
    -1234.0.

    Args:
        text: Text that may contain a number; may be empty or None.

    Returns:
        The first number as a float, or None if the text is empty or contains
        no digits.
    """
    if not text:
        return None

    text = text.replace(',', '').replace('$', '')

    # Accounting format: "(123)" means -123. Swallow any whitespace after the
    # "(" so the minus sign ends up directly in front of the digits.
    if '(' in text and ')' in text:
        text = re.sub(r'\(\s*', '-', text).replace(')', '')

    # The pattern only matches valid float literals, so float() cannot fail.
    match = re.search(r'-?\d+(?:\.\d+)?', text)
    return float(match.group()) if match else None

def is_likely_financial_table(table):
    """Return True when the table's text mentions at least three financial keywords.

    The table's text is lowercased and checked for each keyword as a plain
    substring; every keyword counts at most once.
    """
    lowered = table.get_text().lower()
    keywords = (
        'assets', 'liabilities', 'equity', 'income', 'revenue', 'cash',
        'debt', 'current', 'total', 'million', 'thousand', '$',
    )
    hits = 0
    for keyword in keywords:
        if keyword in lowered:
            hits += 1
    return hits >= 3

def should_exclude_match(target, cell_text, matched_variant):
    """Tell whether a cell must be rejected for ``target``.

    Looks up ``target`` in the module-level ``EXCLUSION_PATTERNS`` mapping and
    returns True (after printing a notice) as soon as one of its patterns
    occurs, case-insensitively, inside ``cell_text``. Targets without an
    entry are never excluded. ``matched_variant`` is accepted but not used.
    """
    if target in EXCLUSION_PATTERNS:
        for blocked in EXCLUSION_PATTERNS[target]:
            if blocked.lower() in cell_text.lower():
                print(f"🚫 Excluding '{cell_text}' for {target} (matches exclusion pattern: '{blocked}')")
                return True

    return False

def calculate_match_score(cell_text, variant, target):
    """Score how well a cell label matches one variant of a target name.

    Both strings are compared lowercased and trimmed. Scoring tiers:

    * 100 - the label equals the variant.
    * 0 - the label hits an exclusion pattern for ``target`` (see
      ``should_exclude_match``), or, for a multi-word variant, the label
      adds a qualifier such as "other" or "current".
    * up to 95 - the variant is a substring of the label: 85 scaled by how
      much of the label it covers, plus 10 if the label starts or ends with
      the variant.
    * up to 70 - the label is a substring of the variant, scaled by coverage.
    * up to 60 - the two share whole words, scaled by the share of variant
      words found.

    Args:
        cell_text: Label text from a table cell; may be empty or None.
        variant: One spelling of the item being searched for.
        target: Key used to look up exclusion patterns.

    Returns:
        A score between 0 and 100. Empty or None ``cell_text`` / ``variant``
        scores 0.
    """
    # An empty cell (or an empty variant) can never be a meaningful match;
    # bail out before calling string methods on a possible None.
    if not cell_text or not variant:
        return 0
    cell_text = cell_text.lower().strip()
    variant = variant.lower().strip()
    if not cell_text or not variant:
        return 0

    # Perfect exact match
    if cell_text == variant:
        return 100

    # Exclusion patterns take precedence over every partial-match rule
    if should_exclude_match(target, cell_text, variant):
        return 0

    variant_tokens = variant.split()
    variant_words = set(variant_tokens)
    cell_words = set(cell_text.split())

    # Reject labels that add a qualifier to a multi-word variant, e.g. looking
    # for "comprehensive income" but finding "other comprehensive income".
    if len(variant_tokens) > 1:
        specificity_words = {'other', 'foreign', 'unrealized', 'current', 'non-current', 'noncurrent'}
        if (cell_words - variant_words) & specificity_words:
            return 0

    if variant in cell_text:
        # Variant fully contained in the label: reward high coverage
        base_score = 85 * (len(variant) / len(cell_text))

        # Bonus for being at the start or end (often indicates a main item)
        if cell_text.startswith(variant) or cell_text.endswith(variant):
            base_score += 10

        return min(base_score, 95)  # Cap at 95 to reserve 100 for exact matches

    if cell_text in variant:
        # Label is only a fragment of the variant (less ideal but acceptable)
        return 70 * (len(cell_text) / len(variant))

    # Fall back to whole-word overlap
    common_words = variant_words & cell_words
    if common_words:
        return 60 * (len(common_words) / len(variant_words))

    return 0

def find_best_match_in_row(row_cells, target_variants, target_name):
    """Pick the cell in a row whose label scores highest against any variant.

    Every non-empty cell is cleaned with ``clean_text`` and scored against
    each variant with ``calculate_match_score``. Only a strictly higher score
    replaces the current best, so the earliest cell/variant wins ties and a
    score of 0 never counts as a match.

    Returns:
        A ``(cell_index, score, variant)`` tuple, or ``(None, 0, None)`` when
        nothing scored above zero.
    """
    winner = (None, 0, None)  # (cell index, score, variant)

    for position, cell in enumerate(row_cells):
        label = clean_text(cell.get_text(" ", strip=True))
        if not label:
            # Nothing to compare against in an empty cell
            continue

        for candidate in target_variants:
            candidate_score = calculate_match_score(label, candidate, target_name)
            if candidate_score > winner[1]:
                winner = (position, candidate_score, candidate)

    return winner

def extract_value_from_row(row_cells, label_cell_index):
    """Return the first numeric value found in a row, other than in the label cell.

    Cells to the right of the label are tried first, left to right; if none of
    them holds a number, the cells to the left of the label are tried.

    Args:
        row_cells: Cell elements of one table row; each must provide
            ``get_text(separator, strip=...)``.
        label_cell_index: Index of the cell holding the row label, which is
            never read as a value.

    Returns:
        The value as a float, or None when no cell contains a number.
    """
    right_side = range(label_cell_index + 1, len(row_cells))
    # Right-hand cells that held no number are not re-scanned; only the cells
    # before the label remain as a fallback.
    left_side = range(min(label_cell_index, len(row_cells)))

    for i in (*right_side, *left_side):
        text = clean_text(row_cells[i].get_text(" ", strip=True))
        # extract_numeric returns None for empty/None text and for text
        # without digits, so no separate "looks numeric" prefilter is needed.
        value = extract_numeric(text)
        if value is not None:
            return value
    return None




Company filings vary in how they present data, which makes extraction challenging. To handle the different layouts, I created two functions:

1. find_hierarchical_matches – Searches row by row, for tables where the label and its value appear in the same row.
2. find_vertical_match – Handles tables where the labels are in one column and the financial values (e.g., US\$) are in another.


In [None]:
def find_hierarchical_matches(table, target, variants):
    """Find the value for ``target`` in a table whose rows hold label and value side by side.

    Each row with at least two cells is checked, unless its combined text
    contains "continued", "schedule" or "note". The best-matching label cell
    is located with ``find_best_match_in_row``; rows scoring at least 60 that
    also yield a numeric value become candidates. The highest-scoring
    candidate wins, and among equal scores the row furthest down the table is
    preferred.

    Args:
        table: Table element providing ``find_all``.
        target: Name of the item being searched for (used for exclusion
            lookup and logging).
        variants: Alternative label spellings for the target.

    Returns:
        The numeric value of the best candidate, or None if no row qualifies.
    """
    matches = []

    for row_idx, row in enumerate(table.find_all('tr')):
        cells = row.find_all(['td', 'th'])
        if len(cells) < 2:
            continue

        # `or ""` keeps the join safe even if clean_text yields None for an
        # empty cell.
        row_text_combined = " ".join(
            clean_text(c.get_text(" ", strip=True)) or "" for c in cells
        )
        if any(term in row_text_combined for term in ('continued', 'schedule', 'note')):
            continue

        match_index, score, matched_variant = find_best_match_in_row(cells, variants, target)
        if match_index is None or score < 60:
            continue

        value = extract_value_from_row(cells, match_index)
        if value is not None:
            matches.append({
                'row_idx': row_idx,
                'score': score,
                'value': value,
                'label': cells[match_index].get_text(" ", strip=True),
                'variant': matched_variant
            })

    if not matches:
        return None

    # Rank candidates by score, then by row position (later rows first).
    best = max(matches, key=lambda m: (m['score'], m['row_idx']))
    print(f"🎯 Best match for {target}: {best['value']} | Label: {best['label']} | Score: {best['score']:.1f}")
    return best['value']



def find_vertical_match(table, target, variants):
    """Find the value for ``target`` in a table that has a dedicated US-dollar column.

    The table is flattened into a rectangular grid of cleaned cell strings.
    Columns whose first three rows mention "us$", "usd" or "$" (and not
    "percentage") are treated as dollar columns. Every cell is then scored as
    a possible label; when a label beats the best score so far, the dollar
    columns of that row are read for a value. Values containing "%" or
    "percentage", and values of 100 or less, are skipped so that percentages
    are not picked up.

    Args:
        table: Table element providing ``find_all``.
        target: Name of the item being searched for.
        variants: Alternative label spellings for the target.

    Returns:
        The value from the best-scoring row, or None if the table has fewer
        than two rows, no dollar column, or no usable value.
    """
    rows = table.find_all('tr')
    if len(rows) < 2:
        return None

    # `or ""` guarantees the grid holds only strings, so the joins, substring
    # checks and scoring below never see None for an empty cell.
    grid = [
        [clean_text(cell.get_text(" ", strip=True)) or "" for cell in row.find_all(['td', 'th'])]
        for row in rows
    ]

    # Pad ragged rows so every column index is valid in every row.
    num_rows = len(grid)
    num_cols = max(len(row) for row in grid)
    for row in grid:
        row.extend([""] * (num_cols - len(row)))

    usd_col_indices = []
    for col_idx in range(num_cols):
        col_text_top = " ".join(grid[i][col_idx] for i in range(min(3, num_rows)))
        if any(curr in col_text_top for curr in ("us$", "usd", "$")) and "percentage" not in col_text_top:
            usd_col_indices.append(col_idx)

    if not usd_col_indices:
        return None

    best_value = None
    best_score = 0
    for row in grid:
        for cell_text in row:
            if not cell_text:
                # An empty label scores 0 and can never beat best_score.
                continue
            for variant in variants:
                score = calculate_match_score(cell_text, variant, target)
                if score <= best_score:
                    continue
                # NOTE(review): when several dollar columns hold a usable
                # value, the right-most one wins - confirm this is intended.
                for usd_col in usd_col_indices:
                    candidate = row[usd_col]
                    if "%" in candidate or "percentage" in candidate:
                        continue
                    value = extract_numeric(candidate)
                    if value is not None and value > 100:  # avoid matching percentages
                        best_value = value
                        best_score = score

    if best_value is not None:
        print(f"📈 US$ vertical match for {target}: {best_value}")
    return best_value