In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataasa/valid_data.csv
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/71wjWkzMivL.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/71Ns89h5j3L.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/61bEPf9eRVL.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/6135h5ZfarL.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/71mldPiI5IL.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/61bIYnfgPeL.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/51M7pOXsy7L.jpg
/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/m

In [4]:
import pandas as pd
import re
from collections import defaultdict
import pytesseract
from PIL import Image
from transformers import BertTokenizer, BertForTokenClassification, pipeline
import os

In [5]:
# Step 1: Perform OCR to extract text from an image
def extract_text_from_image(image_path):
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img)
    print(f"OCR extracted text: {text}")
    return text

In [6]:
def setup_bert_ner_pipeline():
    tokenizer = BertTokenizer.from_pretrained("dslim/bert-base-NER")
    model = BertForTokenClassification.from_pretrained("dslim/bert-base-NER")
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    return nlp

In [7]:
# Step 3: Extract weight and height-related entities using BERT NER
def extract_weights_and_heights(nlp, text):
    weights_heights = []
    
    # List of units to check against (converted to lowercase)
    units = ["centilitre", "cubic foot", "cubic inch", "cup", "decilitre", "fluid ounce", "gallon",
             "imperial gallon", "litre", "microlitre", "millilitre", "pint", "quart", "kilowatt", 
             "watt", "kilovolt", "millivolt", "volt", "gram", "kilogram", "microgram", "milligram", 
             "ounce", "pound", "ton", "centimetre", "foot", "inch", "metre", "millimetre", "yard",
             "cl", "cu ft", "cu in", "dl", "fl oz", "gal", "imp gal", "l", "μl", "ml", "pt", "qt",
             "kw", "w", "kv", "mv", "v", "g", "kg", "μg", "mg", "oz", "lb", "cm", "ft", "in", "m", "mm", "yd"]

    # Add common OCR misreadings
    ocr_errors = {
        "lbs": ["Ibs", "ibs", "1bs", "lbs"],
        "kg": ["kg", "Kg", "KG"],
        "lb": ["Ib", "lb", "LB"],
        "cm": ["crn", "cm", "CM"],
        "mm": ["rnm", "mm", "MM"],
        "in": ["in", "In", "IN"],
        "ft": ["ft", "Ft", "FT"]
    }

    # Create regex pattern for numbers and units
    number_pattern = r"([\d.,]+)\s*"
    unit_pattern = "(" + "|".join(units + [item for sublist in ocr_errors.values() for item in sublist]) + ")"
    full_pattern = number_pattern + unit_pattern

    # Additional patterns for specific cases
    weight_limit_pattern = r"(?:up to|maximum|max|limit|capacity of|weight).*?" + full_pattern
    
    # First, try to extract entities using BERT NER
    entities = nlp(text)
    for entity in entities:
        if entity['entity_group'] in ['QUANTITY', 'MEASURE', 'CARDINAL']:
            weights_heights.append(entity['word'])
    
    # If BERT NER didn't find anything or found very few entities, use regex as a fallback
    if len(weights_heights) < 2:
        # Try to find weight limits first
        limit_matches = re.findall(weight_limit_pattern, text.lower(), re.IGNORECASE)
        if limit_matches:
            for value, unit in limit_matches:
                value = value.replace(',', '')  # Remove commas from numbers
                weights_heights.append(f"{value} {unit}")
        
        # If no weight limits found, look for any measurement
        if not weights_heights:
            matches = re.findall(full_pattern, text.lower(), re.IGNORECASE)
            for value, unit in matches:
                value = value.replace(',', '')  # Remove commas from numbers
                weights_heights.append(f"{value} {unit}")
    
    # Normalize units (convert OCR errors to correct units)
    normalized_weights_heights = []
    for item in weights_heights:
        value, unit = item.split()
        for correct_unit, error_units in ocr_errors.items():
            if unit in error_units:
                unit = correct_unit
                break
        normalized_weights_heights.append(f"{value} {unit}")
    print(f"Extracted entities: {normalized_weights_heights}")
    return normalized_weights_heights, text

In [10]:
#Step 4: Parse numeric value and unit from extracted entities and standardize units
def parse_entity_value(entity_value):
    # Standardize units by mapping common variations and OCR misreadings to a single form
    unit_mappings = {
        'centilitres': 'cl', 'centilitre': 'cl', 'cl': 'cl',
        'cubic foot': 'cu ft', 'cu ft': 'cu ft', 'cubic inch': 'cu in', 'cu in': 'cu in',
        'cup': 'cup', 'decilitre': 'dl', 'dl': 'dl', 'fluid ounce': 'fl oz', 'fl oz': 'fl oz',
        'gallon': 'gal', 'gal': 'gal', 'imperial gallon': 'imp gal', 'imp gal': 'imp gal',
        'litre': 'l', 'litres': 'l', 'l': 'l', 'microlitre': 'μl', 'ml': 'ml',
        'millilitre': 'ml', 'millilitres': 'ml', 'pint': 'pt', 'pt': 'pt', 'quart': 'qt', 'qt': 'qt',
        'kilowatt': 'kw', 'kw': 'kw', 'watt': 'w', 'w': 'w', 'kilovolt': 'kv', 'kv': 'kv',
        'millivolt': 'mv', 'mv': 'mv', 'volt': 'v', 'v': 'v',
        'gram': 'g', 'grams': 'g', 'g': 'g', 'kilogram': 'kg', 'kilograms': 'kg', 'kgs': 'kg', 'kg': 'kg',
        'microgram': 'μg', 'μg': 'μg', 'milligram': 'mg', 'mg': 'mg', 'ounce': 'oz', 'ounces': 'oz', 'oz': 'oz',
        'pound': 'lb', 'pounds': 'lb', 'lbs': 'lb', 'lb': 'lb', 'ton': 'ton', 'tons': 'ton', 'tonne': 'ton',
        'centimetre': 'cm', 'centimetres': 'cm', 'cm': 'cm', 'foot': 'ft', 'feet': 'ft', 'ft': 'ft',
        'inch': 'in', 'inches': 'in', 'in': 'in', 'metre': 'm', 'metres': 'm', 'm': 'm',
        'millimetre': 'mm', 'millimetres': 'mm', 'mm': 'mm', 'yard': 'yd', 'yd': 'yd'
    }
    
    # Use regex to extract the numeric value and the unit
    match = re.search(r"([\d.]+)\s*([a-zA-Z\"”’]+)", entity_value)
    if match:
        value, unit = match.groups()
        standardized_unit = unit_mappings.get(unit.lower(), unit.lower())  # Map the unit to standardized form
        return float(value), standardized_unit
    return None, None

# def parse_entity_value(entity_value):
#     match = re.search(r"([\d.]+)\s*([a-zA-Z]+)", entity_value)
#     if match:
#         value, unit = match.groups()
#         return float(value), unit.lower()
#     return None, None

In [11]:

# Step 5: Compare extracted values with actual values and compute true positives, false positives, and false negatives
def compare_entities(extracted_entities, true_value, original_text):
    true_value, true_unit = parse_entity_value(true_value)
    
    if true_value is None or true_unit is None:
        print(f"Comparison failed: Could not extract true value/unit from: '{true_value}'")
        return 0, 0, 1, original_text  # False negative
    
    for entity in extracted_entities:
        extracted_value, extracted_unit = parse_entity_value(entity)
        if extracted_value is not None and extracted_unit is not None:
            print(f"Comparing extracted: {extracted_value} {extracted_unit} with true: {true_value} {true_unit}")
            
            if abs(extracted_value - true_value) < 0.1 and extracted_unit == true_unit:
                print("Match found: True Positive")
                return 1, 0, 0, None  # True positive
            else:
                print(f"No match: {extracted_value} {extracted_unit} did not match {true_value} {true_unit}")
    
    print(f"Comparison failed: False Negative")
    return 0, 0, 1, original_text  # False negative

In [12]:
# Step 6: Full processing function
def process_image_for_weights_and_heights(image_path):
    text = extract_text_from_image(image_path)
    nlp = setup_bert_ner_pipeline()
    extracted_entities, original_text = extract_weights_and_heights(nlp, text)
    return extracted_entities, original_text

In [14]:
# Step 7: Process dataset and calculate F1 score with a limit of 500 rows
def process_dataset_and_calculate_f1(csv_path, row_limit=500):
    df = pd.read_csv(csv_path)
    df = df.head(row_limit)
    
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    unmatched_cases = defaultdict(list)

    for idx, row in df.iterrows():
        image_path = row['image_path']
        true_value = row['entity_value']
        
        print(f"\nProcessing image {idx + 1}/{row_limit}: {image_path}")
        
        extracted_entities, original_text = process_image_for_weights_and_heights(image_path)
        
        tp, fp, fn, unmatched_text = compare_entities(extracted_entities, true_value, original_text)
        true_positives += tp
        false_positives += fp
        false_negatives += fn
        
        if unmatched_text:
            unmatched_cases[true_value].append(unmatched_text)
        
        print(f"Cumulative - TP: {true_positives}, FP: {false_positives}, FN: {false_negatives}")
    
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    
    # Save unmatched cases to a CSV file
    unmatched_df = pd.DataFrame([(tv, '\n'.join(texts)) for tv, texts in unmatched_cases.items()],
                                columns=['True Value', 'Unmatched OCR Texts'])
    csv_path = os.path.join(os.getcwd(), 'unmatched_cases.csv')
    unmatched_df.to_csv(csv_path, index=False)
    
    print(f"\nUnmatched cases saved to: {csv_path}")
    print(f"\nTotal processed images: {row_limit}")
    print(f"True Positives: {true_positives}")
    print(f"False Positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")
    
    return precision, recall, f1_score

In [19]:
# Example usage
csv_path = '/kaggle/input/random-dataset/66e31d6ee96cd_student_resource_3/student_resource 3/dataset/test.csv'
precision, recall, f1_score = process_dataset_and_calculate_f1(csv_path, row_limit=17)

KeyError: 'image_path'

In [18]:
# Display precision, recall, and F1 score
print(f"Precision: {precision:.18f}")
print(f"Recall: {recall:.18f}")
print(f"F1 Score: {f1_score:.18f}")

Precision: 1.000000000000000000
Recall: 0.352941176470588258
F1 Score: 0.521739130434782594
