In [11]:
import re
# List of possible units
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Flattened set of all units from the entity_unit_map
all_units = {unit for units in entity_unit_map.values() for unit in units}
# Extend unit mapping with more abbreviations like "IN" to "inch" and "0Z" to "ounce"
unit_mapping = {
    'g': 'gram',
    'kg': 'kilogram',
    'mg': 'milligram',
    'mcg': 'microgram',
    'oz': 'ounce',
    '0z': 'ounce',  # Handle "0Z" case
    'lb': 'pound',
    't': 'ton',
    'ml': 'millilitre',
    'l': 'litre',
    'cl': 'centilitre',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'ft': 'foot',
    'in': 'inch',  # Handle "IN" case
    'cm': 'centimetre',
    'mm': 'millimetre',
    'yd': 'yard',
    'v': 'volt',
    'mv': 'millivolt',
    'kv': 'kilovolt',
    'w': 'watt',
    'kw': 'kilowatt',
    'gal': 'gallon',
    'pt': 'pint',
    'qt': 'quart',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
    'imp gal': 'imperial gallon'
}

def normalize_unit(unit):
    # Normalize the input unit to match the standard list
    unit = unit.lower().strip()

    # Handle plural units by removing 's' and checking if the singular form exists
    if unit.endswith('s') and unit[:-1] in all_units:
        unit = unit[:-1]

    if unit in unit_mapping:
        return unit_mapping[unit]

    return unit

def extract_value_unit(input_string):
    # Regular expression to extract numeric value and unit
    pattern = r"(\d+\.?\d*)\s*([a-zA-Z]+(?:\s*[a-zA-Z]*)?)"
    matches = re.findall(pattern, input_string)

    # Return the first valid match
    for match in matches:
        value = match[0]  # Extract numeric value
        unit = normalize_unit(match[1])  # Extract and normalize unit

        if unit in all_units:  # Check if unit is in the list of allowed units
            return f"{value} {unit}"
    
    return None  # If no valid match is found

def convert_units_in_context(context_string):
 
    # Split the context string into words
    words = context_string.split()

    # Convert each word if it's a unit abbreviation
    converted_words = []
    for word in words:
        # Check if the word matches any of the unit mappings
        lower_word = word.lower()  # For case-insensitive matching
        if lower_word in unit_mapping:
            converted_words.append(unit_mapping[lower_word])
        else:
            converted_words.append(word)

    # Join the words back into a context string
    return ' '.join(converted_words)

# Example context from JSON
context = "51*70 IN 31 0Z warm soft and ventilate Particularly smooth Comfortable to skin GARDEN"

# Convert the context units
converted_context = convert_units_in_context(context)

print(f"Original: {context}")
print(f"Converted: {converted_context}")



Original: 51*70 IN 31 0Z warm soft and ventilate Particularly smooth Comfortable to skin GARDEN
Converted: 51*70 inch 31 ounce warm soft and ventilate Particularly smooth Comfortable to skin GARDEN


In [16]:
import pandas as pd
import re

# Load your CSV file
df = pd.read_csv('dataset/final_train.csv')

# List of possible units
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Flattened set of all units from the entity_unit_map
all_units = {unit for units in entity_unit_map.values() for unit in units}

# Extend unit mapping with more abbreviations
unit_mapping = {
    'g': 'gram',
    'kg': 'kilogram',
    'mg': 'milligram',
    'mcg': 'microgram',
    'oz': 'ounce',
    '0z': 'ounce',  # Handle "0Z" case
    'lb': 'pound',
    't': 'ton',
    'ml': 'millilitre',
    'l': 'litre',
    'cl': 'centilitre',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'ft': 'foot',
    'in': 'inch',  # Handle "IN" case
    'cm': 'centimetre',
    'mm': 'millimetre',
    'yd': 'yard',
    'v': 'volt',
    'mv': 'millivolt',
    'kv': 'kilovolt',
    'w': 'watt',
    'kw': 'kilowatt',
    'gal': 'gallon',
    'pt': 'pint',
    'qt': 'quart',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
    'imp gal': 'imperial gallon'
}

def normalize_unit(unit):
    unit = unit.lower().strip()
    if unit.endswith('s') and unit[:-1] in all_units:
        unit = unit[:-1]
    return unit_mapping.get(unit, unit)

def convert_units_in_context(context_string):
    words = context_string.split()
    converted_words = [unit_mapping.get(word.lower(), word) for word in words]
    return ' '.join(converted_words)

## Function to ensure the text is a string and handle NaN values
def safe_str(value):
    return str(value) if pd.notnull(value) else ""

# Function to extract the number or values before the first space
def extract_number(entity_value):
    match = re.match(r'^\S+', entity_value)
    if match:
        return match.group(0)
    return entity_value

# Function to find the answer_start indices in both easyocr_text and tesseract_text
def find_answer_indices(row):
    # Apply unit conversion on the text
    easyocr_text = convert_units_in_context(safe_str(row['easyocr_text']))
    tesseract_text = convert_units_in_context(safe_str(row['tesseract_text']))

    # Concatenate the easyocr_text and tesseract_text as a single context string
    concatenated_text = easyocr_text + " " + tesseract_text
    
    # Extract the number from entity_value
    entity_value_number = extract_number(row['entity_value'])
    
    # Find the number in both easyocr_text and tesseract_text
    index_easyocr = easyocr_text.find(entity_value_number)
    index_tesseract = tesseract_text.find(entity_value_number)
    
    # If found in both texts, return the answer start indices for both
    if index_easyocr != -1 and index_tesseract != -1:
        return index_easyocr, len(easyocr_text) + 1 + index_tesseract
    else:
        return -1, -1

# Apply the function to each row in the dataframe
df['answer_start_easyocr'], df['answer_start_tesseract'] = zip(*df.apply(find_answer_indices, axis=1))

# Show the processed dataframe
print(df[['easyocr_text', 'tesseract_text', 'entity_value', 'answer_start_easyocr', 'answer_start_tesseract']].head())


                                        easyocr_text  \
0  PROPOS' NATURE INGREDIENT MENAGER MULTI-USAGE ...   
1  TLaeel=_ 7672 Xe RRIFIC LEBENSMITTELECHT Gw DA...   
2                                                NaN   
3  3 3 1 1 F IW! 1 5833 1 3 1 1 1 1 H 0 L 1 W # I...   
4  Horbaach' HIGH StRENGTH PSYLLIUM HUSK PLANTAGO...   

                                      tesseract_text    entity_value  \
0                                                NaN      500.0 gram   
1  GEPRAGTES  par Sitaram]  a UND GESCHUTZTE DESIGNS         1.0 cup   
2  Serving Size: 1 Tablet (0.709 g) | Each servin...      0.709 gram   
3                                                NaN      0.709 gram   
4                         Sos ai ace  PSYLLIUM  HUSK  1400 milligram   

   answer_start_easyocr  answer_start_tesseract  
0                    -1                      -1  
1                    -1                      -1  
2                    -1                      -1  
3                    -1       

In [17]:
import re

# Function to extract the number or values before space
def extract_number(entity_value):
    # Use regex to extract the part before the first space
    match = re.match(r'^\S+', entity_value)  # Extract the numeric part before space
    if match:
        return match.group(0)
    return entity_value  # Return original if no match

## Function to ensure the text is a string and handle NaN values
def safe_str(value):
    return str(value) if pd.notnull(value) else ""

# Function to clean text by removing symbols and spaces (only retain numbers)
def clean_text(text):
    # Remove all symbols except numbers
    return re.sub(r'[^\d]', '', text)  # Keep only digits, remove everything else

# Function to remove decimal point from entity_value
def clean_entity_value(entity_value):
    return entity_value.replace('.', '')  # Remove decimal point

# Function to find answer indices in original text, with fallback to cleaned text
def find_answer_indices(row):
    original_easyocr_text = safe_str(row['easyocr_text'])
    original_tesseract_text = safe_str(row['tesseract_text'])
    
    # Extract the number from entity_value
    entity_value_number = extract_number(row['entity_value'])
    
    # First search: find number in original text
    index_easyocr = original_easyocr_text.find(entity_value_number)
    index_tesseract = original_tesseract_text.find(entity_value_number)
    
    # If found in both, return the indices
    if index_easyocr != -1 and index_tesseract != -1:
        return index_easyocr, len(original_easyocr_text) + 1 + index_tesseract
    
    # Second attempt: Clean text and entity_value
    cleaned_easyocr_text = clean_text(original_easyocr_text)
    cleaned_tesseract_text = clean_text(original_tesseract_text)
    cleaned_entity_value_number = clean_entity_value(entity_value_number)
        
    # Search again in cleaned text
    cleaned_index_easyocr = cleaned_easyocr_text.find(cleaned_entity_value_number)
    cleaned_index_tesseract = cleaned_tesseract_text.find(cleaned_entity_value_number)
    
    # If found in cleaned text, map the indices back to original text
    if cleaned_index_easyocr != -1 and cleaned_index_tesseract != -1:
        index_easyocr = map_cleaned_to_original(cleaned_easyocr_text, original_easyocr_text, cleaned_index_easyocr)
        index_tesseract = map_cleaned_to_original(cleaned_tesseract_text, original_tesseract_text, cleaned_index_tesseract)
        return index_easyocr, len(original_easyocr_text) + 1 + index_tesseract
    elif cleaned_index_easyocr != -1: 
        index_easyocr = map_cleaned_to_original(cleaned_easyocr_text, original_easyocr_text, cleaned_index_easyocr)
        # return index_easyocr, -1
    elif cleaned_index_tesseract != -1:
        index_tesseract = map_cleaned_to_original(cleaned_tesseract_text, original_tesseract_text, cleaned_index_tesseract)
        # return -1, len(original_easyocr_text) + 1 + index_tesseract
    
    cleaned_shortened_entity_value_number = cleaned_entity_value_number[:2]
    cleaned_shortened_index_easyocr = cleaned_easyocr_text.find(cleaned_shortened_entity_value_number)
    cleaned_shortened_index_tesseract = cleaned_tesseract_text.find(cleaned_shortened_entity_value_number)
    
    if cleaned_shortened_index_easyocr != -1 and cleaned_shortened_index_tesseract != -1:
        index_easyocr = map_cleaned_to_original(cleaned_easyocr_text, original_easyocr_text, cleaned_shortened_index_easyocr)
        index_tesseract = map_cleaned_to_original(cleaned_tesseract_text, original_tesseract_text, cleaned_shortened_index_tesseract)
        return index_easyocr, len(original_easyocr_text) + 1 + index_tesseract
    elif cleaned_shortened_index_easyocr != -1:
        index_easyocr = map_cleaned_to_original(cleaned_easyocr_text, original_easyocr_text, cleaned_shortened_index_easyocr)
        return index_easyocr, -1
    elif cleaned_shortened_index_tesseract != -1:
        index_tesseract = map_cleaned_to_original(cleaned_tesseract_text, original_tesseract_text, cleaned_shortened_index_tesseract)
        return -1, len(original_easyocr_text) + 1 + index_tesseract
    else :
        return -1, -1
        

# Function to map cleaned text index back to original text index
def map_cleaned_to_original(cleaned_text, original_text, cleaned_index):
    cleaned_so_far = 0
    for i, char in enumerate(original_text):
        if char.isdigit():  # Only count digits
            if cleaned_so_far == cleaned_index:
                return i
            cleaned_so_far += 1
    return -1  # Return -1 if no match

# Apply the function to each row
df['answer_start_easyocr'], df['answer_start_tesseract'] = zip(*df.apply(find_answer_indices, axis=1))

# Show the processed dataframe
print(df[['easyocr_text', 'tesseract_text', 'entity_value', 'answer_start_easyocr', 'answer_start_tesseract']].head(100))

                                         easyocr_text  \
0   PROPOS' NATURE INGREDIENT MENAGER MULTI-USAGE ...   
1   TLaeel=_ 7672 Xe RRIFIC LEBENSMITTELECHT Gw DA...   
2                                                 NaN   
3   3 3 1 1 F IW! 1 5833 1 3 1 1 1 1 H 0 L 1 W # I...   
4   Horbaach' HIGH StRENGTH PSYLLIUM HUSK PLANTAGO...   
..                                                ...   
95                                                NaN   
96                                                NaN   
97  Herbal max BENEFITS OF GREEN COFFEE Discov er ...   
98  166 Thick High Grade 304 Stainless Steel Anti-...   
99  NEW OOs FooD ONLT # NEW RPURINAP @ SUPERCOAT '...   

                                       tesseract_text    entity_value  \
0                                                 NaN      500.0 gram   
1   GEPRAGTES  par Sitaram]  a UND GESCHUTZTE DESIGNS         1.0 cup   
2   Serving Size: 1 Tablet (0.709 g) | Each servin...      0.709 gram   
3                      

In [15]:
test_sample = {
    'easyocr_text': "12345.6 la",
    'tesseract_text': "1,234.56 kg",
    'entity_value': "1234.56 kilogram",
    'entity_name': "weight"
}
# Convert the test sample into a pandas Series (like a single row)
test_row = pd.Series(test_sample)

# Call the function to find answer indices for the single sample
answer_start_easyocr, answer_start_tesseract = find_answer_indices(test_row)

# Output the results for the single test case
print(f"Answer start index in easyocr_text: {answer_start_easyocr}")
print(f"Answer start index in tesseract_text: {answer_start_tesseract}")

Answer start index in easyocr_text: 0
Answer start index in tesseract_text: 11


In [18]:
total_values = df.count()
print(total_values)

image_link                263859
group_id                  263859
entity_name               263859
entity_value              263859
image_name                263859
easyocr_text              105978
tesseract_text            124498
answer_start_easyocr      263859
answer_start_tesseract    263859
dtype: int64


In [20]:
import csv
import json
import pandas as pd

def generate_json_from_dataframe(df):
    train_data = []
    
    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        # Convert NaN values to empty strings
        easyocr_text = str(row['easyocr_text']) if pd.notna(row['easyocr_text']) else ''
        tesseract_text = str(row['tesseract_text']) if pd.notna(row['tesseract_text']) else ''
        
        # Concatenate the context from easyocr_text and tesseract_text
        context = easyocr_text + ' ' + tesseract_text
        
        question = f"What is the value of the {row['entity_name']}?"
        entity_value = row['entity_value']
        
        # Check if both answer_start_easyocr and answer_start_tesseract are -1
        if row['answer_start_easyocr'] == -1 and row['answer_start_tesseract'] == -1:
            is_impossible = True
            answers = [{"text": "not possible", "answer_start": -1}]
        else:
            is_impossible = False
            # Collect answer starting indexes that are not -1
            answer_starts = [row['answer_start_easyocr'], row['answer_start_tesseract']]
            answer_starts = [start for start in answer_starts if start != -1]
            answers = [{"text": entity_value, "answer_start": start} for start in answer_starts]
        
        # Create the Q&A structure
        qas = {
            "id": f"{idx+1:05}",
            "is_impossible": is_impossible,
            "question": question,
            "answers": answers if not is_impossible else []
        }
        
        # Append the context and Q&A to the training data
        train_data.append({
            "context": context,
            "qas": [qas]
        })
    
    return train_data


# Generate the JSON data from the DataFrame
json_data = generate_json_from_dataframe(df)

# Write to JSON file
with open("train.json", "w", encoding='utf-8') as json_file:
    json.dump(json_data, json_file, ensure_ascii=False, indent=4)

print("JSON data created successfully!")


JSON data created successfully!
