In [6]:
import pandas as pd
import re

# Load the training CSV into `df`. The code below reads its `easyocr_text`,
# `tesseract_text` and `entity_value` columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file exists at exactly this location.
df = pd.read_csv('/DATA1/ai23mtech12001/Amazon/amazon-ml/dataset/final_train.csv')

## Function to ensure the text is a string and handle NaN values
def safe_str(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ""."""
    if pd.notnull(value):
        return str(value)
    return ""

# Function to extract the number or values before space
def extract_number(entity_value):
    """Return the first whitespace-delimited token of ``entity_value``.

    For values shaped like "500.0 gram" this is the leading number ("500.0").

    Args:
        entity_value: String holding a value followed by its unit.

    Returns:
        The first run of non-whitespace characters. If the string contains no
        such run (empty or whitespace only) it is returned unchanged.
    """
    # search() instead of an anchored match(): leading whitespace must not make
    # the extraction fail and hand back the whole "value unit" string.
    token = re.search(r'\S+', entity_value)
    if token:
        return token.group(0)
    return entity_value  # nothing to extract

# Function to find the answer_start indices in both easyocr_text and tesseract_text
def find_answer_indices(row):
    """Find where the number from ``entity_value`` starts in both OCR texts.

    Args:
        row: Mapping/Series with ``easyocr_text``, ``tesseract_text`` and
            ``entity_value`` entries. Any of them may be missing (NaN/None).

    Returns:
        ``(easyocr_index, tesseract_index)``. The first is an offset into
        ``easyocr_text``; the second is an offset into
        ``easyocr_text + " " + tesseract_text``. Both are -1 unless the number
        occurs in *both* texts.
    """
    easyocr_text = safe_str(row['easyocr_text'])
    tesseract_text = safe_str(row['tesseract_text'])

    # entity_value may be NaN; normalise it before it reaches the regex.
    entity_value_number = extract_number(safe_str(row['entity_value']))

    # str.find("") returns 0, which would be reported as a hit at the very
    # start of each text, so an empty needle is treated as "not found".
    if not entity_value_number:
        return -1, -1

    # Find the number in both texts
    index_easyocr = easyocr_text.find(entity_value_number)
    index_tesseract = tesseract_text.find(entity_value_number)

    # Only report a position when the number is present in both texts
    if index_easyocr == -1 or index_tesseract == -1:
        return -1, -1

    # +1 accounts for the single space joining the two texts
    return index_easyocr, len(easyocr_text) + 1 + index_tesseract

# Apply the function to each row. find_answer_indices returns a 2-tuple per
# row; zip(*...) regroups those pairs into one sequence of first elements and
# one of second elements, which become two new columns on `df` (in place).
df['answer_start_easyocr'], df['answer_start_tesseract'] = zip(*df.apply(find_answer_indices, axis=1))

# Show the first rows of the inputs next to the computed start indices
print(df[['easyocr_text', 'tesseract_text', 'entity_value', 'answer_start_easyocr', 'answer_start_tesseract']].head())

                                        easyocr_text  tesseract_text  \
0  PROPOS' NATURE INGREDIENT MENAGER MULTI-USAGE ...             NaN   
1  TLaeel=_ 7672 Xe RRIFIC LEBENSMITTELECHT Gw DA...             NaN   
2                                                NaN             NaN   
3  3 3 1 1 F IW! 1 5833 1 3 1 1 1 1 H 0 L 1 W # I...             NaN   
4  Horbaach' HIGH StRENGTH PSYLLIUM HUSK PLANTAGO...             NaN   

     entity_value  answer_start_easyocr  answer_start_tesseract  
0      500.0 gram                    -1                      -1  
1         1.0 cup                    -1                      -1  
2      0.709 gram                    -1                      -1  
3      0.709 gram                    -1                      -1  
4  1400 milligram                    -1                      -1  


In [32]:
import re

# Function to extract the number or values before space
def extract_number(entity_value):
    """Return the first whitespace-delimited token of ``entity_value``.

    For values shaped like "500.0 gram" this is the leading number ("500.0").

    Args:
        entity_value: String holding a value followed by its unit.

    Returns:
        The first run of non-whitespace characters. If the string contains no
        such run (empty or whitespace only) it is returned unchanged.
    """
    # search() instead of an anchored match(): leading whitespace must not make
    # the extraction fail and hand back the whole "value unit" string.
    first_token = re.search(r'\S+', entity_value)
    if first_token is None:
        return entity_value  # nothing to extract
    return first_token.group(0)

## Function to ensure the text is a string and handle NaN values
def safe_str(value):
    """Convert ``value`` to ``str``; missing values (None/NaN) become ""."""
    is_present = pd.notnull(value)
    if not is_present:
        return ""
    return str(value)

# Function to clean text by removing symbols and spaces (only retain numbers)
def clean_text(text):
    """Return ``text`` with every non-digit character removed."""
    non_digit = re.compile(r'[^\d]')
    digits_only = non_digit.sub('', text)
    return digits_only

# Function to remove decimal point from entity_value
def clean_entity_value(entity_value):
    """Return ``entity_value`` with all '.' characters dropped."""
    pieces = entity_value.split('.')
    return ''.join(pieces)

# Function to find answer indices in original text, with fallback to cleaned text
def find_answer_indices(row):
    """Locate where the number from ``entity_value`` starts in each OCR text.

    Each text is searched on its own, most specific strategy first:

    1. the number exactly as written in ``entity_value`` (e.g. "500.0");
    2. the number's digits inside the digit-only view of the text ("5000");
    3. the first two of those digits ("50") as a last resort.

    The first strategy that hits decides the position for that text, so a
    precise match is never replaced by a looser one and a hit in one text does
    not depend on the other text.

    Args:
        row: Mapping/Series with ``easyocr_text``, ``tesseract_text`` and
            ``entity_value`` entries. Any of them may be missing (NaN/None).

    Returns:
        ``(easyocr_index, tesseract_index)``. The first is an offset into
        ``easyocr_text``; the second is an offset into
        ``easyocr_text + " " + tesseract_text``. A value is -1 when the number
        could not be located in that text.
    """
    original_easyocr_text = safe_str(row['easyocr_text'])
    original_tesseract_text = safe_str(row['tesseract_text'])

    # entity_value may be NaN; normalise it before it reaches the regex.
    entity_value_number = extract_number(safe_str(row['entity_value']))

    # The cleaned texts contain digits only, so the needle has to be reduced to
    # digits as well; otherwise any leftover symbol makes a match impossible.
    cleaned_entity_value_number = clean_text(clean_entity_value(entity_value_number))

    index_easyocr = _find_value_start(
        original_easyocr_text, entity_value_number, cleaned_entity_value_number
    )
    index_tesseract = _find_value_start(
        original_tesseract_text, entity_value_number, cleaned_entity_value_number
    )

    # Shift the tesseract hit past the easyocr text and the joining space, but
    # only for a real hit so that "not found" stays exactly -1.
    if index_tesseract != -1:
        index_tesseract += len(original_easyocr_text) + 1

    return index_easyocr, index_tesseract


def _find_value_start(text, exact_value, digit_value, prefix_len=2):
    """Return the index in ``text`` where the value starts, or -1 if absent.

    Tries ``exact_value`` verbatim, then ``digit_value`` and finally its first
    ``prefix_len`` digits inside the digit-only view of ``text``.
    """
    # Empty needles are skipped: str.find("") returns 0, a false hit.
    if exact_value:
        exact_index = text.find(exact_value)
        if exact_index != -1:
            return exact_index

    if not digit_value:
        return -1

    cleaned = clean_text(text)
    for needle in (digit_value, digit_value[:prefix_len]):
        cleaned_index = cleaned.find(needle)
        if cleaned_index != -1:
            # May itself be -1 if the mapping fails; that is passed through as
            # "not found" rather than being mixed into an offset.
            return map_cleaned_to_original(cleaned, text, cleaned_index)

    return -1
        

# Function to map cleaned text index back to original text index
def map_cleaned_to_original(cleaned_text, original_text, cleaned_index):
    """Translate an index in the digit-only text into an index in the original.

    Args:
        cleaned_text: Digit-only version of ``original_text``. Not used by the
            lookup; kept so existing call sites remain valid.
        original_text: The text the digits were taken from.
        cleaned_index: Zero-based position among the digits of
            ``original_text``.

    Returns:
        Index in ``original_text`` of the ``cleaned_index``-th digit, or -1
        when ``cleaned_index`` is negative or the text has too few digits.
    """
    if cleaned_index < 0:
        return -1

    digits_seen = 0
    for position, char in enumerate(original_text):
        # isdecimal() accepts the same characters as the regex class \d that
        # clean_text keeps. isdigit() would also count characters such as the
        # superscript two, which \d drops, and shift every later index.
        if char.isdecimal():
            if digits_seen == cleaned_index:
                return position
            digits_seen += 1
    return -1  # fewer digits than requested

# Apply the function to each row. find_answer_indices returns a 2-tuple per
# row; zip(*...) regroups those pairs into one sequence of first elements and
# one of second elements. The assignment writes them to `df` in place,
# replacing any existing columns with these names.
df['answer_start_easyocr'], df['answer_start_tesseract'] = zip(*df.apply(find_answer_indices, axis=1))

# Show the first 100 rows of the inputs next to the computed start indices
print(df[['easyocr_text', 'tesseract_text', 'entity_value', 'answer_start_easyocr', 'answer_start_tesseract']].head(100))

                                         easyocr_text  tesseract_text  \
0   PROPOS' NATURE INGREDIENT MENAGER MULTI-USAGE ...             NaN   
1   TLaeel=_ 7672 Xe RRIFIC LEBENSMITTELECHT Gw DA...             NaN   
2                                                 NaN             NaN   
3   3 3 1 1 F IW! 1 5833 1 3 1 1 1 1 H 0 L 1 W # I...             NaN   
4   Horbaach' HIGH StRENGTH PSYLLIUM HUSK PLANTAGO...             NaN   
..                                                ...             ...   
95                                                NaN             NaN   
96                                                NaN             NaN   
97                                                NaN             NaN   
98  166 Thick High Grade 304 Stainless Steel Anti-...             NaN   
99  NEW OOs FooD ONLT # NEW RPURINAP @ SUPERCOAT '...             NaN   

      entity_value  answer_start_easyocr  answer_start_tesseract  
0       500.0 gram                   431                

In [36]:
# Hand-made row for a quick manual check: neither text contains the value
# "1234.56" verbatim, but both reduce to the digits "123456".
test_sample = {
    'easyocr_text': "12345.6 la",
    'tesseract_text': "1,234.56 kg",
    'entity_value': "1234.56 kg",
    'entity_name': "weight"
}
# Convert the test sample into a pandas Series (like a single row)
test_row = pd.Series(test_sample)

# Call the function to find answer indices for the single sample
answer_start_easyocr, answer_start_tesseract = find_answer_indices(test_row)

# Output the results for the single test case.
# NOTE: the second value is an offset into easyocr_text + " " + tesseract_text
# (len(easyocr_text) + 1 + position), not an index into tesseract_text alone.
print(f"Answer start index in easyocr_text: {answer_start_easyocr}")
print(f"Answer start index in tesseract_text: {answer_start_tesseract}")

Answer start index in easyocr_text: 0
Answer start index in tesseract_text: 11
