In [4]:
import os
import requests
from PIL import Image
import pytesseract
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

# Locate the Tesseract OCR engine. pytesseract is only a Python wrapper around
# the external `tesseract` executable, so pip-installing the package (which is
# already imported above) can never provide the engine itself.
_DEFAULT_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Adjust path if needed
if not os.path.exists("tesseract.exe") and os.path.isfile(_DEFAULT_TESSERACT_CMD):
    # No local copy of the engine: point pytesseract at the default Windows
    # install, but only when it really exists. Otherwise pytesseract keeps its
    # default command name and resolves the engine through PATH.
    pytesseract.pytesseract.tesseract_cmd = _DEFAULT_TESSERACT_CMD

# Download the spaCy English model (large recommended for accuracy) when it is
# not installed yet. `spacy.util.get_data_path` never accepted a model name, so
# it cannot tell whether the model is present; `is_package` checks for the
# installed model package instead. `spacy.cli.download` runs the download with
# the current interpreter rather than whatever `python` happens to be on PATH.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

# Load allowed units from constants.py (assuming it's defined).
# extract_entities keeps a match only when its last word is in this collection.
# NOTE(review): ALLOWED_UNITS is neither defined nor imported in the code
# visible here, so this line raises NameError as written. Presumably it should
# be brought in from constants.py; confirm and add the definition or import.
allowed_units = ALLOWED_UNITS  # Replace with actual definition from constants.py

def extract_text_from_image(image_path):
    """
    Extracts text from an image using Tesseract OCR.

    Args:
        image_path: Path of the image file to run OCR on.

    Returns:
        The recognised text with surrounding whitespace stripped, or an empty
        string when the image cannot be opened or OCR fails (the error is
        printed, not raised).
    """
    try:
        # The context manager closes the underlying file handle as soon as OCR
        # is done, so the caller can delete the file afterwards (an open
        # handle blocks deletion on Windows).
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberate best-effort: one unreadable image must not abort a batch.
        print(f"Error extracting text from image: {image_path} ({e})")
        return ""
    return text.strip()

def extract_entities(text):
    """
    Extracts entity values using the spaCy pipeline and custom Matcher patterns.

    Args:
        text: Raw text (for example OCR output) to analyse.

    Returns:
        A list of ``(entity_type, "value unit")`` tuples, one per pattern match
        whose last word is in ``allowed_units``. ``entity_type`` is the name of
        the pattern that matched (``"WEIGHT_PATTERN"`` or ``"VOLUME_PATTERN"``).
    """
    doc = nlp(text)

    # Custom patterns for specific entity types (weight, volume). Token
    # attributes must be string keys; the bare names used previously
    # (LEMMA, TEXT, UNIT) were undefined and raised NameError.
    # NOTE(review): the original second token spec was malformed; it is read
    # here as "keyword, one optional token, then a noun" -- confirm the intent.
    custom_patterns = {
        "WEIGHT_PATTERN": [{"LEMMA": "weight"}, {"OP": "?"}, {"POS": "NOUN"}],
        "VOLUME_PATTERN": [{"LEMMA": "volume"}, {"OP": "?"}, {"POS": "NOUN"}],
    }
    for pattern_name, token_specs in custom_patterns.items():
        # Register each pattern once: the matcher is shared module state, and
        # re-adding on every call would keep appending duplicate patterns.
        if pattern_name not in matcher:
            # spaCy v3 signature: add(key, list_of_patterns).
            matcher.add(pattern_name, [token_specs])

    entities = []
    for match_id, start, end in matcher(doc):
        words = doc[start:end].text.split()
        # match_id is the hash of the pattern name; the vocab's string store
        # maps it back to the readable name.
        entity_type = doc.vocab.strings[match_id]
        entity_value = words[0]  # Extract first word as value
        unit = words[-1]  # Extract last word as unit
        # NOTE(review): the first word of a match is the keyword itself
        # ("weight"/"volume"), not a number -- confirm this is the intended value.
        if unit in allowed_units:
            entities.append((entity_type, f"{entity_value} {unit}"))

    return entities

def download_image(image_url, filename, timeout=30):
    """
    Downloads an image from the given URL.

    Args:
        image_url: URL of the image to fetch.
        filename: Local path the image bytes are written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the request indefinitely.

    Returns:
        True when the server answered with status 200 and the body was written
        to ``filename``; False on any other status code or on any error (the
        failure is printed, not raised).
    """
    try:
        # The context manager releases the streamed connection on every exit
        # path, including the non-200 branch where the body is never consumed.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download image: {image_url} ({response.status_code})")
                return False
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return True
    except Exception as e:
        # Deliberate best-effort: a failed download is reported to the caller
        # through the return value instead of aborting the whole run.
        print(f"Error downloading image: {image_url} ({e})")
        return False

def process_image(image_url, index):
    """
    Processes an image, extracts text and entities, and returns prediction.

    Args:
        image_url: URL of the image to download and analyse.
        index: Identifier of the image; used to name the temporary file and
            returned unchanged alongside the prediction.

    Returns:
        A tuple ``(index, prediction)``. ``prediction`` is the value of the
        first extracted entity, or an empty string when the download fails or
        no entity is found.
    """
    filename = f"image_{index}.jpg"
    if not download_image(image_url, filename):
        return index, ""

    try:
        text = extract_text_from_image(filename)
        entities = extract_entities(text)
    finally:
        # Clean up the downloaded image on every path. Previously the file was
        # left behind whenever no entity was found.
        os.remove(filename)

    if not entities:
        return index, ""

    return index, entities[0][1]  # Use the first extracted entity

def main(input_csv, output_csv):
    """
    Opens the input CSV for reading and writes the prediction header row
    ("index,prediction") to the output CSV.

    NOTE(review): in the code visible here only the header is written; the
    rows of the input file are not read or processed.
    """
    header_row = "index,prediction\n"
    # The input is opened first, so a missing input file fails before the
    # output file is created or truncated.
    with open(input_csv, 'r') as f:
        with open(output_csv, 'w') as out_file:
            out_file.write(header_row)

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\agarw\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.