In [1]:
import os
import json
from tqdm import tqdm

from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from fuzzywuzzy import fuzz
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Matching hyper-parameters (read by get_matched_regions) ---
MAX_LINE_MATCHES = 10     # keep at most this many top-scoring OCR regions per question
CUT_OFF_THRESHOLD = 70    # minimum combined score for a non-exact match to be kept
QUESTION_WEIGHT = 0.2     # share of the combined score taken from question-term similarity
ANSWER_WEIGHT = 0.8       # share of the combined score taken from answer similarity
LEVEL = "line"            # NOTE(review): not referenced by the visible cells - confirm it is still needed

# --- Input / output locations ---
# Annotation file that is loaded and iterated as {image_name: [qa, ...]}.
JSON_FILE = "/data/BADRI/FINAL/THESIS/GRVQA/main/outputs/json/filtered_grounding_annotations.json"
# Directory the image names from JSON_FILE are joined onto.
IMG_DIR = "/data/BADRI/FINAL/THESIS/GRVQA/ANNOTATION/final/"
# Destination of the annotations enriched with LLM line-level predictions.
OUTPUT_JSON_FILE = "/data/BADRI/FINAL/THESIS/GRVQA/main/outputs/json/algorithm_llama_grounding_annotations.json"

# Words dropped from the question before its terms are fuzzy-matched against OCR text.
stop_words = {
    'what', 'is', 'the', 'this', 'that', 'these', 'those', 'which',
    'how', 'why', 'where', 'when', 'who',
    'will', 'be', 'and', 'or',
    'in', 'at', 'to', 'for', 'of', 'with', 'by',
}


In [3]:
def get_matched_regions(question_text, target_text, predictions):
    """Rank OCR regions by how well their text matches the answer and the question.

    Args:
        question_text: The question string; its non-stop-word terms contribute
            ``QUESTION_WEIGHT`` of the combined score.
        target_text: The answer string to locate in the OCR output.
        predictions: Iterable of region dicts, each with at least a ``'text'`` key.

    Returns:
        A list of at most ``MAX_LINE_MATCHES`` shallow copies of the matching
        regions, sorted by descending ``'match_score'``. Each copy also carries a
        ``'match_details'`` dict. Regions containing the answer verbatim
        (case-insensitively) score 100; other regions are kept only when their
        combined fuzzy score reaches ``CUT_OFF_THRESHOLD``.
        An empty or whitespace-only ``target_text`` yields ``[]``.
    """
    target_lower = target_text.lower()

    # An empty answer is a substring of every string, so without this guard every
    # region would be reported as a score-100 "exact match".
    if not target_lower.strip():
        return []

    question_terms = [
        term
        for term in (word.lower() for word in question_text.split())
        if term not in stop_words
    ]
    target_len = len(target_text)  # >= 1 here, so the division below is safe

    matched_regions = []
    for region in predictions:
        region_text = region['text']
        region_lower = region_text.lower()

        # Verbatim containment short-circuits the fuzzy scoring entirely.
        if target_lower in region_lower:
            region_copy = region.copy()
            region_copy['match_score'] = 100
            region_copy['match_details'] = {
                'exact_match': True,
                'answer_score': 100,
                'question_score': 100
            }
            matched_regions.append(region_copy)
            continue

        partial_score = fuzz.partial_ratio(target_lower, region_lower)
        token_score = fuzz.token_set_ratio(target_lower, region_lower)

        # Length factor: prefer regions at least as long as the answer (capped at 50 chars).
        region_len = len(region_text)
        length_factor = min(1.0, region_len / min(50, target_len))

        # Longer regions lean on token matching; short ones give more weight to length.
        if region_len > 10:
            answer_score = (partial_score * 0.3) + (token_score * 0.5) + (length_factor * 100 * 0.2)
        else:
            answer_score = (partial_score * 0.3) + (token_score * 0.4) + (length_factor * 100 * 0.3)
            if region_len < 5 and partial_score < 100:
                answer_score *= 0.5  # penalize very short inexact matches

        # NOTE(review): this compounds with the penalty above, so a region shorter
        # than 5 characters with an inexact match is scaled by 0.25 overall.
        # Kept as-is because CUT_OFF_THRESHOLD was presumably tuned against it - confirm.
        if region_len < 5:
            answer_score *= 0.5

        # Best similarity of any single question term, under both fuzzy measures.
        best_partial_question = max(
            (fuzz.partial_ratio(term, region_lower) for term in question_terms), default=0
        )
        best_token_question = max(
            (fuzz.token_set_ratio(term, region_lower) for term in question_terms), default=0
        )
        question_score = (best_partial_question * 0.4) + (best_token_question * 0.6)

        # Answer similarity dominates; the question only nudges the ranking.
        combined_score = (answer_score * ANSWER_WEIGHT) + (question_score * QUESTION_WEIGHT)

        if combined_score >= CUT_OFF_THRESHOLD:
            region_copy = region.copy()
            region_copy['match_score'] = combined_score
            region_copy['match_details'] = {
                'exact_match': False,
                'answer_score': answer_score,
                'question_score': question_score,
                'answer_weight': ANSWER_WEIGHT,
                'question_weight': QUESTION_WEIGHT
            }
            matched_regions.append(region_copy)

    matched_regions.sort(key=lambda x: x['match_score'], reverse=True)
    return matched_regions[:MAX_LINE_MATCHES]
        

In [4]:
def load_llm_model(device):
    """Create the Llama 3.1 8B Instruct text-generation pipeline on ``device``.

    Args:
        device: Device specifier forwarded unchanged to ``transformers.pipeline``.

    Returns:
        The text-generation pipeline object.
    """
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    return pipeline("text-generation", model=model_id, device=device)

def generate_llm_answer(question, answer, context, pipe,
                        max_new_tokens=1024, do_sample=True, temperature=0.7):
    """Ask the LLM which OCR lines ground ``answer`` for ``question``.

    Args:
        question: The question text inserted into the prompt.
        answer: The reference answer inserted into the prompt.
        context: Candidate OCR lines; interpolated into the prompt with ``str()``
            formatting, so a list appears as its Python repr.
        pipe: Chat-capable callable (e.g. a transformers text-generation pipeline)
            invoked as ``pipe(messages, **generation_kwargs)``.
        max_new_tokens: Upper bound on generated tokens.
        do_sample: Whether to sample; pass ``False`` for reproducible greedy decoding.
        temperature: Sampling temperature; only forwarded when ``do_sample`` is true.

    Returns:
        The raw text content of the assistant's reply (expected, but not
        guaranteed, to be a JSON list of ``line_bbox`` / ``relevance`` objects).
    """
    prompt = f"""You are analyzing text extracted from an image. Find the line(s) of text that best matches the answer to the question.


    Question: {question}
    Answer: {answer}
    Context: {context}

    Analyze the document text and determine which line(s) of text are most relevant to answering the question with respect to the answer based on the context.
    For each line that's relevant, provide a relevance score from 0 to 1 where 1 means highly relevant.
    Also the lines are at max 3 lines.
    Strictly ensure to only provide the line bbox of the relavant lines and the relevance score in JSON-like format and no other textual content, for example:
    [
    {{"line_bbox": "1 1 2 2", "relevance": 0.9}},
    {{"line_bbox": "2 3 4 4", "relevance": 0.4}}
    ]
    """

    messages = [{"role": "user", "content": prompt}]

    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        # Temperature is meaningless for greedy decoding, so only pass it when sampling.
        generation_kwargs["temperature"] = temperature

    result = pipe(messages, **generation_kwargs)

    # The pipeline returns the chat history with the assistant reply appended;
    # taking the last message does not depend on how many input messages were sent.
    return result[0]["generated_text"][-1]["content"]

In [5]:
# Llama text-generation pipeline placed on the "cuda" device; used later to pick grounding lines.
pipe = load_llm_model("cuda")

# docTR OCR predictor (db_resnet50 detection + crnn_vgg16_bn recognition) with pretrained weights.
model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.68it/s]
Device set to use cuda


In [6]:


def _to_pixel_bbox(geometry, dim):
    """Scale a relative ((x1, y1), (x2, y2)) geometry by ``dim`` into [x1, y1, x2, y2].

    Values are converted with ``float()`` before rounding so the helper works for
    both NumPy scalars and plain Python floats.
    """
    (rel_x1, rel_y1), (rel_x2, rel_y2) = geometry
    scale_x, scale_y = dim
    return [
        round(float(rel_x1 * scale_x), 2),
        round(float(rel_y1 * scale_y), 2),
        round(float(rel_x2 * scale_x), 2),
        round(float(rel_y2 * scale_y), 2),
    ]


def _extract_line_predictions(ocr_result):
    """Flatten an OCR result into one dict per line: bbox, joined text and per-word boxes."""
    lines = []
    for page in ocr_result.pages:
        # page.dimensions is reversed so that x is scaled by the first entry and y by
        # the second (presumably (height, width) -> (width, height); confirm in docTR docs).
        dim = tuple(reversed(page.dimensions))
        for block in page.blocks:
            for line in block.lines:
                words_data = [
                    {'bbox': _to_pixel_bbox(word.geometry, dim), 'text': word.value}
                    for word in line.words
                ]
                lines.append({
                    'bbox': _to_pixel_bbox(line.geometry, dim),
                    'text': " ".join(word['text'] for word in words_data),
                    'words': words_data,
                })
    return lines


def _parse_llm_json(raw_answer):
    """Parse the LLM reply as JSON, tolerating extra text around a JSON array.

    Raises:
        json.JSONDecodeError: if neither the full reply nor its outermost
            ``[...]`` span is valid JSON.
    """
    try:
        return json.loads(raw_answer)
    except json.JSONDecodeError:
        # Models often wrap the array in code fences or add a sentence around it.
        start, end = raw_answer.find('['), raw_answer.rfind(']')
        if start == -1 or end <= start:
            raise
        return json.loads(raw_answer[start:end + 1])


def _save_json(payload, path):
    """Write ``payload`` to ``path`` via a temp file so an interrupted run cannot truncate it."""
    tmp_path = path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)
    os.replace(tmp_path, path)


with open(JSON_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

for image_name, qa_data in tqdm(data.items()):
    IMG_PATH = os.path.join(IMG_DIR, image_name)
    doc = DocumentFile.from_images(IMG_PATH)
    result = model(doc)

    # OCR once per image; every question for that image reuses the same lines.
    predictions = _extract_line_predictions(result)

    for qa in tqdm(qa_data):
        question = qa['question']
        answer = qa['answer']
        top_k_matches = get_matched_regions(question, answer, predictions)

        # Each candidate is serialized as "x1 y1 x2 y2 <line text>" for the prompt.
        matched_data = [
            " ".join(str(coord) for coord in match['bbox']) + " " + match['text']
            for match in top_k_matches
        ]

        grounding_answer = generate_llm_answer(question, answer, matched_data, pipe)
        try:
            qa['line_level_predictions'] = _parse_llm_json(grounding_answer)
        except ValueError:
            # Keep the raw reply so it can be inspected later, and report which QA failed.
            qa['line_level_predictions'] = grounding_answer
            print(qa.get('id'))

        # Checkpoint after every QA so a long run can be inspected or resumed.
        _save_json(data, OUTPUT_JSON_FILE)

_save_json(data, OUTPUT_JSON_FILE)

  0%|          | 0/69 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 9/9 [00:32<00:00,  3.61s/it]
  1%|▏         | 1/69 [00:34<38:59, 34.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_

fp_26_8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 12/12 [00:38<00:00,  3.20s/it]
 32%|███▏      | 22/69 [09:37<24:43, 31.57s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 9/9 [00:24<0