### Basic library imports

In [3]:
import os
import pandas as pd

### Read Dataset

In [4]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

### Run Sanity check using src/sanity.py

In [5]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [6]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'yard', 'inch', 'metre', 'millivolt', 'microgram', 'ounce', 'kilovolt', 'milligram', 'kilogram', 'quart', 'decilitre', 'ton', 'kilowatt', 'watt', 'litre', 'pound', 'centilitre', 'microlitre', 'pint', 'cubic inch', 'volt', 'gram', 'foot', 'gallon', 'cubic foot', 'fluid ounce', 'imperial gallon', 'centimetre', 'cup', 'millimetre', 'millilitre'}


### Splitting the test dataset

In [7]:
from utils import download_images
from sklearn.model_selection import StratifiedShuffleSplit

In [8]:
cd ..

/workspaces/Solutions/student_resource 3


In [9]:
%pwd

'/workspaces/Solutions/student_resource 3'

In [12]:
# Read the test.csv file
df = pd.read_csv('dataset/test.csv')

In [13]:
df["entity_name"].value_counts()

entity_name
height                           32282
depth                            28146
width                            26931
item_weight                      22032
maximum_weight_recommendation     7028
voltage                           5488
wattage                           5447
item_volume                       3833
Name: count, dtype: int64

In [14]:
# Number of rows in each split
split_size = 30000

In [15]:
# Split the dataframe into 4 parts
df_part1 = df.iloc[:split_size]
df_part2 = df.iloc[split_size:2*split_size]
df_part3 = df.iloc[2*split_size:3*split_size]
df_part4 = df.iloc[3*split_size:]

In [16]:
# Save each part to a new CSV file
df_part1.to_csv('test_part1.csv', index=False)
df_part2.to_csv('test_part2.csv', index=False)
df_part3.to_csv('test_part3.csv', index=False)
df_part4.to_csv('test_part4.csv', index=False)

print("CSV has been split into 4 parts successfully!")

CSV has been split into 4 parts successfully!


In [14]:
# Save the stratified sample to a new CSV file
stratified_sample.to_csv('stratified_sample.csv', index=False)

In [15]:
# The folder where images will be downloaded
download_folder = 'downloaded_images/'

In [16]:
import cv2
from multiprocessing import Pool
from utils import download_images, parse_string, common_mistake
import constants
from PIL import Image
from tqdm import tqdm
from functools import partial

In [17]:
# Step 1: Preprocessing function for images with contrast and optional skew correction
def preprocess_image(image_path):
    try:
        image = cv2.imread(image_path)
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Apply CLAHE for contrast enhancement
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced_image = clahe.apply(gray)
        # Apply thresholding
        _, thresh = cv2.threshold(enhanced_image, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Apply median blur to remove noise
        processed_image = cv2.medianBlur(thresh, 3)
        return processed_image
    except Exception as e:
        print(f"Error in preprocessing image {image_path}: {e}")
        return None

## Paddle-OCR

In [18]:
from paddleocr import PaddleOCR

In [19]:
# Initialize the PaddleOCR model once
ocr_paddleocr = PaddleOCR(use_angle_cls=True, lang='en')

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /home/codespace/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:07<00:00, 510kiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /home/codespace/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:03<00:00, 3.10MiB/s]

[2024/09/15 16:49:43] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/codespace/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/codespace/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_t




In [20]:
# Step 2: Extract text from image using PaddleOCR
def extract_text_paddleocr(image_path):
    try:
        preprocessed_image = preprocess_image(image_path)
        if preprocessed_image is None:
            return ""
        result = ocr_paddleocr.ocr(image_path)
        extracted_text = ' '.join([line[1][0] for line in result[0]])
        return extracted_text.strip()
    except Exception as e:
        print(f"PaddleOCR extraction failed for {image_path}: {e}")
        return ""

In [21]:
# Step 3: Process images and save the extracted text
def process_images_paddleocr(stratified_sample_file, download_folder, output_file):
    df = pd.read_csv(stratified_sample_file)
    print("Downloading images...")
    download_images(df['image_link'], download_folder)
    
    print("Extracting text using PaddleOCR...")
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_paddleocr(os.path.join(download_folder, os.path.basename(link))))
    
    # Clean up images after extraction
    for file in os.listdir(download_folder):
        os.remove(os.path.join(download_folder, file))
    
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

In [22]:
# Usage
process_images_paddleocr("stratified_sample.csv", "downloaded_images", "output_paddleocr.csv")

Downloading images...


100%|██████████| 50/50 [00:00<00:00, 143.52it/s]


Extracting text using PaddleOCR...
[2024/09/15 16:50:10] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.3413219451904297
[2024/09/15 16:50:10] ppocr DEBUG: cls num  : 2, elapsed : 0.02020430564880371
[2024/09/15 16:50:10] ppocr DEBUG: rec_res num  : 2, elapsed : 0.09525585174560547
[2024/09/15 16:50:11] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.1657571792602539
[2024/09/15 16:50:11] ppocr DEBUG: cls num  : 5, elapsed : 0.016232728958129883
[2024/09/15 16:50:11] ppocr DEBUG: rec_res num  : 5, elapsed : 0.14435386657714844
[2024/09/15 16:50:11] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.17621970176696777
[2024/09/15 16:50:11] ppocr DEBUG: cls num  : 7, elapsed : 0.029121875762939453
[2024/09/15 16:50:12] ppocr DEBUG: rec_res num  : 7, elapsed : 0.564643144607544
[2024/09/15 16:50:12] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.16322636604309082
[2024/09/15 16:50:12] ppocr DEBUG: cls num  : 4, elapsed : 0.015394449234008789
[2024/09/15 16:50:12] ppocr DEBUG: rec_res num  : 4, elapsed : 0.1090

## Mistral-Testing

In [28]:
from mistralai import Mistral

In [29]:
%pip install -qU langchain_mistralai

Note: you may need to restart the kernel to use updated packages.


In [50]:
os.environ["MISTRAL_API_KEY"]=os.getenv("MISTRAL_API_KEY")

In [51]:
import requests
import time
from langchain_mistralai import ChatMistralAI

In [52]:
llm = ChatMistralAI(model="mistral-large-latest")

In [53]:
llm.invoke("who is narender modi?")

AIMessage(content="Narendra Modi is an Indian politician who has been serving as the Prime Minister of India since 2014. Here are some key points about him:\n\n1. **Full Name**: Narendra Damodardas Modi\n2. **Birthdate**: September 17, 1950\n3. **Birthplace**: Vadnagar, Gujarat, India\n4. **Political Party**: Bharatiya Janata Party (BJP)\n5. **Previous Offices**:\n   - Chief Minister of Gujarat (2001-2014)\n   - Member of Parliament from Varanasi (2014-present)\n6. **Education**: Modi holds a Bachelor's degree in Political Science from School of Open Learning, University of Delhi, and a Master's degree in Political Science from Gujarat University.\n7. **Policies and Initiatives**: As Prime Minister, Modi has launched several initiatives, including:\n   - Swachh Bharat Abhiyan (Clean India Mission)\n   - Make in India\n   - Digital India\n   - Demonetization (currency note ban) in 2016\n   - Goods and Services Tax (GST) implementation\n8. **International Recognition**: Modi has been rec

In [54]:
# Retry decorator function with increased wait time
def retry_on_rate_limit(func):
    def wrapper(*args, **kwargs):
        retries = 5  # Number of retries
        wait_time = 120  # Start with 2 minutes wait time
        
        for i in range(retries):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    wait_time *= 2  # Exponential backoff
                else:
                    raise e
        raise Exception("Max retries reached. Failed due to rate limiting.")
    return wrapper

In [55]:
from langchain.prompts import PromptTemplate

In [56]:
# Example usage
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

In [57]:
# Function to format the value and unit as required
def format_value_unit(value, unit):
    # Ensure whole numbers are formatted with .0
    if float(value).is_integer():
        value = f"{float(value):.1f}"

    # Normalize the unit based on entity_unit_map
    unit_mapping = {
        "mm": "millimetre",
        "cm": "centimetre",
        "m": "metre",
        "in": "inch",
        "g": "gram",
        "kg": "kilogram",
        "mg": "milligram",
        "oz": "ounce",
        "lb": "pound",
        "ton": "ton",
        "kv": "kilovolt",
        "mv": "millivolt",
        "v": "volt",
        "kw": "kilowatt",
        "w": "watt",
        # Add more as needed based on your entity_unit_map
    }

    # Convert the unit to its full form
    unit = unit_mapping.get(unit.lower(), unit)

    return f"{value} {unit}"

In [58]:
# Function to process the LLM response and apply post-processing
def process_llm_response(response):
    response_text = response.content.strip()

    # If the response includes irrelevant text (e.g., "no mention of value"), return empty string
    if "no mention" in response_text.lower() or not response_text:
        return ""

    # Extract value and unit from the response using regex
    import re
    match = re.search(r'(\d+\.?\d*)\s*([a-zA-Z]+)', response_text)
    if match:
        value, unit = match.groups()
        # Apply formatting
        return format_value_unit(value, unit)
    else:
        return ""

In [59]:
# Function to generate the prompt using LangChain's PromptTemplate
def create_prompt(entity_name, extracted_text, allowed_units):
    prompt_template = PromptTemplate(
        input_variables=["entity_name", "allowed_units", "extracted_text"],
        template="""
        Extract the value and unit for the given entity based on the following text.
        Return only the value and unit in the format: "<value> <unit>".

        ### Entity Name:
        {entity_name}

        ### Allowed Units:
        {allowed_units}

        ### Text:
        "{extracted_text}"

        ### Expected Output:
        """
    )
    return prompt_template.format(
        entity_name=entity_name,
        allowed_units=', '.join(allowed_units),
        extracted_text=extracted_text
    )

In [60]:
# Function to generate the predict column with retry logic
@retry_on_rate_limit
def get_value_from_llm(entity_name, extracted_text):
    allowed_units = entity_unit_map.get(entity_name, [])
    if not allowed_units:
        return ""

    # Generate the prompt using LangChain
    prompt = create_prompt(entity_name, extracted_text, allowed_units)
    
    # Send the prompt to the LLM using .invoke() and get the response
    response = llm.invoke(prompt)  # Correctly call the Mistral LLM
    
    # Post-process the LLM's response
    entity_value = process_llm_response(response)
    
    return entity_value

In [61]:
# Function to process CSV in batches with delay between batches
def process_csv_in_batches(input_csv, output_csv, batch_size=20, wait_time=300):
    df = pd.read_csv(input_csv)

    # Initialize a new column for predictions
    df['predict'] = ""

    # Process in batches
    num_batches = (len(df) // batch_size) + 1

    for batch_num in range(num_batches):
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(df))
        batch_df = df.iloc[start_idx:end_idx]

        print(f"Processing batch {batch_num + 1}/{num_batches}")

        for idx, row in batch_df.iterrows():
            entity_name = row['entity_name']
            extracted_text = row['Extracted']
            
            # Get the value and unit using the Mistral LLM
            entity_value = get_value_from_llm(entity_name, extracted_text)

            # Insert the generated entity_value into the 'predict' column
            df.at[idx, 'predict'] = entity_value

        # Save after each batch to avoid losing data in case of interruption
        df.to_csv(output_csv, index=False)

        # Wait between batches to avoid hitting the rate limit
        print(f"Batch {batch_num + 1} processed. Waiting {wait_time} seconds before processing the next batch.")
        time.sleep(wait_time)  # Wait before processing the next batch

    print(f"Finished processing {num_batches} batches.")

In [62]:
input_csv = "output_paddleocr.csv"
output_csv = "output_with_predictions.csv"

In [65]:
process_csv_in_batches(input_csv, output_csv, batch_size=50, wait_time=40)

Processing batch 1/21
Batch 1 processed. Waiting 40 seconds before processing the next batch.
Processing batch 2/21
Batch 2 processed. Waiting 40 seconds before processing the next batch.
Processing batch 3/21
Batch 3 processed. Waiting 40 seconds before processing the next batch.
Processing batch 4/21
Batch 4 processed. Waiting 40 seconds before processing the next batch.
Processing batch 5/21
Batch 5 processed. Waiting 40 seconds before processing the next batch.
Processing batch 6/21
Batch 6 processed. Waiting 40 seconds before processing the next batch.
Processing batch 7/21
Batch 7 processed. Waiting 40 seconds before processing the next batch.
Processing batch 8/21
Batch 8 processed. Waiting 40 seconds before processing the next batch.
Processing batch 9/21
Batch 9 processed. Waiting 40 seconds before processing the next batch.
Processing batch 10/21
Batch 10 processed. Waiting 40 seconds before processing the next batch.
Processing batch 11/21
Batch 11 processed. Waiting 40 seco

HTTPStatusError: Error response 429 while fetching https://api.mistral.ai/v1/chat/completions: {"message":"Requests rate limit exceeded"}

## LLM(LLAMA3.1)-Testing

In [22]:
from langchain_community.llms import Ollama

In [25]:
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%##O#- #                                                                                                                                   1.8%#     97.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [26]:
import os
import threading
import subprocess
import requests
import json

def ollama():
    os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
    os.environ['OLLAMA_ORIGINS'] = '*'
    subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

In [27]:
!ollama run llama3.1

2024/09/15 09:39:49 routes.go:1125: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://0.0.0.0:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/codespace/.ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[* http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_RUNNERS_DIR: OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES:]"
time=2024-09-15T09:39:49.662Z level=INFO source=images.go:753 msg="total blobs: 5"
time=2024-09-15T09:39:49.662Z level=INFO source=images.go:760 msg=

[GIN] 2024/09/15 - 09:40:04 | 200 |      32.109µs |       127.0.0.1 | HEAD     "/"
[GIN] 2024/09/15 - 09:40:04 | 200 |   20.205479ms |       127.0.0.1 | POST     "/api/show"
INFO [main] build info | build=1 commit="8962422" tid="132312726518720" timestamp=1726393204
INFO [main] system info | n_threads=2 n_threads_batch=2 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="132312726518720" timestamp=1726393204 total_threads=4
INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="6" port="42013" tid="132312726518720" timestamp=1726393204
[?25l⠙ [?25h

time=2024-09-15T09:40:04.723Z level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu_avx cpu_avx2 cuda_v11 cuda_v12 rocm_v60102 cpu]"
time=2024-09-15T09:40:04.723Z level=INFO source=gpu.go:200 msg="looking for compatible GPUs"
time=2024-09-15T09:40:04.740Z level=INFO source=gpu.go:347 msg="no compatible GPUs were discovered"
time=2024-09-15T09:40:04.740Z level=INFO source=types.go:107 msg="inference compute" id=0 library=cpu variant=avx2 compute="" driver=0.0 name="" total="15.6 GiB" available="7.7 GiB"
time=2024-09-15T09:40:04.817Z level=INFO source=server.go:101 msg="system memory" total="15.6 GiB" free="7.7 GiB" free_swap="0 B"
time=2024-09-15T09:40:04.818Z level=INFO source=memory.go:326 msg="offload to cpu" layers.requested=-1 layers.model=33 layers.offload=0 layers.split="" memory.available="[7.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="5.8 GiB" memory.required.partial="0 B" memory.required.kv="1.0 GiB" memory.required.allocations="[5.8 GiB]" memory.weights.

[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠸ [?25h

llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  27:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  28:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   66 tensors
llama_model_loader: - type q4_0:  225 tensors
llama_model_loader: - type q6_K:    1 tensors
time=2024-09-15T09:40:05.071Z level=INFO source=server.go:624 msg="waiting for server to become available" status="llm server loading model"


[?25l[2K[1G⠸ [?25h[?25l[2K[1G⠴ [?25h[?25l[2K[1G⠦ [?25h

llm_load_vocab: special tokens cache size = 256
llm_load_vocab: token to piece cache size = 0.7999 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = llama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128256
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 131072
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_layer          = 32
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1

[?25l[2K[1G⠧ [?25h[?25l[2K[1G⠇ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠙ [?25h[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠸ [?25h[?25l[2K[1G⠼ [?25h[?25l[2K[1G⠴ [?25h[?25l[2K[1G⠴ [?25h[?25l[2K[1G⠧ [?25h[?25l[2K[1G⠇ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠋ [?25h[?25l[2K[1G⠋ [?25h[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠼ [?25h[?25l[2K[1G⠼ [?25h[?25l[2K[1G⠦ [?25h[?25l[2K[1G⠧ [?25h[?25l[2K[1G⠇ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠋ [?25h[?25l[2K[1G⠙ [?25h[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠸ [?25h[?25l[2K[1G⠼ [?25h[?25l[2K[1G⠴ [?25h[?25l[2K[1G⠦ [?25h[?25l[2K[1G⠧ [?25h[?25l[2K[1G⠇ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠋ [?25h[?25l[2K[1G⠙ [?25h[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠸ [?25h[?25l[2K[1G⠼ [?25h[?25l[2K[1G⠼ [?25h[?25l[2K[1G⠴ [?25h[?25l[2K[1G⠧ [?25h[?25l[2K[1G⠇ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠏ [?25h[?25l[2K[1G⠙ [?25h[?25l[2K

llama_new_context_with_model: n_ctx      = 8192
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 500000.0
llama_new_context_with_model: freq_scale = 1


[?25l[2K[1G⠋ [?25h[?25l[2K[1G⠙ [?25h[?25l[2K[1G⠹ [?25h[?25l[2K[1G⠸ [?25h[?25l[2K[1G⠸ [?25h[?25l[2K[1G⠼ [?25h

llama_kv_cache_init:        CPU KV buffer size =  1024.00 MiB
llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     2.02 MiB
llama_new_context_with_model:        CPU compute buffer size =   560.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 1


[?25l[2K[1G⠴ [?25h[?25l[2K[1G⠧ [?25hINFO [main] model loaded | tid="132312726518720" timestamp=1726393217
[?25l[2K[1G⠇ [?25h[GIN] 2024/09/15 - 09:40:17 | 200 | 12.864573209s |       127.0.0.1 | POST     "/api/chat"
[?25l[?25l[2K[1G[?25h[2K[1G[?25h[?2004h>>> [38;5;245mSend a message (/? for help)[28D[0m

time=2024-09-15T09:40:17.626Z level=INFO source=server.go:629 msg="llama runner started in 12.81 seconds"


[K
Use Ctrl + d or /bye to exit.
>>> [38;5;245mSend a message (/? for help)[28D[0m[K
>>> [38;5;245mSend a message (/? for help)[28D[0m

In [28]:
llm = Ollama(model="llama3.1:latest")

In [29]:
llm.invoke("who is president of india?")

[GIN] 2024/09/15 - 09:41:01 | 200 | 26.971848962s |             ::1 | POST     "/api/generate"


'The President of India as of my last update in 2021 was Droupadi Murmu. She is the first person from the indigenous people of Jharkhand (the Santal Pargana) and also the second woman to hold this office. Her tenure began on July 25, 2022. Prior to her presidency, she served as the Governor of Jharkhand from May 2015 until July 2021.'

In [30]:
# Function to generate the prompt for LLaMA 3.1
def generate_prompt(entity_name, extracted_text, allowed_units):
    return f"""
    Extract the value and unit for the given entity based on the following text.
    Return only the value and unit in the format: "<value> <unit>".

    ### Entity Name:
    {entity_name}

    ### Allowed Units:
    {', '.join(allowed_units)}

    ### Text:
    "{extracted_text}"

    ### Expected Output:
    """

In [31]:
# Function to query LLaMA 3.1 via Ollama
def query_llama_ollama(prompt):
    response = llm(prompt)  # Using langchain Ollama integration
    return response.strip()

In [32]:
# Function to process CSV with LLaMA 3.1 via Ollama
def process_csv_with_llama(input_csv, output_csv):
    df = pd.read_csv(input_csv)

    # Initialize a new column for predictions
    df['predict'] = ""

    # Loop over each row and generate the predict value
    for idx, row in df.iterrows():
        entity_name = row['entity_name']
        extracted_text = row['Extracted']
        
        allowed_units = entity_unit_map.get(entity_name, [])
        if not allowed_units:
            df.at[idx, 'predict'] = ""
            continue

        # Generate the prompt for each row
        prompt = generate_prompt(entity_name, extracted_text, allowed_units)
        
        # Get the prediction from LLaMA 3.1 via Ollama
        response = query_llama_ollama(prompt)
        
        # Insert the response into the predict column
        df.at[idx, 'predict'] = response

    # Save the updated DataFrame to a new CSV
    df.to_csv(output_csv, index=False)

In [33]:
# Example entity_unit_map
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

In [34]:
# Example usage
input_csv = "output_paddleocr.csv"
output_csv = "output_with_predictions.csv"

In [35]:
# Process the CSV and generate the predict column using LLaMA 3.1 through Ollama
process_csv_with_llama(input_csv, output_csv)

  response = llm(prompt)  # Using langchain Ollama integration


[GIN] 2024/09/15 - 09:41:21 | 200 | 20.361383189s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:41:40 | 200 | 18.621085989s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:42:17 | 200 | 37.669356193s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:43:11 | 200 | 53.277146063s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:43:26 | 200 | 15.825207096s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:43:47 | 200 | 20.349248119s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:43:58 | 200 | 11.145292842s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:44:37 | 200 |  38.98120871s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:45:07 | 200 | 30.221182432s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:45:19 | 200 | 11.565085136s |             ::1 | POST     "/api/generate"
[GIN] 2024/09/15 - 09:45:41 | 200 | 21.781266882s 

## Evaluation

In [36]:
def calculate_f1_score(df, entity_value_col='entity_value', predict_col='predict'):
    # Initialize counts
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0

    # Iterate through each row in the DataFrame
    for idx, row in df.iterrows():
        GT = row[entity_value_col]
        OUT = row[predict_col]

        # Apply classification logic based on GT and OUT
        if OUT != "" and GT != "":
            if OUT == GT:
                true_positives += 1  # True Positive
            else:
                false_positives += 1  # False Positive (OUT != GT)
        elif OUT != "" and GT == "":
            false_positives += 1  # False Positive (OUT != "" but GT is "")
        elif OUT == "" and GT != "":
            false_negatives += 1  # False Negative (OUT is "" but GT != "")
        elif OUT == "" and GT == "":
            true_negatives += 1  # True Negative

    # Calculate precision and recall
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

    # Calculate F1 score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    # Print results
    print(f"True Positives: {true_positives}")
    print(f"False Positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")
    print(f"True Negatives: {true_negatives}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

    return f1_score


In [37]:
# Example usage
df = pd.read_csv("output_with_predictions.csv")  # Load your DataFrame
f1_score = calculate_f1_score(df)

True Positives: 0
False Positives: 50
False Negatives: 0
True Negatives: 0
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
