In [2]:
import pandas as pd
from transformers import AutoProcessor, MllamaForConditionalGeneration
import torch
import os
from dotenv import load_dotenv  


load_dotenv()

hf_token = os.getenv("HF_TOKEN")  

MODEL_ID = "unsloth/Llama-3.2-11B-Vision"
print("Loading model and processor...")


model = MllamaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16, 
    device_map=None,  
    token=hf_token  
)


model.cuda()

processor = AutoProcessor.from_pretrained(MODEL_ID, token=hf_token)

print("Model fully loaded on CUDA.")


Loading model and processor...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Model fully loaded on CUDA.


In [6]:
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df)} records from {CSV_PATH}")



predictions = []  
ground_truths = [] 

print("Starting inference on images...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    
    rel_img_path = row["image_path"]
    gt_text = row["ocr_text"]
    
    
    full_img_path = os.path.join(IMAGE_DIR, rel_img_path)
    
    
    try:
        image = Image.open(full_img_path).convert("RGB")
    except Exception as e:
        print(f"Error loading image {full_img_path}: {e}")
        continue

Loaded 27 records from /teamspace/studios/this_studio/ocr_results.csv
Starting inference on images...


100%|██████████| 27/27 [00:00<00:00, 69.06it/s]


In [None]:
import os
from PIL import Image

# Specify the path to your single image
image_path = "/teamspace/studios/this_studio/test/india_news_p000141.jpg"

# Attempt to load the image
try:
    image = Image.open(image_path).convert("RGB")
except Exception as e:
    print(f"Error loading image {image_path}: {e}")
    exit(1)

# Define the prompt for text extraction
prompt = "<|image|><|begin_of_text|>Identify and extract all textual content from this newspaper page while preserving the logical reading flow. Ensure that text spanning across images or boxed sections is correctly ordered. Ignore purely decorative elements and prioritize complete sentences. Maintain proper line breaks and paragraph structure."


# Process the image and prompt using the processor, then move the inputs to the model's device
inputs = processor(image, prompt, return_tensors="pt").to(model.device)

# Generate output from the model with a maximum of 1024 new tokens
output = model.generate(**inputs, max_new_tokens=1024)

# Decode the output and remove any surrounding whitespace
extracted_text = processor.decode(output[0]).strip()

# Print the extracted text
print(f"Extracted text: {extracted_text}")


Extracted text: <|begin_of_text|><|image|><|begin_of_text|>Identify and extract all textual content from this newspaper page while preserving the logical reading flow. Ensure that text spanning across images or boxed sections is correctly ordered. Ignore purely decorative elements and prioritize complete sentences. Maintain proper line breaks and paragraph structure. I'm not able to provide information about the image's purpose. 1. The Patents (Amendment) Act, 1999 passed by the Indian Parliament on March 10, 1999 to amend the Patents Act of 1970 that provides for establishment of a mail box system to file patents and accords exclusive marketing rights for 5 years. 2. The Trade Marks Bill, 1999, which repeals and replaces the Trade and Merchandise Marks Act, 1958 passed by the Indian Parliament in the Winter Session that concluded on December 23, 1999. The Copyright (Amendment) Act, 1999 passed by both houses of the Indian Parliament, and signed by the President of India on December 30

In [7]:
import zipfile
import os

def unzip_file(zip_file_path, extract_to):
    """
    Unzips the given zip file to the specified directory.
    
    Args:
        zip_file_path (str): The path to the zip file.
        extract_to (str): The directory where files will be extracted.
    """
    # Ensure the extraction directory exists
    os.makedirs(extract_to, exist_ok=True)
    
    # Open the zip file in read mode
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Extract all the contents into the given directory
        zip_ref.extractall(extract_to)
        print(f"Extracted all files to: {extract_to}")

# Example usage:
if __name__ == "__main__":
    zip_file_path = "/teamspace/studios/this_studio/test.zip"   # Replace with your zip file path
    extract_to = "./"    # Replace with your desired extraction directory
    unzip_file(zip_file_path, extract_to)


Extracted all files to: ./


In [None]:
predictions = []   
ground_truths = [] 

print("Starting inference on images...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    
    rel_img_path = row["image_path"]
    gt_text = row["ocr_text"]
    
    full_img_path = os.path.join(IMAGE_DIR, rel_img_path)
    
 
    try:
        image = Image.open(full_img_path).convert("RGB")
    except Exception as e:
        print(f"Error loading image {full_img_path}: {e}")
        continue

 
    prompt = "<|image|><|begin_of_text|>Identify and extract all textual content from this newspaper page while preserving the logical reading flow. Ensure that text spanning across images or boxed sections is correctly ordered. Ignore purely decorative elements and prioritize complete sentences. Maintain proper line breaks and paragraph structure."



    inputs = processor(image, prompt, return_tensors="pt").to(model.device)


    output = model.generate(**inputs, max_new_tokens=1024)
    

    extracted_text = processor.decode(output[0]).strip()


    predictions.append(extracted_text)
    ground_truths.append(gt_text)
    
    
    print(f"\nImage: {rel_img_path}")
    print(f"Ground Truth: {gt_text}")
    print(f"Extracted:    {extracted_text}")


Starting inference on images...


  4%|▎         | 1/27 [00:40<17:35, 40.59s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000059.jpg
Ground Truth: CLINTON VISIT normalcy has been restored to the bilateral rela- (Continued from page 1) tionship, although some issues still remain to be economic, scientific and technological assistance resolved. President Clinton's forthcoming visit to that, among others, made the Green Revolution India is an opportunity for both countries to in the mid-|960s possible build a new relationship between the world 's two largest democracies in the 2Ist century based The end of the Cold War in the 1990s, coinciding on their common strengths, values and interests. with the  liberalization of the Indian economy, saw a steady improvement in India-US relations Bilateral trade between the two countries has with the Clinton Administration identifying India exceeded US $12 billion in 1999. There was a as one of the 10 major emerging markets. The decline in FDI inflows from the US in 1998 (US last few years have witnessed a number o

  7%|▋         | 2/27 [01:20<16:38, 39.95s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000026.jpg


 11%|█         | 3/27 [01:59<15:53, 39.74s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000077.jpg
Ground Truth: 1 3 U.S. President Clinton and his daughter Chelsea paying homage at the Mahatma Gandhi Memorial, Rajghat in New Delhi on 83 March 21, 2000. INDIAN PRESS MONITOR (Continued from page II) H Clinton's Productive Visit For the first time, the world's most populous and most powerful democracies have an agreed vision of the way to foster closer and more cooperative bilateral relations over decade or two_ Moreover, the vision statement is 1 17 not just an essay in rhetoric; practicality is its hallmark: Above all, an institutional mechanism has been put in place to ensure that the promise is matched by performance: 2 What makes this declaration of intent credible is that neither side has made any attempt to hide or paper over 3 differences that persist, most notably on the crucial and sensitive nuclear issue. Mr Clinton has made it clear that while India is the best judge of what it should do about its security,

 15%|█▍        | 4/27 [02:39<15:11, 39.65s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000087.jpg
Ground Truth: India does not want an arms race: External Affairs Minister Jaswant Singh India's Minister of External Affairs Jaswant Singh said India's borders and in the regions with which it has India has no intention of engaging in any arms race: increased interactions:  He added that India only wants to maintain a credible deterrent nuclear defense capability. we are ASEAN's geographical neighbors with greater interactions on economic, political and secu- Mr. Jaswant Singh, gave the reassurance in his speech rity interests than ever before_ The engagement of a on  "India and ASEAN: Security Paradigms for AD militarily stronger, economically prosperous, demo- 2000" organized by the Institute of Defense and Stra - cratic and secular India imparts greater stability to tegic Studies his visit to Singapore. India plays the region, " he said: a strategic  role in regional  stability, asserted Mr. Jaswant Singh, but it doe

 19%|█▊        | 5/27 [03:18<14:31, 39.60s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000095.jpg
Ground Truth: India Press Monitor Wired to the World sector will account for 35 per cent the focus of its software exports Leveraging Power of the total exports from  India; from the low-end to the high-end, there are a potential 2.2 million then India can be a $IOO billion Through IT jobs in IT by 2008; the IT sector player in the IT world by 2008. According to the recently released will attract  foreign direct  invest- By Akshay Joshi NASSCOM-McKinsey Study on ment (FDI) of $4-5 billion; and the Times of India Indian IT   strategies, Indian and overall revenues from the IT sec - 2000 India-centric companies have Op tor will be nearly 90 billion in- portunities in four broad areas cluding $50 billion in exports: The value-added IT services, software IT task force is trying to radically Underestimating India products, IT-enabled services and enhance per capita productivity of The situation in southern Asia e-business: T

 22%|██▏       | 6/27 [03:58<13:50, 39.56s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000019.jpg
Ground Truth: The Election Commission Role shall be determined by the President as per Under Article 324 of the Indian constitution the rules, and subject to the provisions of any law Election Commission is vested with the role of made by Parliament: supervision, direction and control of prepara- tion of the electoral rolls and the conduct of, The President or the Govemor of a State, will all elections to Parliament and to the Legisla- make available as many staff as is required by ture of every state and of elections to the offices the Election Commissioner  the  discharge of of the President and Vice President: duties conferred on the Election Commission as stated above: Generel Electlons 1952 1990 Theerehasbeenamarked increase in tt total number of candidates contesting in the elections. While in 1952 there were 1864 General Electoral Roll and Elections candidates for 489 seats, the number steadily increased to 4,620

 26%|██▌       | 7/27 [04:37<13:10, 39.53s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000091.jpg
Ground Truth: Embassy In the continuing celebration of Soth anniversary of India aS a Republic, a series of events have been organised by the Embassy over the last few weeks: Surbahar and Pakhawaj: Shubha Sankaran and Peter Fagiola regaled a large audience with  their performance on these unique instruments at the World Bank auditorium on April 29. Shubha Sankaran who studied instrumental music with Ustad Imrat Khan, has performed in the United States and India as well as other countries and is a regular on radio and television. Peter Fagiola, a disciple of late Pandit Taranath Rao, is known for his diversity as a percussionist and is equally at home with pakhawaj and tabla: Memphis in May Festival: India was chosen for this turn of the millenium festival in Memphis held 4-30. The festival included exhibitions of Indian art and artifacts, a photo exhibition on River Ganga, a trade fair organised by the ITPO with the par

 30%|██▉       | 8/27 [05:17<12:31, 39.55s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000090.jpg
Ground Truth: NEWS IN BRIEF (Continued from page 8) the meeting with the chief The delegation impressed upon India Exempt executives of these  companies, international companies to invest Phormaceutical Componies Information Technology Minister in the field of research and de- Mr. Pramod Mahajan said that velopment in India. The delega- From Environment Cledronce government was committed to prO- tion focused on India's capabil- India's Chemical and Fertilizers mote collaboration and diversifi- ity to emerge as a major R&D ministry has favored the exemp- cation and facilitate an enabling hub, attracting foreign investment tion of pharmaceutical units with climate through a series of ongo- into the chemical sector, technol- less than US: $1.2I million in fiscal and other policy mea- ogy transfer and strategic collabo- investments from obligatory envi- sures. He   said Government was rations: ronmental clearance and will s

 33%|███▎      | 9/27 [05:56<11:51, 39.52s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000099.jpg
Ground Truth: U.S. and India neonatal and pediatric mor- Reproductive health including (Continued from page I) bidity and mortality (including socio-behavioral aspects, birth toxemia, hemorrhage, sepsis, practices and development and the National   Institute on birth asphyxia and trauma_ evaluation of newer contracep- Abuse; the Fogarty International accidents, diarrhea and other tives; Center; and other agencies of the infectious diseases, etc ): Taditional systems of medi- U.S:. Government; including the Prevention and treatment of cine and practices which have Centers for Disease Control and low birth weight and intrau- an important impact on the Prevention and the U.S: Agency terine growth retardation; health and well-being of for International Development: women and children adoles - Maternal and child nutrition Under the agreement, both the and the role of micronutrients cents; NIH and ICMR are the primary in heal

 37%|███▋      | 10/27 [06:36<11:12, 39.55s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000130.jpg
Ground Truth: President Clinton's visit to India in March 2000 the follow-up March 2000: Minister of Commerce and Industry and U.S. Commerce Secretary constituted the India-U.S. Commercial Dialogue the visit of President Clinton: April 2000: The India-US: Financial and Economic Forum was constituted the visit of Finance Minister to Washington: The High Level coordinating group for Indo-U.S: Economic Dialogue has been constituted on the Indian side The U.S. has to communicate membership of ihe High Level coordinating group on the U.S. side: 2000: Mr: Thomas Pickering U.S. under Secretary of State for Political Affairs, visited India for the Foreign Office Consultations and the Asian Security Dialogue_with Foreign Secretary Lalit Mansingh. Extensive consultations were also held on Sri Lanka, Fiji and Sierra Leone. 2000: Discussions were held in in Washington on Mutual Legal Assistance June 2000: External Affairs Minister 

 41%|████      | 11/27 [07:15<10:32, 39.55s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000058.jpg
Ground Truth: India-U.S  Space Cooperation: Reaching for new frontiers In the early 1960's, the United States had offered substantial assistance to India in setting up an Equatorial Rocket Launching Station at Thumba (TERLS) : Subsequently, India dedicated this facility to the United Nations in 1968. Since then, scientists from various countries have launched more than 3000 sounding rockets for research purposes. During 1975-76, under a collaborative bilateral agreement, an experiment, Satellite Instrumental Television Experiment (SITE) was conducted. Under this agreement, a U.S. satellite, ATS-6, beamed educational programs to direct reception television sets to 2400 far flung villages exposing them to a new and immensely powerful medium of television. Anuradha, an Indian experiment for cosmic ray studies was of NASA's third Spacelab mission. The Indian Institute of Geomagnetism (IIG) and Survey of India have made use 

 44%|████▍     | 12/27 [07:55<09:53, 39.57s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000015.jpg
Ground Truth: PRIME MNISTER'S ADDRESS measures that we consider necessary in our na- tional Interest; from page 9 Also the world has seen that whatever we will do will be for self-defence, never for aggression. Our National Income has grown by 6 percent; Having crossed 200 million tonnes, production of But the world has also seen that we are capable of foodgrains is higher today than ever before; doing s0 with utmost restraint;with utmost respon- Food stocks are higher today than ever. For this sibility: These were the principles, which guided us our kisans deserve our felicitations. Our agricul- when Pakistan forced the Kargil war on us. Our re- tural scientists are also worthy of our praise: sponse was well-thought out: It was s0 effective that Industrial production is reviving at an energetic it left the enemy stunned: Pace; The world today has well realized that we would do The new initiatives which have been taken 

 48%|████▊     | 13/27 [08:35<09:14, 39.60s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000100.jpg
Ground Truth: Visit of INS Mysore INS Mysore, an Indian guided missile destroyer; participated in the International : Naval Review on July 4, 2000 in New York A of the US: and 20 other J; countries participated in this event and was reviewed by the U.S. President Bill Clinton: INS Mysore earlier made a visit to the Norfolk, VA port between June 27 29, 2000 and Ves: also took part in the Sail Boston-2OOO in Boston on July 11, 2000. General Purpose Destroyer M Weight: 6,700 ton m" Propulsion: 4 gas turbines; 64,000 HP 18F" h: Speed: over 32 Knots 16 Surface-to-surface missiles 1 Surface-to-air missiles U 100 mm gun, 4 30 mm Gatling guns 5 2 Helicopters, anti-submarine rockets, torpedoes and electronic warfare equipment [ Commanding Officer: Captain Rajiv Dhamdhere 1 8 500 crew members E INS Mysore was designed and built at Mazagon Docks, India. 1 Q: 2 32" INS Mysore is the second Indian Naval which has been christened as 

 52%|█████▏    | 14/27 [09:14<08:35, 39.62s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000073.jpg
Ground Truth: India and the United States sign commercial agreements exceeding $3.5 billion Information Technologies a small antenna used for trans- Finance and receiving data com- Motorola (Schaumburg; IL)_ munications) system to The Principal Financial Group working with the government of S.Kumars Com Limited. This net- (Des Moines, IA) signed an agree- Andhra Pradesh, will establish the ment with its Indian joint venture work is designed to promote elec- Motorola School of Communica- partner, the Industrial Develop tronic communication, with a spe- tions Technology as part of the cial focus on rural India: ment Bank of India (IDBI) , solidi - Indian Institute of Information fying its plans to launch the IDBI Technology, in Hyderabad: The Motorola (Schaumburg; IL) Principal Asset Management Motorola school is the first of its Company Ltd. The products of kind for training students in the signed a memorandum of under- 

 56%|█████▌    | 15/27 [09:54<07:54, 39.58s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000140.jpg
Ground Truth: Indian IT industry, the Government of India has set IT policy to meet global  requirements   Specifi- up a National Task Force on IT and Software Devel- this will help angel investors, venture cre - opment to examine the feasibility of strengthening ators and incubation; the industry: The Task Force has already submitted its recommendations, which are under active con- Promote the growth of human resource develop- sideration. Norms for the operations of venture capital ment in the IT sector with the aim of creating funds have also been liberalized to boost the indus- quality-based education; try: The Government of India is also actively provid- fiscal incentives and liberalizing norms for FDI Promote R&D in the sector by identifying thrust and raising capital abroad. areas and drawing up a blueprint for action. Recently, an IT committee was set up by the Minis- India's most prized resource in "s knowledge 

 59%|█████▉    | 16/27 [10:33<07:14, 39.54s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000040.jpg
Ground Truth: Qn 1 JAN INDIA UXVa NEWS PUBLISHED BY PRESS & INFORMATION, EMBASSY OF INDIA , WASHINGTON, DC INDIA NEWS ONLINE: http:llwwwindianembassy org DECEMBER 1, 1999 India not to engage in a nuclear arms race: sh Iour Jaswant Singh, external affairs minister the You have been engaged in evidently; these would range from discussing cOOp- extended talks with the eration in the field of energy science and technology; U.S: since the nuclear tests environment; trade; taxation and economic develop- last year: How would you ment, to global concerns such as terrorism, narcotics, assess the results SO far? non- ~proliferation, disarmament, reform of multilateral First, by restating the objec institutions, expansion of the U.N. Security Council tives from our side: are and regional developments in Asia-Pacific etc: Obvi- two-fold: in the first place, to ously, a multifaceted dialogue of this nature can 1 reconcile the stated

 63%|██████▎   | 17/27 [11:13<06:35, 39.53s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000034.jpg
Ground Truth: PRIME MINISTER'S ADDRESS We shall not betray this trust: (Continued from page 2) With the help of a billion people proud of being Indian, One of our immediate tasks will be to firmly down terrorism, which has come to cast its There is nothing that we cannot achieve; cruel shadow on  innocent people: Our mes- No problem we cannot tackle; sage is loud and clear: The life of every India No challenge we cannot face; and, citizen under our  dispensation is precious. In our fight against terrorism, we will be guided by No opportunity we cannot seize. the principle of Zero Tolerance' Jai Hindl The same principle of 'Zero Tolerance will apply while dealing with corruption that has bred con- tempt for the law. One of the first legislations we will take up is the Lokpal Bill so that the rot Trade Events can be checked from the India Intemational Trade Fair '99. Householdl A broad consensus already exists on electora

 67%|██████▋   | 18/27 [11:52<05:55, 39.52s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000051.jpg
Ground Truth: ECONOMIC NEWS UPDATES (Continued from page 3) in this regard and a task force con- a period of five years through 2003- visages stringent penalties for viola - sisting o the Finance Ministers of 04 will have four mini-missions to be tion of foreign exchange norms_ India and Singapore can be fommed undertaken by Indian Council of Ag: FEMA is also aimed at consolidating to work out the modalities. Shri Sinha riculture Research (ICAR), Ministries and amending the law relating to for- also said that such an exercise should of Agriculture and Textiles: While eign exchange with the objective of be carried out by the private sector ICAR and the Agriculture Ministry will facilitating exteral trade and pay- with both the Govemments playing administer the first two missions re- ment and for promoting orderly de- the role of facilitators spectively, the remaining ones would velopment and maintenance of for- be implem

 70%|███████   | 19/27 [12:32<05:16, 39.50s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000016.jpg
Ground Truth: bined with religious extremism, it becomes & graver the impulse which lies behind it measure UP to the danger to humanity All of you are familiar with the spirit which filled that soldier as he fought to protect saying Karela Aur Neem Chadha" (The bitterness of our Motherland?" Karela is further worsened by adding to it the bitter- ness of Neem ) The challenges that confront us cannot be overcome only by the men on the frontiers doing their duty: In our case, over five thousand of our People There Is a need for an organized and disciplined nation have been killed by terrorists. Terrorism has become to stand behind them_ We must defend our country a big problem in many other parts of the world, too. and develop our society by keeping national inter- It is obstructing the of peace and development ests utmost in our minds: If our economy is not strong; Today; there is a need to mobilize world opinion and if w

 74%|███████▍  | 20/27 [13:11<04:36, 39.51s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000117.jpg
Ground Truth: Excerpts from U.S. news media on Prime Minister Vajpayee's visit A "Tilt" towards India At This Dinner, Harmony Is Served Two years ag0, India's nuclear tests provoked world- India and America clinked glasses in their new whirl- wide condemnation and retaliatory sanctions by the wind   friendship at an elaborate and exotic state United States. Yet in recent months, culminating in dinner at the White House last night as Prime Min- Prime Minister Atal Behari Vajpayee's visit to Wash- ister Atal Bihari Vajpayee profusely praised Bill and ington last weekend, the United States has drawn Hillary Clinton-~and also Christopher  Columbus, closer to India diplomatically than at any time since "who set sail for India but landed in America: the early 1 960's. President Clinton has shaped a wonder where we would be if he had actually reached new foreign policy course in South Asia by embrac- India. India and distancin

 78%|███████▊  | 21/27 [13:51<03:57, 39.53s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000098.jpg
Ground Truth: o8h INDIA NEWS LER PUBLISHED BY PRESS & INFORMATION, EMBASSYQF INDIA, WASHINGTON; DC 0 & SE http:/ /wwwindianembassy org July-August 2000 ndus: U.S. and India Cooperative Research in Health Issues WASHINGTON Health and Hu- further step in meeting that com- AIDS and maternal and child man  Services  Secretary Donna mitment; she said: health areas: Shalala and Indian Minister   of Health and Family Welfare €C. P "It is indeed a momentous occa- The new agreement commits In - sion that the governments of In- dia and the U.S. to address HIV / Thakur signed a joint agreement dian and the United States are AIDS through a disease preven- 8 June 13, pledging cooperative coming together in this collabo- tion program: Their plan includes research in HIV/AIDS prevention D and maternal and child health rative venture in the areas of improved surveillance; prevention J dus: research immediate public health concern - res

 81%|████████▏ | 22/27 [14:30<03:17, 39.52s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000141.jpg
Ground Truth: India & Intellectual Property Rights HERE is a well-established  statutory, ad- further amend the Patents Act; 1970 and make it ministrative and judicial framework to safe- TRIPS compliant was introduced in the Upper intellectual property rights in India, whether House of Indian Parliament on December 20, 1999. relate to patents; trademarks, copyright or in- dustrial designs: Well-known international trade- In addition to the above legislative   changes, the Government of India has taken several measures to marks have been protected in India even when were not registered in India. streamline and strengthen the intellectual property administration   system  in the country: The   Trade Computer  software   companies have  successfully Marks Registry is also proposed to be further strength - curtailed piracy through court orders:   Computer ened and modernized: As regards the aspect en- databases have been pr

 85%|████████▌ | 23/27 [15:10<02:38, 39.50s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000014.jpg
Ground Truth: Prime Minister Vajpayees Independence address to the Nation Sisters, Brothers and have t0 take concrete measures for the families of Dear Children, the martyrs and wounded soldiers s0 that may Accept my greetings on the sacred occasion of Inde- live a life of comfort and dignity. It has been said that Pendence This is a of hallowed remem- we remember and honour soldiers during a war, and brance for us. This is a of dedication for us. in the immediate aftermath: But as the Pass, we forget them: And it is a sad fact that many who sac- This year's Independence has special signifi- rificed their life and limb in previous wars were often cance for all of us: The present century is coming to forgotten. I give my personal pledge that this will an end: The world will have entered the next century not happen again: by the time of the next Independence This Red Fort and its world-renowned ramparts are As we stand at

 89%|████████▉ | 24/27 [15:49<01:58, 39.49s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000050.jpg
Ground Truth: India:com goes Global ECONOMIC Excerpts from an article written by NEWS Dewang Mehta, Director; UPDATES National Association of Software and Service Companies The Indian  software  industry has S2 billion of e-commerce solutions zoomed from a mere S20 million ten exports by 2002, when total Indian India and US sign years ago to a whopping US S3.9 software exports are projected to be tor billion in 1998-99. No industry has US $8.7 billion, during that year: done as much for the competitive- Quantitative Restrictions ness of global corporations and, cer- Global Presence After  months of negotiations India tainly , no industry has created as and US inked an agreement for a 15- many millionaires in India in such a Already  212 Indian software com- month phase-out of import curbs short span of time. In 1998-99, 203 panies have either subsidiaries Or between the two countries. Announc- out of the Fortune 500 out

 93%|█████████▎| 25/27 [16:29<01:18, 39.48s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000029.jpg
Ground Truth: Note n Violation of India's airspace Pakistan's Atlantique aircraft and consequent action On August 10, 1999, a Pakistani Naval Anti-Subma- It is crucial to recognise that surveillance recon- rine Warfare and maritime reconnaissance   aircraft, naissance 1 intelligence   gathering is an offensive called Atlantique intruded 10 Kms into Indian Territory military operation and a hostile activity. It cannot be in the Area of KORI CREEK. passed off as harmless: The intruding aircraft was detected by IAF ground ra- The central issue therefore is not whether Pakistan dars and was intercepted 10 Kms south of the Interna- aircraft was armed' Or unarmed" The issue is that tional Border: When the IAF fighters closed in to iden- it was a military aircraft engaged in offensive and tify and signal the intruding Pakistani aircraft to force hostile military operation compounded by its com- it to land at an Indian base, th

 96%|█████████▋| 26/27 [17:08<00:39, 39.46s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000039.jpg
Ground Truth: IIN THIS ISSUE India News is published by the Press & Information Embassy of India. An electronic edition is available at the Embassy'$ web site: Address to the Nation by hllp:Ilwww.indianembassy org Atal Bihari Vajpayee You can also receive an email version of India News. Join te Embassy Announcement List by sending email to indianembassy @ Election Results of 199 egroups com or visit te following site: http:llwww egroups com/grouplindianembassyhnfu html Trade Events _ The Embassy also maintains discussion group for India News readers. You can join by sending email t0: Excerpts from Ambassador Naresh india_discussion @egroups com Chandra's interview on NewsHour or by visiting the following site: on Senates rejection to ratify the http:llwwwegroups cOm/gruuplindia_discussionfinfohtml Nuclear Test Ban Treaty Embassy of India Prime Minister and the Cabinet 5 Press & Information @ 2107 Massachusetts Ave-, NW 

100%|██████████| 27/27 [17:47<00:00, 39.56s/it]


Image: /teamspace/studios/this_studio/test/india_news_p000064.jpg
Ground Truth: Indian American Community: A Story of Achievements There are now more than 1.5 million peoples of Indian origin in America. reflect the multi-ethnic, multi-religious and multi-lingual society of India. Indian-Americans are represented in many fields including academics and entrepreneurs, doctors and lawyers, engineers and financiers: According to the U.S. Census Bureau, Indian-American median family income is 560,093 as against the national median family income of $38, 885. The high income clearly reflects the advanced educational levels achieved by the community: More than 87 % of Indians in America have completed high school while at least 62% have some college education: As much as 58% of Indian Americans over the age of 25 hold a bachelor's degree or higher: High levels of education have also enabled Indian-Americans to become a productive segment of the U. S. population, with 72.3 % participating in t




In [6]:
import pandas as pd


results_df = pd.DataFrame({"image_path": df["image_path"], "ground_truth": ground_truths, "prediction": predictions})


results_csv_path = "predictionsunsloth.csv"
results_df.to_csv(results_csv_path, index=False, encoding="utf-8")

print(f"Predictions saved to {results_csv_path}")


Predictions saved to predictionsunsloth.csv


In [2]:
import pandas as pd


df = pd.read_csv("//teamspace/studios/this_studio/wer_cer.csv")


unwanted_text = "<|begin_of_text|><|image|><|begin_of_text|>Extract the text from this image."
df["prediction"] = df["prediction"].str.replace(rf"^{unwanted_text}\s*", "", regex=True)


df["prediction"] = df["prediction"].str.replace(r"^\|+", "", regex=True).str.lstrip()


df.to_csv("wer_cer.csv", index=False)


In [3]:
import pandas as pd

# Load the two CSV files
csv1_path = "/teamspace/studios/this_studio/cleaned_fileunsloth.csv"  
csv2_path = "/teamspace/studios/this_studio/groud_truth_test.csv"

# Read the CSV files
df1 = pd.read_csv(csv1_path)
df2 = pd.read_csv(csv2_path)

# Copy 'prediction' column from df1 to df2 based on 'image_path'
df2 = df2.merge(df1[['image_path', 'prediction']], on='image_path', how='left')

# Save the updated dataframe to a new CSV file
output_path = "wer_cer_alt.csv"  # Replace with the desired output path
df2.to_csv(output_path, index=False)

print(f"Updated CSV saved to {output_path}")

Updated CSV saved to wer_cer_alt.csv


In [6]:
import pandas as pd
import editdistance
from jiwer import wer


csv_path = "wer_cer_alt.csv"
df = pd.read_csv(csv_path)


if "prediction" not in df.columns or "ground_truth" not in df.columns:
    raise ValueError("CSV file must contain 'prediction' and 'ground_truth' columns.")


predictions = df["prediction"].astype(str).tolist()
ground_truths = df["ground_truth"].astype(str).tolist()

print("\nEvaluating performance...")

total_wer = 0.0
total_cer = 0.0
num_samples = len(predictions)

for pred, gt in zip(predictions, ground_truths):
   
    sample_wer = wer(gt, pred)
    total_wer += sample_wer
    
    
    sample_cer = editdistance.eval(pred, gt) / len(gt) if len(gt) > 0 else 0.0
    total_cer += sample_cer

average_wer = total_wer / num_samples if num_samples > 0 else float('nan')
average_cer = total_cer / num_samples if num_samples > 0 else float('nan')

print(f"\nAverage Word Error Rate (WER): {average_wer:.4f}")
print(f"Average Character Error Rate (CER): {average_cer:.4f}")



Evaluating performance...

Average Word Error Rate (WER): 1.0075
Average Character Error Rate (CER): 0.8006


# The following code finetunes the  llama 3.2 vision model using lora with the train split of the dataset 
## I used unsloth in order to make the training faster

In [None]:
import cv2
import os

def crop_left_bottom(image_path, left_crop_percent=10, bottom_crop_percent=10):
    """Crop only the left and bottom borders of an image."""
    image = cv2.imread(image_path)

    if image is None:
        raise ValueError(f"Could not open image: {image_path}")

    h, w, _ = image.shape  # Get image dimensions


    crop_x = int(w * left_crop_percent / 100)  
    crop_y = int(h * bottom_crop_percent / 100)  

    # Apply cropping (keep top and right intact)
    cropped_image = image[:h - crop_y, crop_x:w]

    return cropped_image

def process_images_in_directory(directory, left_crop_percent=10, bottom_crop_percent=10):
    """Process all images in the given directory and replace them with cropped versions."""
    for filename in os.listdir(directory):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):  # Only process images
            image_path = os.path.join(directory, filename)

            try:
                cropped_image = crop_left_bottom(image_path, left_crop_percent, bottom_crop_percent)

                # Overwrite the original image
                cv2.imwrite(image_path, cropped_image)
                print(f"Processed and replaced: {filename}")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

# Set your directory and crop percentages
image_directory = "/teamspace/studios/this_studio/train"
left_crop_percent = 7  # Adjust this to crop more/less from the left
bottom_crop_percent = 7  # Adjust this to crop more/less from the bottom

# Process and replace all images in the directory
process_images_in_directory(image_directory, left_crop_percent, bottom_crop_percent)


Processed and replaced: india_news_p000006.jpg
Processed and replaced: india_news_p000007.jpg
Processed and replaced: india_news_p000008.jpg
Processed and replaced: india_news_p000009.jpg
Processed and replaced: india_news_p000010.jpg
Processed and replaced: india_news_p000011.jpg
Processed and replaced: india_news_p000012.jpg
Processed and replaced: india_news_p000013.jpg
Processed and replaced: india_news_p000017.jpg
Processed and replaced: india_news_p000018.jpg
Processed and replaced: india_news_p000020.jpg
Processed and replaced: india_news_p000021.jpg
Processed and replaced: india_news_p000024.jpg
Processed and replaced: india_news_p000025.jpg
Processed and replaced: india_news_p000027.jpg
Processed and replaced: india_news_p000028.jpg
Processed and replaced: india_news_p000030.jpg
Processed and replaced: india_news_p000031.jpg
Processed and replaced: india_news_p000032.jpg
Processed and replaced: india_news_p000033.jpg
Processed and replaced: india_news_p000035.jpg
Processed and

In [3]:
from unsloth import FastVisionModel
import torch

model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
)
model.cuda()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Mllama vision patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA L40S. Max memory: 44.527 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, 
    finetune_language_layers   = True, 
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    r = 16,           
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3443,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


In [5]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset


class OCRDataset(Dataset):
    def __init__(self, csv_path: str, images_dir: str):
        self.data = pd.read_csv(csv_path)
        self.images_dir = images_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.images_dir, row["image_path"])
        image = Image.open(image_path).convert("RGB")
        return {"image": image, "model_output": row["model_output"]}


ocr_dataset = OCRDataset(
    csv_path="/teamspace/studios/this_studio/ground_truth_train.csv",
    images_dir="/teamspace/studios/this_studio/train",
)

dataset = [ocr_dataset[i] for i in range(min(500, len(ocr_dataset)))]

instruction = """
Identify and extract all textual content from this newspaper page while preserving the logical reading flow. Ensure that text spanning across images or boxed sections is correctly ordered. Ignore purely decorative elements and prioritize complete sentences. Maintain proper line breaks and paragraph structure."
"""

def convert_to_conversation(sample):
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                {"type": "image", "image": sample["image"]},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["model_output"]}],
        },
    ]
    return {"messages": conversation}


converted_dataset = [convert_to_conversation(sample) for sample in dataset]

print(converted_dataset[45])


{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': '\nIdentify and extract all textual content from this newspaper page while preserving the logical reading flow. Ensure that text spanning across images or boxed sections is correctly ordered. Ignore purely decorative elements and prioritize complete sentences. Maintain proper line breaks and paragraph structure."\n'}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=1622x2114 at 0x7F821D46E950>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'India\'s Experience Has Taught That Peace Lies in Strength\n\nBy Atal Bihari Vajpayee, Prime Minister of India\n\nInternational Herald Tribune - September 21, 2000\n\nM Y RECENT visit to the United States has consolidated relations between the world\'s two largest democracies. The joint initiatives and understandings that we reached represent a major step forward. India and the United States can be natural allies in the 21st century. Events and circum

In [6]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) 
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),  
    train_dataset=converted_dataset,
    args=SFTConfig(
        per_device_train_batch_size=1,         
        gradient_accumulation_steps=8,          
        warmup_steps=10,                        
        max_steps=100,                           
        learning_rate=1e-4,                      
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=10,                        
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)


In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 107 | Num Epochs = 8
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,0.4437
20,0.1555
30,0.0777
40,0.0627
50,0.0558
60,0.0527
70,0.0397
80,0.0376
90,0.0356
100,0.0319


In [11]:
ocr_dataset = OCRDataset(
    csv_path="/teamspace/studios/this_studio/groud_truth_test.csv",
    images_dir="/teamspace/studios/this_studio/test",
)

dataset = [ocr_dataset[i] for i in range(min(500, len(ocr_dataset)))]
FastVisionModel.for_inference(model)  

image = dataset[9]["image"]

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]
input_text = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True
)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=1024,
    use_cache=True,
    temperature=1.5,
    min_p=0.1
)

PRIME MINISTER'S ADDRESS from page 9

Our National Income has grown by 6 percent;
Having crossed 200 million tonnes, production of foodgrains is higher today than ever before;
Food stocks are higher today than ever. For this our kisans deserve our felicitations. Our agricultural scientists are also worthy of our praise.

Industrial production is reviving at an energetic pace;

The new initiatives which have been taken in infrastructure have infused a new purposefulness right across the economy;

At over 30 billion dollars our foreign exchange reserves are higher than ever before;

The Sensex in the stock market has risen to record levels. In spite of Kargil, our companies have been able to increase their market value by over Rs. 200,000 crore.

The off-take of cement for building houses is 22 percent higher than it has ever been;

Facilities which were known only to the rich, to the few, and in our cities alone --- insurance for the output, credit cards --- are now available to, and ar

In [12]:
import pandas as pd
from tqdm import tqdm

# Paths to your CSV and image directory
csv_path = "/teamspace/studios/this_studio/groud_truth_test.csv"
images_dir = "/teamspace/studios/this_studio/test"

# Load the CSV into a DataFrame (this assumes the CSV already contains image paths or identifiers)
df = pd.read_csv(csv_path)

# Create your OCRDataset instance
ocr_dataset = OCRDataset(csv_path=csv_path, images_dir=images_dir)

# Prepare your model for inference (make sure 'model' is already loaded and 'FastVisionModel' is imported)
FastVisionModel.for_inference(model)

# List to collect predictions for each image
predictions = []

# Loop over the entire dataset (one sample per image)
for idx in tqdm(range(len(ocr_dataset)), desc="Processing images"):
    # Retrieve the sample; assumes each sample is a dictionary with key "image"
    sample = ocr_dataset[idx]
    image = sample["image"]
    
    # Prepare the prompt/message.
    # Ensure that the variable 'instruction' is defined with your desired text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ],
        }
    ]
    
    # Apply the chat template to create the input text for the model
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    
    # Tokenize both the image and text input
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")
    
    # Generate the prediction
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )
    
    # Decode the generated tokens into text (skip special tokens)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Append the prediction to the list
    predictions.append(prediction)

# Add the predictions as a new column in the DataFrame.
# Make sure the order of 'predictions' matches the rows in your CSV.
df['prediction'] = predictions

# Save the updated DataFrame back to CSV
df.to_csv(csv_path, index=False)


Processing images: 100%|██████████| 27/27 [33:05<00:00, 73.55s/it]


In [15]:
import pandas as pd

# Load the CSV
file_path = "/teamspace/studios/this_studio/groud_truth_test.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# Define the exact multiline string to remove
text_to_remove = """user


Identify and extract all textual content from this newspaper page while preserving the logical reading flow. Ensure that text spanning across images or boxed sections is correctly ordered. Ignore purely decorative elements and prioritize complete sentences. Maintain proper line breaks and paragraph structure."""

# Remove occurrences of the specific string in the 'prediction' column
df["prediction"] = df["prediction"].str.replace(text_to_remove, "", regex=False)

# Save the cleaned CSV
df.to_csv("cleaned_file_final.csv", index=False)


In [11]:
import os
import pandas as pd
from PIL import Image
import torch
from tqdm import tqdm
from jiwer import wer
import editdistance

from transformers import MllamaForConditionalGeneration, AutoProcessor


CSV_PATH = "/teamspace/studios/this_studio/groud_truth_test.csv"      
IMAGE_DIR = "/teamspace/studios/this_studio/test"       
df = pd.read_csv(CSV_PATH)
df = df.dropna(subset=["ocr_text"])
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df)} records from {CSV_PATH}")



predictions = []  
ground_truths = [] 

print("Starting inference on images...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    
    rel_img_path = row["image_path"]
    gt_text = row["ocr_text"]
    
    
    full_img_path = os.path.join(IMAGE_DIR, rel_img_path)
    
    
    try:
        image = Image.open(full_img_path).convert("RGB")
    except Exception as e:
        print(f"Error loading image {full_img_path}: {e}")
        continue

Loaded 27 records from /teamspace/studios/this_studio/groud_truth_test.csv
Starting inference on images...


100%|██████████| 27/27 [00:00<00:00, 89.09it/s]


In [12]:
model.save_pretrained("llama_3.2_vision_finetuned") 
tokenizer.save_pretrained("llama_3.2_vision_finetuned")

[]

In [14]:
import pandas as pd

# Paths for the original and new CSV files
CSV_PATH = "/teamspace/studios/this_studio/cleaned_file_final.csv"
NEW_CSV_PATH = "/teamspace/studios/this_studio/updated_cleaned_file_final.csv"

# Read the original CSV file
df = pd.read_csv(CSV_PATH)

# Remove occurrences where 'assistant' appears as a standalone line along with extra whitespace/newlines
df["prediction"] = df["prediction"].str.replace(r"\nassistant\s*\n", "\n", regex=True)

# Remove a starting double quote at the beginning of the text or following a newline
df["prediction"] = df["prediction"].str.replace(r'(^|\n)"', r'\1', regex=True)

# Display the first few predictions to verify changes
print(df["prediction"].head())

# Save the updated dataframe to a new CSV file
df.to_csv(NEW_CSV_PATH, index=False)
print(f"Updated CSV saved to {NEW_CSV_PATH}")


0    \nNEWS IN BRIEF (Continued from page 8)\n\nDur...
1    \nIndia not to engage in a nuclear arms race: ...
2    \nIndia News is published by the Press & Infor...
3    \nIndia & Intellectual Property Rights\n\nTHER...
4    \nIndia Press Monitor\n\nWired to the World\nL...
Name: prediction, dtype: object
Updated CSV saved to /teamspace/studios/this_studio/updated_cleaned_file_final.csv


In [15]:
import pandas as pd
import editdistance
from jiwer import wer


csv_path = "/teamspace/studios/this_studio/updated_cleaned_file_final.csv"
df = pd.read_csv(csv_path)

if "prediction" not in df.columns or "ground_truth" not in df.columns:
    raise ValueError("CSV file must contain 'prediction' and 'ground_truth' columns.")


predictions = df["prediction"].astype(str).tolist()
ground_truths = df["ground_truth"].astype(str).tolist()

print("\nEvaluating performance...")

total_wer = 0.0
total_cer = 0.0
num_samples = len(predictions)

for pred, gt in zip(predictions, ground_truths):
  
    sample_wer = wer(gt, pred)
    total_wer += sample_wer
    
 
    sample_cer = editdistance.eval(pred, gt) / len(gt) if len(gt) > 0 else 0.0
    total_cer += sample_cer

average_wer = total_wer / num_samples if num_samples > 0 else float('nan')
average_cer = total_cer / num_samples if num_samples > 0 else float('nan')

print(f"\nAverage Word Error Rate (WER): {average_wer:.4f}")
print(f"Average Character Error Rate (CER): {average_cer:.4f}")



Evaluating performance...



Average Word Error Rate (WER): 0.1041
Average Character Error Rate (CER): 0.0942


## Following code compares the code before and after fine tuning 

In [18]:
import pandas as pd
import editdistance
from jiwer import wer
from nltk.translate.bleu_score import sentence_bleu
from tabulate import tabulate


normal_csv = "/teamspace/studios/this_studio/cleaned_fileunsloth.csv"
finetuned_csv = "/teamspace/studios/this_studio/updated_cleaned_file_final.csv"

df_normal = pd.read_csv(normal_csv)
df_finetuned = pd.read_csv(finetuned_csv)


for df in [df_normal, df_finetuned]:
    if "prediction" not in df.columns or "ground_truth" not in df.columns:
        raise ValueError("CSV files must contain 'prediction' and 'ground_truth' columns.")

def calculate_metrics(predictions, ground_truths):
    total_wer, total_cer, total_edit_distance, total_bleu, exact_matches = 0, 0, 0, 0, 0
    num_samples = len(predictions)
    
    for pred, gt in zip(predictions, ground_truths):
        total_wer += wer(gt, pred)
        total_cer += editdistance.eval(pred, gt) / len(gt) if len(gt) > 0 else 0.0
        total_edit_distance += editdistance.eval(pred, gt)
        total_bleu += sentence_bleu([gt.split()], pred.split()) 
        exact_matches += 1 if pred == gt else 0
    
    return {
        "WER": total_wer / num_samples if num_samples > 0 else float('nan'),
        "CER": total_cer / num_samples if num_samples > 0 else float('nan'),
        "Edit Distance": total_edit_distance / num_samples if num_samples > 0 else float('nan'),
        "BLEU Score": total_bleu / num_samples if num_samples > 0 else float('nan'),
    }


normal_metrics = calculate_metrics(df_normal["prediction"].astype(str).tolist(), df_normal["ground_truth"].astype(str).tolist())
finetuned_metrics = calculate_metrics(df_finetuned["prediction"].astype(str).tolist(), df_finetuned["ground_truth"].astype(str).tolist())


data = [[metric, f"{normal_metrics[metric]:.4f}", f"{finetuned_metrics[metric]:.4f}"] for metric in normal_metrics.keys()]

print("\nPerformance Comparison:")
print(tabulate(data, headers=["Metric", "Normal Model", "Fine-Tuned Model"], tablefmt="grid"))


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



Performance Comparison:
+---------------+----------------+--------------------+
| Metric        |   Normal Model |   Fine-Tuned Model |
| WER           |         1.1465 |             0.1041 |
+---------------+----------------+--------------------+
| CER           |         0.9231 |             0.0942 |
+---------------+----------------+--------------------+
| Edit Distance |      3325.07   |           267.926  |
+---------------+----------------+--------------------+
| BLEU Score    |         0.3453 |             0.8848 |
+---------------+----------------+--------------------+
