In [1]:
import os
import numpy as np
import time
import Levenshtein

from dotenv import load_dotenv
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from msrest.authentication import CognitiveServicesCredentials

In [2]:
'''
Authenticate
Authenticates your credentials and creates a client.
'''

load_dotenv('azure.env')

subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")

if not subscription_key or not endpoint:
    raise ValueError("Environment variables are missing!")

cognitive_services_credentials = CognitiveServicesCredentials(subscription_key)

computervision_client = ComputerVisionClient(endpoint, cognitive_services_credentials)

'''
END - Authenticate
'''

'\nEND - Authenticate\n'

In [3]:
def extract_text_from_image(img_path):
    with open(img_path, "rb") as image_stream:
        read_response = computervision_client.read_in_stream(image_stream, raw=True)

    # Aflam ID-ul operatiei
    read_operation_location = read_response.headers["Operation-Location"]
    operation_id = read_operation_location.split("/")[-1]

    # Asteptam pana cand operatia este completata
    while True:
        read_result = computervision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    # Extragem textul din imagine
    text = ""
    if read_result.status == OperationStatusCodes.succeeded:
        for result in read_result.analyze_result.read_results:
            for line in result.lines:
                text += line.text + " "
    return text.strip()

In [4]:
def levenshtein_distance(result, text):
    char_distance = Levenshtein.distance(text, result)

    words = text.split()
    result_words = result.split()

    word_distance = Levenshtein.distance(words, result_words)

    return char_distance, word_distance

In [5]:
def jaccard_similarity(text, result):
    # Construieste seturile de caractere sau cuvinte
    set_text = set(text.split())
    set_result = set(result.split())

    # Calculam intersectia si reuniunea seturilor
    intersection = len(set_text & set_result)
    union = len(set_text | set_result)

    return intersection / union if union != 0 else 0

def hamming_distance(text, result):
    # Verifica daca textul si rezultatul au aceeasi lungime
    if len(text) != len(result):
        raise ValueError("Textul și rezultatul trebuie să aibă aceeași lungime")

    # Calcularea distantei Hamming
    distance = 0
    for t_char, r_char in zip(text, result):
        if t_char != r_char:
            distance += 1

    return distance

def euclidean_distance(text, result):
    # Construieste vectorii de frecventa a caracterelor
    text_freq = np.array([text.count(chr(i)) for i in range(128)])
    result_freq = np.array([result.count(chr(i)) for i in range(128)])

    # Calculeaza distanta Euclidiana
    distance = np.linalg.norm(text_freq - result_freq)

    return distance

In [10]:
image_path = "data/images2/test3.png"

#expected_text = "Succes in rezolvarea tEMELOR la LABORAtoarele de Inteligenta Artificiala"
#expected_text = "Google Cloud Platform"
expected_text = "Ana are mere și pere Anei nu îi plac merele, dar adoră ciocolata."

extracted_text = extract_text_from_image(image_path)
print("Text extras:", extracted_text)

lev_dist_char, lev_dist_word = levenshtein_distance(extracted_text, expected_text)
jaccard_sim = jaccard_similarity(expected_text, extracted_text)
hamming_dist = hamming_distance(expected_text, extracted_text)
euclidean_dist = euclidean_distance(expected_text, extracted_text)

print(f"\nLevenshtein Distance (caractere): {lev_dist_char}")
print(f"Levenshtein Distance (cuvinte): {lev_dist_word}")
print(f"Jaccard Similarity: {jaccard_sim:.2f}")
print(f"Hamming Distance: {hamming_dist}")
print(f"Euclidean Distance: {euclidean_dist}")


Text extras: Ana are mere di pere Anei nu li plac merele, dar adora ciocolata.

Levenshtein Distance (caractere): 3
Levenshtein Distance (cuvinte): 3
Jaccard Similarity: 0.62
Hamming Distance: 3
Euclidean Distance: 1.7320508075688772
