In [136]:
import os
import io
from azure.core.exceptions import ResourceNotFoundError
from azure.ai.formrecognizer import FormRecognizerClient
from azure.ai.formrecognizer import FormTrainingClient
from azure.core.credentials import AzureKeyCredential
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, ContainerClient
from waiting import wait
import pandas as pd

## Azure authentication
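Run `az login` in a terminal before executing the cells below, so that `DefaultAzureCredential` can authenticate with your Azure CLI session.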

In [137]:
# Name of the Key Vault queried by this notebook.
keyVaultName = "cvprojectkeyvault"
# Vault URL derived from the vault name.
KVUri = f"https://{keyVaultName}.vault.azure.net"

# Credential object passed to the secret client.
credential = DefaultAzureCredential()
# Secret client bound to the vault URL above.
client = SecretClient(vault_url=KVUri, credential=credential)

In [138]:
# Endpoint of the Form Recognizer resource.
AZURE_FORM_RECOGNIZER_ENDPOINT = "https://cvprojectformrecognizer.cognitiveservices.azure.com/"
# Name of the Key Vault secret whose value is used as the Form Recognizer key.
secretName = "formrecognizerkey1"
# Read the secret through the Key Vault client so the key is not hardcoded here.
retrieved_secret = client.get_secret(secretName)

endpoint = AZURE_FORM_RECOGNIZER_ENDPOINT
key = retrieved_secret.value
# Training client authenticated with the retrieved key.
form_training_client = FormTrainingClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Get previously trained custom model

In [139]:
# Ask the training client for the custom models; the result is iterated in the next cell.
saved_model_list = form_training_client.list_custom_models()

In [140]:
# Clean-up pass: delete every listed model whose status is "invalid";
# models in any other state are skipped.
for listed_model in saved_model_list:
    if listed_model.status != "invalid":
        continue
    form_training_client.delete_model(listed_model.model_id)

In [141]:
# List the models again and collect their ids, printing each entry as it is seen.
saved_model_list = form_training_client.list_custom_models()
my_model_ids = []
for model_info in saved_model_list:
    my_model_ids += [model_info.model_id]
    print(model_info)

CustomFormModelInfo(model_id=08bc6a71-bf06-45c6-9a0d-f662f9ad44d5, status=ready, training_started_on=2024-05-23 15:51:18+00:00, training_completed_on=2024-05-23 15:51:20+00:00, properties=CustomFormModelProperties(is_composed_model=False), model_name=None)
CustomFormModelInfo(model_id=22d27070-d5eb-4c08-b97f-3a9fe0edc6fb, status=ready, training_started_on=2024-05-24 15:07:05+00:00, training_completed_on=2024-05-24 15:07:06+00:00, properties=CustomFormModelProperties(is_composed_model=False), model_name=None)
CustomFormModelInfo(model_id=2630d0c6-6194-4ce4-ad23-92bdb4029140, status=ready, training_started_on=2024-05-24 14:54:00+00:00, training_completed_on=2024-05-24 14:54:01+00:00, properties=CustomFormModelProperties(is_composed_model=False), model_name=None)
CustomFormModelInfo(model_id=901abd54-b4b0-422c-b75e-68815c55a367, status=ready, training_started_on=2024-05-24 15:08:42+00:00, training_completed_on=2024-05-24 15:08:44+00:00, properties=CustomFormModelProperties(is_composed_mod

In [142]:
# The model id is pinned explicitly. Disabled alternative kept for reference:
# pick the last id collected in my_model_ids.
#labeled_custom_model_id = my_model_ids[-1]
labeled_custom_model_id = 'ddae42d8-830e-4b06-9b64-bcd0fde83cb5'
print(labeled_custom_model_id)

ddae42d8-830e-4b06-9b64-bcd0fde83cb5


In [143]:
# Fetch the selected custom model by its id.
labeled_custom_model = form_training_client.get_custom_model(labeled_custom_model_id)

In [144]:
# Display the status of the fetched model as the cell output.
labeled_custom_model.status

'ready'

In [145]:
# Obtain a recognizer client from the existing training client.
form_recognizer_client = form_training_client.get_form_recognizer_client()

# Read boarding passes from Blob storage

In [146]:
# URL of the storage account that holds the boarding passes.
account_url = "https://computervisionproject.blob.core.windows.net"
# Credential shared by both storage clients created below.
default_credential = DefaultAzureCredential()

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient(account_url, credential=default_credential)
# Container-level client for the "kiosk" container.
kiosk_container_client = ContainerClient(account_url, credential=default_credential, container_name="kiosk")

In [147]:
# Collect every entry found under the 'boarding_passes/' prefix, printing its name.
# NOTE(review): entries are appended unfiltered - confirm the listing yields only blobs.
files_to_process = []
boarding_pass_listing = kiosk_container_client.walk_blobs('boarding_passes/', delimiter='/')
for blob_entry in boarding_pass_listing:
    print(blob_entry.name)
    files_to_process += [blob_entry]

boarding_passes/boarding-avkash.pdf
boarding_passes/boarding-james-webb.pdf
boarding_passes/boarding-james.pdf
boarding_passes/boarding-libby.pdf
boarding_passes/boarding-radha-s-kumar.pdf
boarding_passes/boarding-sameer.pdf
boarding_passes/boarding_pass_johannes.pdf


# Recognize information from boarding passes using custom model

In [148]:
def is_model_ready(action):
    """Report whether the given action has finished successfully.

    Args:
        action: Object exposing a ``status()`` method that returns a status string.

    Returns:
        True only when ``action.status()`` equals ``'succeeded'``; any other
        status (including a failed one) yields False.
    """
    return action.status() == 'succeeded'

In [149]:
results = []

# For each listed boarding pass: download the blob into memory, submit the bytes to the
# custom model, poll until the action reports success (at most 120 seconds), then keep
# the file name together with the recognition result.
for file in files_to_process:
    source_blob = blob_service_client.get_blob_client(container="kiosk", blob=file.name)
    buffer = io.BytesIO()
    source_blob.download_blob().readinto(buffer)
    poller = form_recognizer_client.begin_recognize_custom_forms(
        model_id=labeled_custom_model_id,
        form=buffer.getvalue(),
    )
    wait(
        lambda: is_model_ready(poller),
        timeout_seconds=120,
        waiting_for="Custom Form Recognition: "+file.name,
    )
    print("Custom Form recognition done: "+file.name)
    results.append({"file": file.name, "model_result": poller.result()})

Custom Form recognition done: boarding_passes/boarding-avkash.pdf
Custom Form recognition done: boarding_passes/boarding-james-webb.pdf
Custom Form recognition done: boarding_passes/boarding-james.pdf
Custom Form recognition done: boarding_passes/boarding-libby.pdf
Custom Form recognition done: boarding_passes/boarding-radha-s-kumar.pdf
Custom Form recognition done: boarding_passes/boarding-sameer.pdf
Custom Form recognition done: boarding_passes/boarding_pass_johannes.pdf


# Format results and write to Blob Storage

In [150]:
def model_result_to_dict(mr):
    """Flatten a recognition result into a plain ``{field name: field value}`` dict.

    Args:
        mr: Iterable of recognized items, each exposing a ``fields`` mapping of
            name -> object with a ``value`` attribute.

    Returns:
        A dict mapping every field name to its value. When the same name occurs
        in several items, the value from the last one wins.
    """
    return {
        field_name: recognized_field.value
        for recognized_form in mr
        for field_name, recognized_field in recognized_form.fields.items()
    }

In [151]:
# Flatten each record in place: merge the extracted field values into the record and
# drop the raw 'model_result' entry.
# Records that no longer carry 'model_result' were already flattened by a previous run
# of this cell; skipping them makes the cell safe to re-run instead of raising KeyError.
for r in results:
    print(r['file'])
    if 'model_result' not in r:
        continue
    r.update(model_result_to_dict(r['model_result']))
    del r['model_result']

boarding_passes/boarding-avkash.pdf
boarding_passes/boarding-james-webb.pdf
boarding_passes/boarding-james.pdf
boarding_passes/boarding-libby.pdf
boarding_passes/boarding-radha-s-kumar.pdf
boarding_passes/boarding-sameer.pdf
boarding_passes/boarding_pass_johannes.pdf


In [152]:
# Display the flattened records as the cell output.
results

[{'file': 'boarding_passes/boarding-avkash.pdf',
  'Seat': '20A',
  'Gate 2': 'G1',
  'Boarding Time 2': '10:00 AM PST',
  'Seat 2': '20A',
  'Gate': 'G1',
  'Passenger Name 2': 'Avkash Chauhan',
  'Boarding Time': '10:00 AM PST',
  'Date 2': 'April 20, 2022',
  'Passenger Name': 'Avkash Chauhan',
  'To 2': 'Chicago',
  'Baggage': 'NO',
  'Carrier': 'UA',
  'Flight No.': '234',
  'From 2': 'San Francisco',
  'To': 'Chicago',
  'Date': 'April 20, 2022',
  'Class': 'E',
  'From': 'San Francisco',
  'Ticket No.': 'ETK-34236751E'},
 {'file': 'boarding_passes/boarding-james-webb.pdf',
  'Baggage': 'YES',
  'Seat': '1A',
  'To 2': 'Chicago',
  'Passenger Name 2': 'James Webb',
  'Seat 2': '1A',
  'From': 'San Francisco',
  'Flight No.': '234',
  'Gate': 'G1',
  'Gate 2': 'G1',
  'From 2': 'San Francisco',
  'Carrier': 'UA',
  'To': 'Chicago',
  'Date': 'April 20, 2022',
  'Ticket No.': 'ETK-34236748B',
  'Passenger Name': 'James Webb',
  'Boarding Time 2': '10:00 AM PST',
  'Boarding Time': 

In [153]:
# Build a DataFrame from the list of per-file records.
df_bp_info = pd.DataFrame(results)
# Display the DataFrame as the cell output.
df_bp_info

Unnamed: 0,file,Seat,Gate 2,Boarding Time 2,Seat 2,Gate,Passenger Name 2,Boarding Time,Date 2,Passenger Name,To 2,Baggage,Carrier,Flight No.,From 2,To,Date,Class,From,Ticket No.
0,boarding_passes/boarding-avkash.pdf,20A,G1,10:00 AM PST,20A,G1,Avkash Chauhan,10:00 AM PST,"April 20, 2022",Avkash Chauhan,Chicago,NO,UA,234,San Francisco,Chicago,"April 20, 2022",E,San Francisco,ETK-34236751E
1,boarding_passes/boarding-james-webb.pdf,1A,G1,10:00 AM PST,1A,G1,James Webb,10:00 AM PST,"April 20, 2022",James Webb,Chicago,YES,UA,234,San Francisco,Chicago,"April 20, 2022",B,San Francisco,ETK-34236748B
2,boarding_passes/boarding-james.pdf,25B,G1,10:00 AM PST,25B,G1,James Jackson,10:00 AM PST,"April 20, 2022",James Jackson,Chicago,YES,UA,234,San Francisco,Chicago,"April 20, 2022",E,San Francisco,ETK-34236750E
3,boarding_passes/boarding-libby.pdf,3D,G1,10:00 AM PST,3D,G1,Libby Herold,10:00 AM PST,"April 20, 2022",Libby Herold,Chicago,YES,UA,234,San Francisco,Chicago,"April 20, 2022",B,San Francisco,ETK-34236749B
4,boarding_passes/boarding-radha-s-kumar.pdf,34B,G1,10:00 AM PST,34B,G1,Radha S Kumar,10:00 AM PST,"April 20, 2022",Radha S Kumar,Chicago,YES,UA,234,San Francisco,Chicago,"April 20, 2022",E,San Francisco,ETK-34236747E
5,boarding_passes/boarding-sameer.pdf,34A,G1,10:00 AM PST,34A,G1,Sameer Kumar,10:00 AM PST,"April 20, 2022",Sameer Kumar,Chicago,YES,UA,234,San Francisco,Chicago,"April 20, 2022",E,San Francisco,ETK-34236746E
6,boarding_passes/boarding_pass_johannes.pdf,16F,G1,10:00 AM PST,16F,G1,Johannes Czylwik,10:00 AM PST,"April 20, 2022",Johannes Czylwik,Chicago,NO,UA,234,San Francisco,Chicago,"April 20, 2022",E,San Francisco,ETK-737268572620C


In [155]:
# Serialize the DataFrame to Parquet in an in-memory buffer rather than a local file.
parquet_file = io.BytesIO()
df_bp_info.to_parquet(parquet_file, engine = 'pyarrow')
# Rewind so a subsequent read starts at the beginning of the buffer.
parquet_file.seek(0)

0

In [156]:
# Destination container and blob path for the Parquet output.
container = 'kiosk'
blob_path = 'extracted_boarding_pass_details/boarding_pass_details.parquet'
# Client for the destination blob (rebinds the name blob_client).
blob_client = blob_service_client.get_blob_client(container = container, blob = blob_path)

In [159]:
# Rewind the buffer so the full Parquet payload is sent even when only this cell is
# re-run, and overwrite the destination blob so a repeated run does not fail because
# the blob already exists.
parquet_file.seek(0)
blob_client.upload_blob(data = parquet_file, overwrite = True)

{'etag': '"0x8DC7C067617C8DA"',
 'last_modified': datetime.datetime(2024, 5, 24, 15, 30, 37, tzinfo=datetime.timezone.utc),
 'content_md5': bytearray(b'lV\xec\xde\x1cK\xbcC\x89\x80rY\xff\xc9\xd9J'),
 'client_request_id': '919dba58-19e2-11ef-a117-8cf8c5f0a202',
 'request_id': 'ea852516-e01e-0066-55ef-ad2aeb000000',
 'version': '2024-05-04',
 'version_id': None,
 'date': datetime.datetime(2024, 5, 24, 15, 30, 36, tzinfo=datetime.timezone.utc),
 'request_server_encrypted': True,
 'encryption_key_sha256': None,
 'encryption_scope': None}