In [1]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read through the local filesystem for the rest of this session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Google Sheet - get a handle on it

In [2]:
!pip install --upgrade -q gspread

In [3]:
# Authenticate the current Colab user with Google.
from google.colab import auth
auth.authenticate_user()

# Fetch the default Google credentials and use them to authorize a gspread client.
import gspread
from google.auth import default
creds, _ = default()

# NOTE(review): the name `gc` would shadow the stdlib `gc` module if that were ever imported here.
gc = gspread.authorize(creds)

# Open the spreadsheet named 'fieldbook_pages' and keep a handle on its "pages" worksheet.
sh = gc.open('fieldbook_pages')
worksheet_pages = sh.worksheet(title="pages")

# Azure Cognitive Services
+ Authenticate and connect
+ For each link in the spreadsheet, transcribe the linked page image and store the text in the sheet

In [4]:
import os
import pandas as pd

# Load the Azure Computer Vision key and endpoint from a JSON file and publish
# them as environment variables for the cells that create the client.

directory_config = "/content/drive/MyDrive/azure_config/" # Change this to suit your machine

path_config = os.path.join(directory_config,"cv.json")

# Verify the configuration file exists. Stop here if it does not, so the
# failure is reported clearly instead of surfacing later inside pd.read_json.
if os.path.exists(path_config):
    print("Success.",path_config, "exists.")
else:
    print("Failure.",path_config, "does not exist.")
    raise FileNotFoundError("Azure configuration file not found: " + path_config)

# Read the JSON file into a DataFrame; the first row of each column is used below.
# Do not print the values: the subscription key is a secret and notebook
# outputs are saved with the file.
df_config = pd.read_json(path_config)

# Store as environment variables
os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY'] = df_config['COMPUTER_VISION_SUBSCRIPTION_KEY'].iloc[0]
os.environ['COMPUTER_VISION_ENDPOINT'] = (df_config['COMPUTER_VISION_ENDPOINT'].iloc[0])

# Do some basic validation: the key is expected to be 32 characters long.
if len(os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY']) == 32:
    print("Success, COMPUTER_VISION_SUBSCRIPTION_KEY is loaded.")
else:
    print("Error, The COMPUTER_VISION_SUBSCRIPTION_KEY is not the expected length, please check it.")

Success. /content/drive/MyDrive/azure_config/cv.json exists.
Success, COMPUTER_VISION_SUBSCRIPTION_KEY is loaded.


In [5]:
# Install what is required to connect to Azure Cognitive Services Computer Vision
# Run this once on your machine. If you are using Google Colab, run this once per session.
!pip install --upgrade azure-cognitiveservices-vision-computervision


Collecting azure-cognitiveservices-vision-computervision
  Downloading azure_cognitiveservices_vision_computervision-0.9.0-py2.py3-none-any.whl (39 kB)
Collecting msrest>=0.5.0
  Downloading msrest-0.6.21-py2.py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 2.2 MB/s 
[?25hCollecting azure-common~=1.1
  Downloading azure_common-1.1.28-py2.py3-none-any.whl (14 kB)
Collecting isodate>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 287 kB/s 
Installing collected packages: isodate, msrest, azure-common, azure-cognitiveservices-vision-computervision
Successfully installed azure-cognitiveservices-vision-computervision-0.9.0 azure-common-1.1.28 isodate-0.6.1 msrest-0.6.21


In [6]:
# Run this once per session

# Standard library first, then the Azure SDK pieces used by the cells below.
import sys

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import (
    OperationStatusCodes,
    VisualFeatureTypes,
)
from msrest.authentication import CognitiveServicesCredentials

# The subscription key must already be present in the environment;
# otherwise explain what is missing and stop this cell.
if 'COMPUTER_VISION_SUBSCRIPTION_KEY' not in os.environ:
    print("\nSet the COMPUTER_VISION_SUBSCRIPTION_KEY environment variable.\n**Restart your shell or IDE for changes to take effect.**")
    sys.exit()
subscription_key = os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY']

# Same requirement for the service endpoint.
if 'COMPUTER_VISION_ENDPOINT' not in os.environ:
    print("\nSet the COMPUTER_VISION_ENDPOINT environment variable.\n**Restart your shell or IDE for changes to take effect.**")
    sys.exit()
endpoint = os.environ['COMPUTER_VISION_ENDPOINT']

# Build the client that every later cell uses to talk to Azure Cognitive Services.
computervision_client = ComputerVisionClient(
    endpoint,
    CognitiveServicesCredentials(subscription_key),
)


In [7]:
# Read just one file
import time
# This section is adapted from:
# https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/python/ComputerVision/ComputerVisionQuickstart.py


# <snippet_read_call>
print("===== Read File - remote =====")
# Get an image with text. Set the url of the image to transcribe.
read_image_url = "https://pi.lib.uchicago.edu/1001/org/ochre/c14ddaa8-c833-42cc-be0b-a89624f1302c&load"

# Call API with URL and raw response (allows you to get the operation location). Call Azure using computervision_client with the URL.
read_response = computervision_client.read(read_image_url,  raw=True, model_version="2022-01-30-preview", language = "fr")
# </snippet_read_call>

# <snippet_read_response>
# Get the operation location (URL with an ID at the end) from the response
read_operation_location = read_response.headers["Operation-Location"]
# Grab the ID from the URL
operation_id = read_operation_location.split("/")[-1]

# Call the "GET" API and wait for it to retrieve the results
while True:
    read_result = computervision_client.get_read_result(operation_id)
    if read_result.status not in ['notStarted', 'running']:
        break
    # Wait between polls. The sleep must sit at loop level: placed after
    # `break` it is unreachable and the loop hammers the service.
    time.sleep(1)

# Print the detected text, line by line, each followed by its bounding box
if read_result.status == OperationStatusCodes.succeeded:
    for text_result in read_result.analyze_result.read_results:
        for line in text_result.lines:
            print(line.text)
            print(line.bounding_box)
print()

# </snippet_read_response>

===== Read File - remote =====
1 Juin.
[125.0, 30.0, 210.0, 30.0, 211.0, 59.0, 126.0, 58.0]
C'st un terrain situe a l' Est on la Renti ; inst contyn
[67.0, 85.0, 644.0, 89.0, 644.0, 122.0, 67.0, 117.0]
Le pek voyia à & maism ro'sine a fm muriau
[132.0, 340.0, 637.0, 340.0, 637.0, 371.0, 132.0, 370.0]
À la rent , pos ihr con Na Ih tivain - Jun 10m x3m
[64.0, 473.0, 628.0, 469.0, 628.0, 500.0, 64.0, 503.0]



In [None]:
# Function to transcribe the image behind a link
import time

def get_text_from_image(read_image_url, language="fr", model_version="2022-01-30-preview", poll_interval=1):
    """Transcribe the text in a remote image with the Azure Read API.

    Submits the image URL through the module-level ``computervision_client``,
    polls until the operation leaves the 'notStarted'/'running' states, and
    collects the recognised lines.

    Args:
        read_image_url: URL of the image to transcribe.
        language: Language hint passed to the Read call (default "fr").
        model_version: Read model version passed to the call.
        poll_interval: Seconds to wait between status polls.

    Returns:
        The recognised text, one line per detected line, each terminated by
        "\n". An empty string when the operation does not succeed.
    """
    # raw=True exposes the response headers, which carry the operation location.
    read_response = computervision_client.read(
        read_image_url, raw=True, model_version=model_version, language=language
    )

    # The operation ID is the last path segment of the Operation-Location URL.
    read_operation_location = read_response.headers["Operation-Location"]
    operation_id = read_operation_location.split("/")[-1]

    # Poll until the service reports a final state.
    while True:
        read_result = computervision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        # Must be at loop level; after `break` it would never run and the
        # loop would poll without any delay.
        time.sleep(poll_interval)

    if read_result.status != OperationStatusCodes.succeeded:
        return ""

    # Join every detected line across all pages, newline-terminated.
    return "".join(
        str(line.text) + "\n"
        for text_result in read_result.analyze_result.read_results
        for line in text_result.lines
    )

In [None]:
# Transcribe all rows in the sheet.
# Column 4 is read as the image link; column 5 receives the transcription.
row = 2  # presumably row 1 is a header row — confirm against the sheet
while True:
    link = worksheet_pages.cell(row, 4).value
    # Stop at the first empty link cell BEFORE calling the service; checking
    # only in the loop condition would send the empty value to the API first.
    if not link:
        break
    print(row,link)
    text_from_page = get_text_from_image(link)
    # NOTE(review): the leading apostrophe presumably makes Sheets store the
    # value as literal text — confirm.
    worksheet_pages.update_cell(row, 5, "'"+text_from_page)
    row = row + 1

2 https://pi.lib.uchicago.edu/1001/org/ochre/f5337e52-97e4-4251-8d85-22aca943d220&load
3 https://pi.lib.uchicago.edu/1001/org/ochre/18a3aadc-d66c-4818-8a5e-e8e620267012&load
4 https://pi.lib.uchicago.edu/1001/org/ochre/94f8c988-05ba-4b24-b1bc-da7c791ad6b4&load
5 https://pi.lib.uchicago.edu/1001/org/ochre/090f5361-fbdc-4a1d-a43f-09ed36e2f018&load
6 https://pi.lib.uchicago.edu/1001/org/ochre/26225434-659f-4cb5-959c-8faff98a7a71&load
7 https://pi.lib.uchicago.edu/1001/org/ochre/17c2575f-004c-4988-af7a-e2ba5c5a85c7&load
8 https://pi.lib.uchicago.edu/1001/org/ochre/1707e378-9f04-4629-9e29-a61414c3aff6&load
9 https://pi.lib.uchicago.edu/1001/org/ochre/cb288380-884c-402e-9402-a10965ccc14a&load
10 https://pi.lib.uchicago.edu/1001/org/ochre/35bedc66-58a8-4a5b-b16d-9dcddc6b3b5d&load
11 https://pi.lib.uchicago.edu/1001/org/ochre/93d8f8f0-02d7-466a-8ecf-968563eb8ac8&load
12 https://pi.lib.uchicago.edu/1001/org/ochre/51898b50-712f-4066-b840-633c8e5fd122&load
13 https://pi.lib.uchicago.edu/1001/org/

ComputerVisionOcrErrorException: ignored