# Making sense of the world through vision

In [None]:
# Computer Vision
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials

# Face API
from azure.cognitiveservices.vision.face import FaceClient
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.vision.face.models import TrainingStatusType, Person
from azure.cognitiveservices.vision.face.models import APIErrorException

# Speech API
from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat
from azure.cognitiveservices.speech.audio import AudioOutputConfig
import azure.cognitiveservices.speech as speechsdk

# Other
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import requests, uuid, json
import numpy as np
import time
import os
from PIL import Image #, ImageDraw
from mpl_toolkits.axes_grid1 import AxesGrid
from IPython.display import display
from IPython.display import Image as IPythonImage
from IPython.display import Audio as IPythonAudio

In [None]:
# Log in to the Azure subscription first. The command is left commented out,
# so it has to be run by hand (for example from a terminal) when needed:
# az login --use-device-code

# Create the resource group "Build_2022_RG" in the westeurope location.
# The leading `!` hands the rest of the line to the system shell.
! az group create -n Build_2022_RG -l westeurope --output table

In [None]:
# Optional helper, disabled by default: lists the available account kinds.
#! az cognitiveservices account list-kinds --output table

# Create the "Build22-CognitiveEndpoint" account (kind CognitiveServices, SKU S0)
# inside the Build_2022_RG resource group, in westeurope.
! az cognitiveservices account create \
    --name Build22-CognitiveEndpoint \
    --resource-group Build_2022_RG \
    --kind CognitiveServices \
    --sku S0 \
    --location westeurope \
    --yes \
    --output table

# Query key1 of the account and redirect the command output into key.txt.
# NOTE(review): key.txt then holds a secret in plain text - keep it out of version control.
! az cognitiveservices account keys list \
    --name Build22-CognitiveEndpoint --resource-group Build_2022_RG \
    --query key1 > key.txt

# Print the endpoint property of the account.
!  az cognitiveservices account show \
    --name Build22-CognitiveEndpoint --resource-group Build_2022_RG \
    --query properties.endpoint

In [None]:
# Load the subscription key that was written to key.txt.
with open('key.txt') as f:
    key = f.readlines()

# Only the first line is used; double quotes and surrounding whitespace are removed.
subscription_key = key[0].replace("\"", "").strip() if key else ""

# An empty file would otherwise surface as an opaque IndexError here, or as an
# authentication failure much later in the notebook, so fail early and clearly.
if not subscription_key:
    raise RuntimeError(
        "key.txt does not contain a subscription key - "
        "check the output of the 'az cognitiveservices account keys list' command."
    )

# 




# Computer vision

In [None]:
# Endpoint used for the Computer Vision calls.
endpoint = "https://westeurope.api.cognitive.microsoft.com/"

# Wrap the subscription key in a credentials object and build the client from it.
cv_credentials = CognitiveServicesCredentials(subscription_key)
computervision_client = ComputerVisionClient(endpoint, cv_credentials)

## Describe what is on an image

In [None]:
# Path of the sample picture (despite the name, this is a local file path, not a URL).
image_url = "images/amsterdam-gaa24fa0bd_1280.jpg"

# Render the picture inline in the notebook.
sample_image = IPythonImage(filename=image_url)
display(sample_image)

In [None]:
# Send the image bytes to the describe operation of the Computer Vision client.
with open(image_url, "rb") as image_stream:
    description_results = computervision_client.describe_image_in_stream(image_stream)

# Print every returned caption together with its confidence (two decimals).
for description in description_results.captions:
    confidence_text = "%.2f" % description.confidence
    print(description.text, "| Confidence: ", confidence_text)

## Detect what is on the image

In [None]:
# Detect objects in the image and draw a labelled bounding box around each one.
# The file only needs to stay open for the service call itself.
with open(image_url, "rb") as image_stream:
    detect_objects_results_remote = computervision_client.detect_objects_in_stream(image_stream)

im = plt.imread(image_url)

# Figure size in inches is the pixel size divided by 80; the axes fill the whole figure.
fig = plt.figure(figsize=(im.shape[1] / 80, im.shape[0] / 80))
ax = plt.axes((0, 0, 1, 1))

# Display the image
ax.imshow(im, origin='upper')

# Overlay the information.
# The loop variable is deliberately NOT called `object`: notebook cells share one
# global namespace, so that name would replace the builtin for every later cell.
for detected_object in detect_objects_results_remote.objects:
    box = detected_object.rectangle
    color = (np.random.rand(), np.random.rand(), np.random.rand())
    rect = patches.Rectangle((box.x, box.y), box.w, box.h,
                             linewidth=2, edgecolor=color, facecolor='none')
    ax.add_patch(rect)

    # The label is positioned in axes fractions (transform=ax.transAxes); the y
    # value is flipped because image rows count from the top edge.
    ax.text(
        box.x / im.shape[1], 1 - box.y / im.shape[0], detected_object.object_property,
        horizontalalignment='left',
        verticalalignment='bottom',
        fontsize=16,
        color='w',
        backgroundcolor=color,
        transform=ax.transAxes
    )

ax.axis('off')
plt.show()

# 





# Get more insights on Faces

In [None]:
# Build an authenticated FaceClient from the endpoint and the subscription key.
face_api_endpoint = "https://westeurope.api.cognitive.microsoft.com/"
face_credentials = CognitiveServicesCredentials(subscription_key)
face_client = FaceClient(face_api_endpoint, face_credentials)

In [None]:
# Group photo used for face detection and identification.
mf_image = "face-photos/tech-a11y-crew.jpg"

# Render the photo inline in the notebook.
group_photo = IPythonImage(filename=mf_image)
display(group_photo)

In [None]:
# Face attributes requested from the detect call.
requested_attributes = ['age', 'smile', 'facialHair', 'glasses', 'emotion', 'hair', 'accessories']

# Detect faces in the image; the file is only needed for the upload itself.
with open(mf_image, "rb") as image_stream:
    detected_faces = face_client.face.detect_with_stream(
        image_stream, return_face_attributes=requested_attributes)

# Display the result: one crop per face followed by a few of its attributes.
# The context manager closes the image file once all crops have been shown.
with Image.open(mf_image) as pil_img:
    for face in detected_faces:
        box = face.face_rectangle
        attributes = face.face_attributes
        img2 = pil_img.crop((box.left, box.top, box.left + box.width, box.top + box.height))
        display(img2)
        print (f'Face id: {face.face_id}')
        print (f'smile: {attributes.smile}')
        print (f'age: {attributes.age}')
        print (f'facial_hair moustache: {attributes.facial_hair.moustache}')
        print (f'facial_hair beard: {attributes.facial_hair.beard}')
        print (f'facial_hair sideburns: {attributes.facial_hair.sideburns}')
        print (f'glasses: {attributes.glasses}')
        print (f'emotion: {attributes.emotion}')
        print(" ==")

print()

# Save the ID of the first face (the original note says it is meant for Find Similar).
# When no face was detected, indexing [0] would raise IndexError, so fall back to None.
first_image_face_ID = detected_faces[0].face_id if detected_faces else None
if first_image_face_ID is None:
    print(f"No faces were detected in {mf_image}.")

## Train the face API to recognize people

In [None]:
# Show the dataset: every file in the training folder, side by side in one row.
path = r"face-photos/train"
random_filenames = [os.path.join(path, train_img) for train_img in os.listdir(path)]

# One grid column per photo instead of a fixed 6, so that grid[i] cannot run out
# of cells when the folder holds more pictures. At least one column is kept so
# the grid can still be built for an empty folder.
n_columns = max(len(random_filenames), 1)
grid = AxesGrid(plt.figure(1, (20,20)), 111, nrows_ncols=(1, n_columns), axes_pad=0, label_mode="1")

for i, img_name in enumerate(random_filenames):
    im = plt.imread(img_name)
    grid[i].imshow(im, aspect='auto', extent=(0,0.8,0,1), alpha=1, origin='upper', zorder=-1)

In [None]:
# Create a face group
PERSON_GROUP_ID = "tech-a11y-crew"

# Remove a group left over from an earlier run so this cell can be re-run.
# On a fresh resource there is nothing to delete; the service is expected to
# answer with HTTP 404 in that case, which is safe to ignore. Any other failure
# (authentication, throttling, ...) is re-raised unchanged.
try:
    face_client.person_group.delete(person_group_id=PERSON_GROUP_ID)
except APIErrorException as delete_error:
    status_code = getattr(delete_error.response, "status_code", None)
    if status_code != 404:
        raise

face_client.person_group.create(person_group_id=PERSON_GROUP_ID, name=PERSON_GROUP_ID)

In [None]:
# Add persons and face photos to the group: one person per file in the folder.
path = r"face-photos/train"

for file_name in os.listdir(path):
    # The part of the file name before the first dot is used as the person's name.
    name = file_name.partition(".")[0]
    print("Adding:"+name)

    # Read-only binary mode is enough for an upload, and the context manager
    # closes the handle after each photo instead of leaking one per file.
    with open(os.path.join(path, file_name), 'rb') as photo_stream:
        # Create a person
        person = face_client.person_group_person.create(PERSON_GROUP_ID, name)

        # Add a face to the person
        face_client.person_group_person.add_face_from_stream(PERSON_GROUP_ID, person.person_id, photo_stream)

In [None]:
# Train the person group, then poll every two seconds until training ends.
face_client.person_group.train(PERSON_GROUP_ID)

while True:
    training_status = face_client.person_group.get_training_status(PERSON_GROUP_ID)
    print("Training status: {}.".format(training_status.status))

    # Compare by value rather than identity so the check does not depend on the
    # status being the very same enum object.
    if training_status.status == TrainingStatusType.succeeded:
        break
    if training_status.status == TrainingStatusType.failed:
        face_client.person_group.delete(person_group_id=PERSON_GROUP_ID)
        # `sys` is not imported in this notebook, so the former sys.exit() call
        # raised NameError; an explicit exception stops the run with the intended message.
        raise RuntimeError('Training the person group has failed.')

    time.sleep(2)

## Identify people in the image

In [None]:
# Detect faces in the group photo.
with open(mf_image, "rb") as image_stream:
    # detection_03 is requested explicitly (chosen by the original author for better results).
    faces = face_client.face.detect_with_stream(image_stream, detection_model='detection_03')

face_ids = [face.face_id for face in faces]

# Identify faces.
# The identify operation is documented to accept between 1 and 10 face IDs per
# call, so the IDs are sent in batches; with no detected faces no call is made
# and `results` simply stays empty.
IDENTIFY_BATCH_SIZE = 10
results = []
for start in range(0, len(face_ids), IDENTIFY_BATCH_SIZE):
    batch = face_ids[start:start + IDENTIFY_BATCH_SIZE]
    results.extend(face_client.face.identify(batch, PERSON_GROUP_ID))

# Map each face ID to the person record of its candidate.
identified_persons = {}

for result in results:
    for candidate in result.candidates:
        identified_person = face_client.person_group_person.get(PERSON_GROUP_ID, candidate.person_id)
        print("Found: "+identified_person.name)
        identified_persons[result.face_id] = identified_person

In [None]:
# Draw the group photo with a box around every detected face and, where the
# face was identified, the person's name next to the box.
im = plt.imread(mf_image)
img_height, img_width = im.shape[0], im.shape[1]

# Figure sized from the pixel dimensions; the axes cover the full figure.
fig = plt.figure(figsize = (img_width/70, img_height/70))
ax = plt.axes((0,0,1,1))
ax.imshow(im, origin='upper')

for face in faces:
    box = face.face_rectangle
    color = (np.random.rand(), np.random.rand(), np.random.rand())
    ax.add_patch(
        patches.Rectangle((box.left, box.top), box.width, box.height,
                          linewidth=3, edgecolor=color, facecolor='none')
    )

    # Faces without an identified person only get the rectangle.
    if face.face_id not in identified_persons:
        continue

    # Label coordinates are axes fractions; y is flipped because rows count from the top.
    label_x = (1/img_width*box.left)
    label_y = 1-(1/img_height*box.top)
    ax.text(
        label_x, label_y,
        f"{identified_persons[face.face_id].name}",
        horizontalalignment='left', verticalalignment='bottom', fontsize=16,
        color='w', backgroundcolor=color, transform=ax.transAxes
    )

ax.axis('off')
plt.show()

# 




## Read text in images

In [None]:
# Picture containing the text to read (a local file path, despite the "url" in the name).
handwriting_image_url = "images/text.png"

# Render the picture inline in the notebook.
text_image = IPythonImage(filename=handwriting_image_url)
display(text_image)

In [None]:
print("===== Start =====")
# Submit the local image as a stream. raw=True returns the raw response, whose
# headers carry the Operation-Location of the read operation.
with open(handwriting_image_url, "rb") as image_stream:
    read_response = computervision_client.read_in_stream(image_stream,  raw=True)

read_operation_location = read_response.headers["Operation-Location"]
# Grab the ID from the URL
operation_id = read_operation_location.split("/")[-1]

# Poll the result once per second until the operation leaves the pending states.
while True:
    read_result = computervision_client.get_read_result(operation_id)
    if read_result.status not in ['notStarted', 'running']:
        break
    time.sleep(1)

# Leaving the pending states does not mean success: stop with a clear message
# instead of failing later on a missing analyze_result.
if read_result.status != OperationStatusCodes.succeeded:
    raise RuntimeError(f"Read operation ended with status {read_result.status!r}.")

# Print the recognized text line by line.
for text_result in read_result.analyze_result.read_results:
    for line in text_result.lines:
        print(line.text)

print("===== Done =====")

In [None]:
# Draw the text picture with a rectangle around every recognized line and
# collect the recognized text into `full_text` for the following cells.
im = plt.imread(handwriting_image_url)
img_height, img_width = im.shape[0], im.shape[1]

# Figure sized from the pixel dimensions; the axes cover the full figure.
fig = plt.figure(figsize = (img_width/100, img_height/100))
ax = plt.axes((0,0,1,1))
ax.imshow(im, origin='upper')

text_parts = []
for text_result in read_result.analyze_result.read_results:
    for line in text_result.lines:
        color = (np.random.rand(), np.random.rand(), np.random.rand())

        # Rectangle from entries 0/1 (anchor), 2 (right x) and 5 (lower y) of the bounding box.
        bbox = line.bounding_box
        box_width = bbox[2]-bbox[0]
        box_height = bbox[5]-bbox[1]
        ax.add_patch(
            patches.Rectangle((bbox[0], bbox[1]), box_width, box_height,
                              linewidth=6, edgecolor=color, facecolor='none')
        )

        # Every line is followed by a single space, including the last one.
        text_parts.append(line.text + " ")

full_text = "".join(text_parts)

ax.axis('off')
plt.show()

In [None]:
# Speech configuration for the westeurope region, set to synthesize
# en-GB speech with the "en-GB-LibbyNeural" voice.
speech_config = SpeechConfig(subscription=subscription_key, region="westeurope")
speech_config.speech_synthesis_voice_name = "en-GB-LibbyNeural"
speech_config.speech_synthesis_language = "en-GB"

In [None]:
# Synthesize the recognized text into a WAV file named after the configured voice.
audio_file=f'{speech_config.speech_synthesis_voice_name}.wav'
audio_config = AudioOutputConfig(filename=audio_file)
synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
synthesis_result = synthesizer.speak_text(full_text)

# Check the outcome before playing the file: without this, a cancelled
# synthesis (e.g. wrong key or region) would go unnoticed until playback.
if synthesis_result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
    cancellation = synthesis_result.cancellation_details
    error_details = cancellation.error_details if cancellation is not None else ""
    raise RuntimeError(
        f"Speech synthesis did not complete ({synthesis_result.reason}). {error_details}"
    )

# Last expression of the cell: renders an audio player that starts automatically.
IPythonAudio(audio_file,autoplay=True)

In [None]:
# Translate the recognized text with the Translator REST API.
endpoint = "https://api.cognitive.microsofttranslator.com"

# Add your location, also known as region. The default is global.
# This is required if using a Cognitive Services resource.
location = "westeurope"

path = '/translate'
constructed_url = endpoint + path

# Source language and the list of target languages.
params = {
    'api-version': '3.0',
    'from': 'en',
    'to': ['nl', 'ar', 'af']
}

headers = {
    'Ocp-Apim-Subscription-Key': subscription_key,
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

# You can pass more than one object in body.
body = [{
    'text': full_text
}]

# The timeout keeps the cell from hanging forever on a stalled connection.
# `request` actually holds the HTTP response object returned by requests.post.
request = requests.post(constructed_url, params=params, headers=headers, json=body, timeout=30)

# Surface HTTP errors (bad key, wrong region, ...) right here instead of as a
# confusing KeyError when an error payload is indexed below.
request.raise_for_status()
response = request.json()

# One translation per requested target language.
for translation in response[0]['translations']:
    print(translation['text'])


In [None]:
# Pick the Afrikaans translation by its language code rather than by position,
# so the choice does not silently change when the list of target languages is
# reordered or extended.
TARGET_LANGUAGE = 'af'
translated_text = next(
    (item['text'] for item in response[0]['translations'] if item['to'] == TARGET_LANGUAGE),
    None,
)
if translated_text is None:
    raise LookupError(f"The translation response contains no '{TARGET_LANGUAGE}' entry.")

print(translated_text)

In [None]:
# Speech configuration for af-ZA speech with the "af-ZA-AdriNeural" voice.
speech_config = SpeechConfig(subscription=subscription_key, region="westeurope")
speech_config.speech_synthesis_language = "af-ZA"
speech_config.speech_synthesis_voice_name = "af-ZA-AdriNeural"

# Synthesize the translated text into a WAV file named after the configured voice.
audio_file=f'{speech_config.speech_synthesis_voice_name}.wav'
audio_config = AudioOutputConfig(filename=audio_file)
synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
synthesis_result = synthesizer.speak_text(translated_text)

# Check the outcome before playing the file: without this, a cancelled
# synthesis (e.g. wrong key or region) would go unnoticed until playback.
if synthesis_result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
    cancellation = synthesis_result.cancellation_details
    error_details = cancellation.error_details if cancellation is not None else ""
    raise RuntimeError(
        f"Speech synthesis did not complete ({synthesis_result.reason}). {error_details}"
    )

# Last expression of the cell: renders an audio player that starts automatically.
IPythonAudio(audio_file,autoplay=True)