In [None]:
from transformers import pipeline
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
from IPython.display import Audio

# Build the object detector, load the sample picture, and run inference on it.
# The settings are gathered in one mapping so the task and checkpoint are easy
# to spot and swap.
detector_settings = {
    "task": "object-detection",
    "model": "facebook/detr-resnet-50",
}
object_detection = pipeline(**detector_settings)

input_image = Image.open("images/image_2.jpeg")

# Each returned entry is later read for its "score" and "label" fields.
detections = object_detection(input_image)

# Parse the detections and describe the confidently detected labels in a sentence.
_VOWELS = frozenset("aeiou")

# Plurals that the simple suffix rules below would get wrong.
_IRREGULAR_PLURALS = {
    "person": "people",
    "knife": "knives",
    "mouse": "mice",
    "sheep": "sheep",
    "skis": "skis",
    "scissors": "scissors",
    "broccoli": "broccoli",
}


def _with_article(label):
    """Prefix *label* with "an" when it starts with a vowel, otherwise "a"."""
    article = "an" if label[:1].lower() in _VOWELS else "a"
    return f"{article} {label}"


def _pluralize(label):
    """Return the English plural of *label* using common suffix rules."""
    if label in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[label]
    if label.endswith(("s", "x", "z", "ch", "sh")):
        return f"{label}es"
    # consonant + "y" -> "ies" (e.g. "puppy" -> "puppies", but "toy" -> "toys")
    if len(label) > 1 and label.endswith("y") and label[-2].lower() not in _VOWELS:
        return f"{label[:-1]}ies"
    return f"{label}s"


def detr_pipeline_to_text(detections, conf_threshold=0.7):
    """Summarise object detections as one English sentence.

    Args:
        detections: Iterable of mappings, each providing a ``"score"`` and a
            ``"label"`` entry.
        conf_threshold: Minimum score (inclusive) a detection needs in order
            to be mentioned.

    Returns:
        A sentence listing each retained label once, in order of first
        appearance, with a count when it occurs more than once; or a fixed
        message when no detection reaches the threshold.
    """
    counts = Counter(
        obj["label"] for obj in detections if obj["score"] >= conf_threshold
    )

    if not counts:
        return "No confident objects were detected in the image."

    parts = []
    for label, count in counts.items():
        name = str(label)
        if count == 1:
            parts.append(_with_article(name))
        else:
            parts.append(f"{count} {_pluralize(name)}")

    if len(parts) == 1:
        return f"The image contains {parts[0]}."
    return "The image contains " + ", ".join(parts[:-1]) + " and " + parts[-1] + "."
       
result_text = detr_pipeline_to_text(detections, conf_threshold=0.7)

print(result_text)

# Split text into chunks
# `separators` takes a list of candidates tried in order. The description is a
# single sentence with no newlines, so sentence, clause and word boundaries are
# listed as fallbacks; with "\n" alone an over-long description could never be
# broken into chunks of at most `chunk_size` characters.
rt = RecursiveCharacterTextSplitter(
    separators=["\n", ". ", ", ", " "],
    chunk_size=100,
    chunk_overlap=0,
)
docs = rt.split_text(result_text)

# Initialize the TTS pipeline
synthesizer = pipeline(
    task="text-to-speech",
    model="suno/bark-small",
)

# Generate speech from text chunks
sampling_rate = 24000  # fallback only; overwritten by the rate the pipeline reports
audio_chunks = []
for chunk in docs:
    speech = synthesizer(chunk)
    # Flatten so every chunk is 1-D regardless of the shape the pipeline returns.
    audio_chunks.append(np.asarray(speech["audio"]).ravel())
    sampling_rate = speech["sampling_rate"]

# Join once at the end instead of re-copying the growing buffer per chunk.
audio_all = np.concatenate(audio_chunks) if audio_chunks else np.array([])

# Play the audio
print(sampling_rate)
Audio(audio_all, rate=sampling_rate)