In [1]:
!pip install pytesseract opencv-python pillow torch torchvision transformers gtts easyocr SpeechRecognition pyaudio ultralytics playsound ipywidgets





In [23]:
# All required imports
import cv2
import pytesseract
from PIL import Image
from gtts import gTTS
import os
import torch
from torchvision import transforms
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
import easyocr
import speech_recognition as sr
from ultralytics import YOLO
import playsound

# Setup: Check CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Tesseract Path (Windows)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Voice Command Recognition (Allow partial matches like 'text', 'scene', 'object')
def recognize_command():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Say your command: 'text', 'scene', 'object'...")
        audio = recognizer.listen(source)
        try:
            command = recognizer.recognize_google(audio)
            print("Recognized Command:", command)
            return command.lower()
        except sr.UnknownValueError:
            print("Could not understand audio.")
            return ""
        except sr.RequestError:
            print("Request error.")
            return ""

# Manual Image Path Input
image_path = input("Enter full image file path (e.g., C:\\Users\\khush\\Pictures\\image.jpg): ").strip().strip('"')

# EasyOCR - Text Extraction
def extract_text_easyocr(image_path):
    reader = easyocr.Reader(['en'])
    results = reader.readtext(image_path)
    text = " ".join([res[1] for res in results])
    return text

# Image Preprocessing
def preprocess_image(image_path):
    image = Image.open(image_path).convert('RGB')
    return image

# Preprocess image
try:
    image = preprocess_image(image_path)
except Exception as e:
    print("Error opening image:", e)
    exit()

# Scene Description using BLIP
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

def generate_caption(image):
    inputs = processor(image, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption

# Sentiment Analysis Pipeline
classifier = pipeline("sentiment-analysis")

# Object Detection using YOLOv8
model_yolo = YOLO('yolov8n.pt')  # Nano version (lightweight)

def detect_objects_yolo(image_path):
    results = model_yolo(image_path)
    objects = []
    for result in results:
        boxes = result.boxes
        names = result.names
        for box in boxes:
            cls_id = int(box.cls[0])
            objects.append(names[cls_id])
    return list(set(objects))

# Text-to-Speech Output (TTS)
def speak(text):
    tts = gTTS(text=text, lang='en')
    tts.save("output.mp3")
    playsound.playsound("output.mp3")
    os.remove("output.mp3")

# Start Voice Command and Perform Actions
command = recognize_command()

if 'text' in command:
    text = extract_text_easyocr(image_path)
    print("Detected Text:", text)
    speak(f"The text says: {text}")

elif 'scene' in command or 'describe' in command:
    scene_caption = generate_caption(image)
    scene_sentiment = classifier(scene_caption)[0]['label']
    print("Scene Description:", scene_caption)
    print("Scene Mood:", scene_sentiment)
    speak(f"The scene shows: {scene_caption}. It feels {scene_sentiment}.")

elif 'object' in command or 'detect' in command:
    objects_detected = detect_objects_yolo(image_path)
    print("Objects Detected:", objects_detected)
    speak(f"I can see: {', '.join(objects_detected)}")

else:
    print("No valid command detected. Please say 'text', 'scene', or 'object'.")
    speak("Sorry, I did not catch a valid command.")

print("\n--- Visual Assistant Task Completed ---")


Using device: cpu


Enter full image file path (e.g., C:\Users\khush\Pictures\image.jpg):  "C:\Users\khush\OneDrive\Pictures\Screenshots\Screenshot 2024-05-26 134828.png"


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Say your command: 'text', 'scene', 'object'...
Recognized Command: describe
Scene Description: a man wearing headphones and looking at the camera
Scene Mood: NEGATIVE

--- Visual Assistant Task Completed ---
