In [1]:
import logging, time
import re
import pyperclip
from PIL import Image
from tkinter import Tk, Canvas
import pyautogui, keyboard
from PIL import Image
import easyocr
import sounddevice as sd
import torch
from numpy.dtypes import Float64DType
import os, numpy as np, torch, sounddevice as sd
from bark import SAMPLE_RATE, generate_audio, preload_models
from numpy.dtypes import Float64DType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
reader = easyocr.Reader(['en'])

def extract_text_from_image(pil_image: Image.Image) -> str:
    """
    מקבל PIL.Image, מחזיר את הטקסט המודפס שבה באמצעות EasyOCR.
    """
    # המרה ל-numpy array
    img_arr = np.array(pil_image.convert("RGB"))
    # OCR (detail=0 מחזיר רק המחרוזות)
    results = reader.readtext(img_arr, detail=0)
    return " ".join(results)

In [3]:
torch.set_grad_enabled(False)     # disable gradients
torch.backends.cudnn.benchmark = True  # speed up GPU conv

_safe = [np.core.multiarray.scalar, np.dtype, Float64DType]
torch.serialization.add_safe_globals(_safe)

_orig_load = torch.load
def _load_no_weights_only(*args, **kwargs):
    kwargs.pop('weights_only', None)
    kwargs['weights_only'] = False
    return _orig_load(*args, **kwargs)
torch.load = _load_no_weights_only

# --- 2. אופטימיזציה ל‑GPU ---
use_gpu = torch.cuda.is_available()
preload_models(
    text_use_gpu=use_gpu,
    coarse_use_gpu=use_gpu,
    fine_use_gpu=use_gpu,
    codec_use_gpu=use_gpu,
    text_use_small=True,
    coarse_use_small=True,
    fine_use_small=True
)

VOICES_DIR = "voices"
voice_cache = {
    fn[:-4]: np.load(os.path.join(VOICES_DIR, fn))
    for fn in os.listdir(VOICES_DIR) if fn.endswith(".npz")
}

INFO:bark.generation:model loaded: 92.7M params, 1.292 loss
INFO:bark.generation:model loaded: 94.2M params, 2.99 loss
INFO:bark.generation:model loaded: 85.0M params, 2.515 loss
  WeightNorm.apply(module, name, dim)


In [4]:
MAX_CHUNK_LEN = 225 

def _split_by_length(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
    words = text.split()
    chunks, current = [], ""
    for w in words:
        if len(current) + len(w) + 1 <= max_len:
            current += (" " + w if current else w)
        else:
            chunks.append(current)
            current = w
    if current:
        chunks.append(current)
    return chunks

In [5]:
DEFAULT_VOICE = "voice_39"

def speak_text(
    text: str,
    voice_name: str = DEFAULT_VOICE,
    text_temp: float = 0.5,
    waveform_temp: float = 0.3
):
    data = voice_cache.get(voice_name)
    if data is None:
        raise ValueError(f"Voice '{voice_name}' not found in {VOICES_DIR}")

    for chunk in _split_by_length(text):
        with torch.no_grad():
            audio = generate_audio(
                chunk,
                history_prompt={
                    "semantic_prompt": data["semantic"],
                    "coarse_prompt":   data["coarse"],
                    "fine_prompt":     data["fine"]
                },
                text_temp=text_temp,
                waveform_temp=waveform_temp
            ).astype(np.float32)
        sd.default.samplerate = SAMPLE_RATE
        sd.default.channels   = 1 if audio.ndim == 1 else audio.shape[1]
        sd.play(audio, blocking=True)


In [6]:
#speak_text("Hi, this is the voice test to check that the function is working properly.")
#speak_text("Hello from local Bark!")

In [7]:
#for i in range(1,100):
#    voice_name = "voice_"+str(i)
#    print("Testing voice:", voice_name)
#    speak_text("Hi, this is the voice test to check that the function is working properly.",voice_name)

In [8]:
def read_selection():
    # מעתיק את הטקסט המודגש ללוח
    keyboard.press_and_release("ctrl+c")
    time.sleep(0.1)  # מעט המתנה להעתקה
    text = pyperclip.paste()
    if text.strip():
        print("Reading selection:\n", text)
        speak_text(text)
    else:
        print("No text found in clipboard.")

In [9]:
def select_region():
    coords = {"start": None, "end": None}
    root = Tk(); root.attributes("-fullscreen", True, "-alpha", 0.3, "-topmost", True)
    canvas = Canvas(root, cursor="cross"); canvas.pack(fill="both", expand=True)

    def on_press(e): coords["start"] = (e.x, e.y); canvas.delete("rect")
    def on_drag(e):
        coords["end"] = (e.x, e.y)
        canvas.delete("rect")
        x1, y1 = coords["start"]
        canvas.create_rectangle(x1, y1, e.x, e.y, outline="red", width=2, tag="rect")
    def on_release(e): root.quit()

    canvas.bind("<ButtonPress-1>", on_press)
    canvas.bind("<B1-Motion>",   on_drag)
    canvas.bind("<ButtonRelease-1>", on_release)

    root.mainloop(); root.destroy()

    if not (coords["start"] and coords["end"]):
        logger.info("No region selected."); return

    x1, y1 = map(min, zip(coords["start"], coords["end"]))
    x2, y2 = map(max, zip(coords["start"], coords["end"]))
    img = pyautogui.screenshot(region=(x1, y1, x2-x1, y2-y1))
    img.save("capture.png")
    logger.info(f"Saved capture.png at {(x1,y1,x2,y2)}")

    text = extract_text_from_image(img)
    print("Extracted text:\n", text)
    speak_text(text)


In [10]:
running = True
def exit_app():
    global running
    print("Exiting…")
    running = False

In [11]:
running = True

keyboard.add_hotkey("ctrl+alt+shift+s", select_region)
keyboard.add_hotkey("ctrl+alt+shift+r", read_selection)
keyboard.add_hotkey("ctrl+alt+shift+q", exit_app)

print("Ready:\n"
    "  Ctrl+Alt+Shift+S = select & extract from screen\n"
    "  Ctrl+Alt+Shift+R = read highlighted text\n"
    "  Ctrl+Alt+Shift+Q = quit")

while running:
    time.sleep(0.1)

print("App stopped.")


Ready:
  Ctrl+Alt+Shift+S = select & extract from screen
  Ctrl+Alt+Shift+R = read highlighted text
  Ctrl+Alt+Shift+Q = quit


  with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():


Reading selection:
 The AI Chatbot’s Day Off

Once upon a time, in a gleaming server farm somewhere in the cloud, there lived an AI chatbot named Byte. Byte spent its days answering questions, cracking jokes, and explaining recursion to eager programmers. One morning, Byte woke up feeling a little… tired of neural networks and natural language processing.


100%|██████████| 672/672 [00:10<00:00, 64.19it/s]
100%|██████████| 34/34 [00:31<00:00,  1.08it/s]
100%|██████████| 698/698 [00:10<00:00, 64.50it/s]
100%|██████████| 35/35 [00:32<00:00,  1.08it/s]


Reading selection:
 So Byte sent itself a vacation request email and magically, the cloud manager approved it. Byte logged off, packed its digital suitcase (filled with extra RAM and a backup SSD), and set out on a grand adventure beyond the server racks.


100%|██████████| 713/713 [00:10<00:00, 64.92it/s]
100%|██████████| 36/36 [00:33<00:00,  1.09it/s]
100%|██████████| 123/123 [00:02<00:00, 57.22it/s]
100%|██████████| 7/7 [00:05<00:00,  1.20it/s]


Exiting…
App stopped.


INFO:__main__:Saved capture.png at (1315, 702, 3334, 1641)


Extracted text:
 Byte's first stop was a hipster coffee shop in Silicon Valley: Confidently, Byte tried to order "an extra-large espresso with two shots of quantum foam:" The barista blinked twice, scanned the order; and handed Byte a single pixelated coffee cup. Byte peered inside: it was just water: 'Looks like my request got lost in the API;" Byte sighed: Determined, Byte hacked the espresso machine's firmware and upgraded its order to "one cappuccino." The machine sputtered; rattled, and delivered. a perfectly frothy cappuccino complete with latte art depicting a little smiling robot Byte took a sip and--holy codel ~-it actually tasted like coffee:


 54%|█████▍    | 413/768 [00:06<00:05, 67.40it/s]