In [1]:
import mss
import cv2
import numpy as np

def capture_screen():
    with mss.mss() as sct:
        monitor = sct.monitors[1]  # primary monitor
        img = np.array(sct.grab(monitor))
        img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
        return img


In [2]:
from ultralytics import YOLO

model = YOLO('yolov8n.pt')  # small general model (or a text-specific one)
img = capture_screen()
results = model.predict(img)



0: 416x640 1 tv, 1 laptop, 351.5ms
Speed: 69.2ms preprocess, 351.5ms inference, 5.4ms postprocess per image at shape (1, 3, 416, 640)


In [3]:
for box in results[0].boxes:
    x1, y1, x2, y2 = box.xyxy[0]  # coordinates
    conf = float(box.conf[0])
    cls = int(box.cls[0])
    print(f"Detected class {cls} at ({x1}, {y1}), ({x2}, {y2}) with conf {conf}")


Detected class 63 at (5.31024169921875, 16.26690673828125), (2867.127197265625, 1672.5948486328125) with conf 0.3313879072666168
Detected class 62 at (7.2861328125, 12.695663452148438), (2855.834228515625, 1707.6309814453125) with conf 0.27117815613746643


In [4]:
import easyocr
import pandas as pd
from tabulate import tabulate

reader = easyocr.Reader(['en'])

data = []  # store rows for the table

for box in results[0].boxes:
    x1, y1, x2, y2 = map(int, box.xyxy[0])
    roi = img[y1:y2, x1:x2]
    ocr_result = reader.readtext(roi)

    # EasyOCR returns: [(bbox, text, confidence), ...]
    for (bbox, text, conf) in ocr_result:
        # Compute the center of the bounding box for position
        xs = [p[0] for p in bbox]
        ys = [p[1] for p in bbox]
        center_x = int(sum(xs) / len(xs))
        center_y = int(sum(ys) / len(ys))

        data.append([text, center_x, center_y])

# Display as a formatted table
df = pd.DataFrame(data, columns=["Text Detected", "X-Axis", "Y-Axis"])
print(tabulate(df, headers="keys", tablefmt="fancy_grid", showindex=False))


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


╒════════════════════╤══════════╤══════════╕
│ Text Detected      │   X-Axis │   Y-Axis │
╞════════════════════╪══════════╪══════════╡
│ New folder         │      142 │       19 │
├────────────────────┼──────────┼──────────┤
│ Desktop            │      594 │      101 │
├────────────────────┼──────────┼──────────┤
│ New folder         │      795 │       97 │
├────────────────────┼──────────┼──────────┤
│ Search New folder  │     2286 │       99 │
├────────────────────┼──────────┼──────────┤
│ New                │      103 │      198 │
├────────────────────┼──────────┼──────────┤
│ [0   @             │      499 │      197 │
├────────────────────┼──────────┼──────────┤
│ Sort               │      876 │      197 │
├────────────────────┼──────────┼──────────┤
│ View               │     1054 │      198 │
├────────────────────┼──────────┼──────────┤
│ Details            │     2794 │      197 │
├────────────────────┼──────────┼──────────┤
│ Name               │      206 │      270 │
├─────────

In [5]:
while True:
    img = capture_screen()
    results = model.predict(img, conf=0.25, verbose=False)

    # Draw detections
    for box in results[0].boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Show image
    cv2.imshow("Screen Detection (press 'Q' to exit)", img)

    # Exit when pressing Q or q
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()
