<a href="https://colab.research.google.com/github/kifjj/altinha-play/blob/main/alta_infer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ultralytics

from ultralytics import YOLO



In [None]:
!pip install --upgrade sahi ultralytics supervision



In [None]:
import cv2
import numpy as np
import supervision as sv
from ultralytics import YOLO

# --- Input / output locations ---
# Clip to analyse. Another clip that has been used with this notebook:
#   '/content/alta-champion-open-8s.mp4'
VIDEO_PATH = '/content/altinha-beach-green-mq.mp4'
MODEL_PATH = '/content/altinha_best.pt'
OUTPUT_PATH = '/content/altinha-beach-green-mq-BEST_ONLY.mp4'

# --- Detector thresholds (handed to the model call as conf= / iou=) ---
CONFIDENCE = 0.05  # confidence cut-off given to the model
IOU_NMS = 0.5      # IoU value given to the model for NMS

# --- Hit detection hyperparameters ---
# Smallest vertical span, in pixels, by which the middle sample must exceed
# the lower of its two neighbours to be accepted as a hit.
MIN_VERTICAL_AMPLITUDE = 3
# Two hits closer together than this many frames are counted as one.
MIN_FRAMES_BETWEEN_HITS = 8
# A run of more than this many frames without a detection resets the
# position history (e.g. 30 frames is 1 second at 30 fps).
GAP_RESET_FRAMES = 30
# Set True if you still want frame_XXX.png written.
# NOTE(review): not referenced by the code visible in this cell - confirm.
SAVE_FRAMES = False

# Load the detector weights from MODEL_PATH.
model = YOLO(MODEL_PATH)

# Annotators used when drawing the output video: a green box (thickness 2)
# for the single chosen ball, and a text label positioned with TOP_CENTER.
box_annotator = sv.BoxAnnotator(
    color=sv.Color.from_hex("#00FF00"),
    thickness=2,
)
label_annotator = sv.LabelAnnotator(
    text_position=sv.Position.TOP_CENTER,
    text_thickness=2,
    text_scale=0.5,
)

# Metadata of the input clip (reused for the output sink and for converting
# frame indices to seconds) plus a generator yielding its frames.
video_info = sv.VideoInfo.from_video_path(VIDEO_PATH)
frames_generator = sv.get_video_frames_generator(VIDEO_PATH)
fps = video_info.fps

print(f"Processing {VIDEO_PATH} using BEST detection per frame...\n")

# --- Hit-detection state, mutated while the video is processed ---
# Frame indices at which a hit has been registered so far.
hit_frames = []
# Most recent (frame_idx, y_center) samples from frames that had a detection;
# the processing loop trims this to the last 3 entries.
last_positions = []
# Frame index of the latest frame with a detection (None until the first one).
last_detection_frame = None

# Minimum confidence the single best detection must reach to be accepted as
# the ball for a frame. Deliberately separate from CONFIDENCE (the looser
# cut-off handed to the model) so the two can be tuned independently.
MIN_BEST_CONFIDENCE = 0.08

# Per-frame console logging ("FRAME n: ..."). Set to False to keep only the
# "HIT detected" lines; True reproduces the full debug output.
PRINT_FRAME_DEBUG = True


def _find_hit(positions, min_amplitude):
    """Return ``(frame_idx, y, span)`` if the middle of three samples is a hit.

    ``positions`` is a sequence of ``(frame_idx, y_center)`` samples, oldest
    first. A hit is a strict local maximum of y at the middle sample (in
    pixel coordinates the ball goes down, then up again) whose height above
    the lower neighbour is at least ``min_amplitude``. Returns ``None`` when
    there are not exactly three samples or the middle one does not qualify.
    """
    if len(positions) != 3:
        return None
    (_, y_before), (mid_frame, y_mid), (_, y_after) = positions
    is_local_max = y_before < y_mid and y_after < y_mid
    span = y_mid - min(y_before, y_after)
    if is_local_max and span >= min_amplitude:
        return mid_frame, y_mid, span
    return None


def _draw_hit_counter(image, hit_count):
    """Draw a filled black box with green ``Hits: N`` text onto ``image``.

    The box is anchored at pixel (10, 10) and sized from the measured text
    plus 10 px padding on every side. ``image`` is modified in place.
    """
    hit_text = f"Hits: {hit_count}"
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.8
    thickness = 2

    (text_width, text_height), _ = cv2.getTextSize(
        hit_text, font, font_scale, thickness
    )

    pad_x, pad_y = 10, 10
    box_left, box_top = 10, 10
    box_right = box_left + text_width + 2 * pad_x
    box_bottom = box_top + text_height + 2 * pad_y

    cv2.rectangle(
        image,
        (box_left, box_top),
        (box_right, box_bottom),
        (0, 0, 0),     # black box
        thickness=-1,  # filled
    )
    cv2.putText(
        image,
        hit_text,
        # putText takes the bottom-left corner of the text as its origin.
        (box_left + pad_x, box_top + pad_y + text_height),
        font,
        font_scale,
        (0, 255, 0),  # green text
        thickness,
        cv2.LINE_AA,
    )


# Main pass: detect the ball in every frame, update the hit counter and write
# the annotated frame to OUTPUT_PATH. Frame indices are 1-based.
with sv.VideoSink(target_path=OUTPUT_PATH, video_info=video_info) as sink:
    n_frame = 0  # stays 0 if the video yields no frames
    for n_frame, frame in enumerate(frames_generator, start=1):
        results = model(
            frame,
            verbose=False,
            conf=CONFIDENCE,
            iou=IOU_NMS,
        )[0]
        detections = sv.Detections.from_ultralytics(results)

        # Keep at most one detection: the most confident one, and only if it
        # clears MIN_BEST_CONFIDENCE. Slicing keeps a Detections object.
        if len(detections) > 0:
            best_idx = int(np.argmax(detections.confidence))
            if float(detections.confidence[best_idx]) >= MIN_BEST_CONFIDENCE:
                detections = detections[best_idx:best_idx + 1]
            else:
                detections = detections[0:0]  # treat as "no detection"

        annotated_frame = frame.copy()

        if len(detections) > 0:
            conf = float(detections.confidence[0])
            x1, y1, x2, y2 = detections.xyxy[0].tolist()
            y_center = 0.5 * (y1 + y2)

            # After a long gap without detections, drop the history so that
            # trajectories are not connected across the gap.
            if (
                last_detection_frame is not None
                and n_frame - last_detection_frame > GAP_RESET_FRAMES
            ):
                last_positions.clear()
            last_detection_frame = n_frame

            # Record this sample and keep only the three most recent ones.
            last_positions.append((n_frame, y_center))
            del last_positions[:-3]

            hit = _find_hit(last_positions, MIN_VERTICAL_AMPLITUDE)
            if hit is not None:
                hit_frame, hit_y, hit_span = hit
                # Enforce a minimum spacing between hits so jitter around one
                # contact is not counted twice.
                if (
                    not hit_frames
                    or hit_frame - hit_frames[-1] >= MIN_FRAMES_BETWEEN_HITS
                ):
                    hit_frames.append(hit_frame)
                    t_sec = hit_frame / fps
                    print(
                        f"HIT detected at frame {hit_frame} "
                        f"(t ≈ {t_sec:.2f}s), y={hit_y:.1f}, span={hit_span:.1f}"
                    )

            if PRINT_FRAME_DEBUG:
                print(
                    f"FRAME {n_frame:4d}: best conf={conf:.3f}, "
                    f"bbox=({x1:.1f},{y1:.1f},{x2:.1f},{y2:.1f})"
                )

            # Draw the ball box and its confidence label.
            annotated_frame = box_annotator.annotate(
                scene=annotated_frame,
                detections=detections,
            )
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame,
                detections=detections,
                labels=[f"Ball {conf:.2f}"],
            )
        elif PRINT_FRAME_DEBUG:
            print(f"FRAME {n_frame:4d}: no detection")

        # Top-left HUD box with the running hit count.
        _draw_hit_counter(annotated_frame, len(hit_frames))

        sink.write_frame(annotated_frame)

# Final report: where the annotated video was written and how many hits
# were counted.
print(f"\nDone! Video saved to {OUTPUT_PATH}")
print(f"Estimated number of hits/passes: {len(hit_frames)}")

# One line per hit (frame index and approximate timestamp) for manual
# inspection; nothing is listed when no hit was found.
print("\nHit frames and timestamps:")
if hit_frames:
    print("\n".join(f"  frame {f}, t ≈ {f / fps:.2f}s" for f in hit_frames))


Processing /content/altinha-beach-green-mq.mp4 using BEST detection per frame...

FRAME    1: best conf=0.942, bbox=(340.6,55.0,359.5,74.2)
FRAME    2: best conf=0.944, bbox=(325.6,54.5,344.5,73.3)
FRAME    3: best conf=0.902, bbox=(311.6,54.6,329.4,73.8)
FRAME    4: best conf=0.951, bbox=(295.2,54.9,314.3,75.1)
FRAME    5: best conf=0.948, bbox=(279.9,56.4,298.8,75.8)
FRAME    6: best conf=0.772, bbox=(265.6,59.2,283.7,77.7)
FRAME    7: best conf=0.785, bbox=(248.6,62.3,268.7,81.3)
FRAME    8: best conf=0.840, bbox=(235.0,66.3,254.4,83.3)
FRAME    9: best conf=0.839, bbox=(219.7,70.6,239.3,87.9)
FRAME   10: best conf=0.924, bbox=(205.5,75.0,224.0,92.5)
FRAME   11: best conf=0.907, bbox=(190.8,80.5,209.1,98.7)
FRAME   12: best conf=0.895, bbox=(174.7,87.3,193.6,106.3)
FRAME   13: best conf=0.941, bbox=(159.7,95.0,179.5,113.1)
FRAME   14: best conf=0.943, bbox=(146.1,103.2,165.1,122.3)
FRAME   15: best conf=0.912, bbox=(131.4,113.1,151.1,132.8)
FRAME   16: best conf=0.925, bbox=(117.4,1

KeyboardInterrupt: 