In [None]:
# CELL 1: Install dependencies
!pip install ultralytics supervision transformers[torch] accelerate decord datasets evaluate -q

In [None]:
# CELL 2: Import libraries
import os, cv2, csv
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from collections import defaultdict
from datetime import timedelta
from ultralytics import YOLO
from transformers import AutoImageProcessor, AutoModelForVideoClassification
from decord import VideoReader, cpu
import supervision as sv

In [None]:
from huggingface_hub import snapshot_download

# Download the shelf-segmentation repository (including best.pt) into ./shelf_model.
# The deprecated `local_dir_use_symlinks` argument is no longer passed: recent
# huggingface_hub releases ignore it and only need the destination `local_dir`.
snapshot_download(repo_id="cheesecz/shelf-segmentation", local_dir="shelf_model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

'/content/shelf_model'

In [None]:
from ultralytics import YOLO

# Sanity check: load the downloaded shelf weights and print the class-id -> name mapping.
model = YOLO("shelf_model/best.pt")
print(model.names)


{0: 'Shelf'}


In [None]:
# CELL 3: Load models
# Detector that full_video_analysis runs in tracking mode (with classes=[0]) to follow people.
person_model = YOLO('yolo11s.pt')
# Shelf model loaded from the weights downloaded into ./shelf_model.
shelf_model = YOLO("shelf_model/best.pt")
# Video-classification model and its image processor, both loaded from the same Hub repository.
action_model = AutoModelForVideoClassification.from_pretrained('haipradana/s-h-o-p-domain-adaptation')
image_processor = AutoImageProcessor.from_pretrained('haipradana/s-h-o-p-domain-adaptation')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the classifier to the chosen device and put it in evaluation mode.
action_model.to(device).eval()
# Class index -> label mapping used to turn predicted indices into action names.
id2label = action_model.config.id2label

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# CELL 4: Merge consecutive predictions
def merge_consecutive_predictions(preds, min_duration_frames=0):
    """Fuse back-to-back predictions that share the same label into single segments.

    Args:
        preds: sequence of dicts with at least 'start', 'end' and 'pred' keys,
            in the order they should be merged. The input dicts are not modified.
        min_duration_frames: segments whose ``end - start`` is below this value
            are dropped from the result.

    Returns:
        A list of segment dicts. Each segment is a copy of the first prediction
        of its run, with 'end' taken from the last prediction of that run.
        An empty or falsy ``preds`` yields an empty list.
    """
    if not preds:
        return []

    segments = []
    for item in preds:
        if segments and segments[-1]['pred'] == item['pred']:
            # Same label as the open segment: just push its end forward.
            segments[-1]['end'] = item['end']
        else:
            # Label changed (or first item): open a new segment from a copy.
            segments.append(item.copy())

    kept = []
    for seg in segments:
        if seg['end'] - seg['start'] >= min_duration_frames:
            kept.append(seg)
    return kept

In [None]:
# CELL 5: Segment shelf from a static image
def infer_shelf_segmentation(image_path):
    """Run the shelf model on one image and save the rendered result.

    The output file is written to the current working directory as
    ``result_shelf_<basename of image_path>``.

    Args:
        image_path: path of the image to analyse.

    Returns:
        The name of the saved result file.
    """
    predictions = shelf_model(image_path)
    output_name = f'result_shelf_{os.path.basename(image_path)}'
    first_prediction = predictions[0]
    first_prediction.save(filename=output_name)
    return output_name

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict

def _contiguous_runs(frames):
    """Collapse ascending, duplicate-free frame numbers into inclusive (start, end) runs."""
    runs = []
    for f in frames:
        if runs and f == runs[-1][1] + 1:
            runs[-1][1] = f
        else:
            runs.append([f, f])
    return [tuple(r) for r in runs]


def generate_rak_timeline(interaction_csv_path, tracks, shelf_boxes_per_frame, fps, output_path='timeline_rak.png'):
    """
    Draw a timeline of people's interactions with each shelf ("rak") and save it as a PNG.

    A person counts as being at a shelf on a frame when the centre of their
    bounding box lies inside that shelf's box on the same frame. Each shelf gets
    one row; every run of consecutive frames with at least one person at the
    shelf is drawn as one horizontal segment (x axis in seconds).

    Params:
        interaction_csv_path (str): CSV listing the valid shelf ids, in a
            `rak_id` column or, if that column is absent, a `shelf_id` column
            (the name written by `full_video_analysis`).
        tracks (dict): person id -> list of detections, each a dict with
            'frame' and 'bbox' (x1, y1, x2, y2).
        shelf_boxes_per_frame (dict): frame -> list of shelf entries. An entry is
            either a bare (x1, y1, x2, y2) box, whose id is derived as
            ``rak_<int(x1)>_<int(y1)>``, or a ``(shelf_id, (x1, y1, x2, y2))`` pair.
        fps (float): video frame rate, used to convert frames to seconds.
        output_path (str): path of the PNG file to write.

    Returns:
        str: `output_path`.
    """

    # Shelf ids that are allowed on the timeline (already filtered upstream).
    rak_df = pd.read_csv(interaction_csv_path)
    id_column = 'rak_id' if 'rak_id' in rak_df.columns else 'shelf_id'
    valid_raks = set(rak_df[id_column].tolist())

    # Frames with at least one person at each shelf. A set is used so that several
    # people at the same shelf on the same frame do not create duplicate entries,
    # which would otherwise split one continuous visit into many segments.
    rak_timeline = defaultdict(set)
    for dets in tracks.values():
        for d in dets:
            f = d['frame']
            x1, y1, x2, y2 = d['bbox']
            px, py = (x1 + x2) / 2, (y1 + y2) / 2
            for sb in shelf_boxes_per_frame.get(f, []):
                if len(sb) == 2:
                    # (shelf_id, box) pair
                    rak_id, (sx1, sy1, sx2, sy2) = sb
                else:
                    # bare box: the id is derived from its top-left corner
                    sx1, sy1, sx2, sy2 = sb
                    rak_id = f"rak_{int(sx1)}_{int(sy1)}"
                if rak_id not in valid_raks:
                    continue
                if sx1 <= px <= sx2 and sy1 <= py <= sy2:
                    rak_timeline[rak_id].add(f)

    # Draw one row per shelf, one line segment per run of consecutive frames.
    plt.figure(figsize=(12, max(4, len(rak_timeline) * 0.4)))
    try:
        for i, rak_id in enumerate(sorted(rak_timeline)):
            for start, end in _contiguous_runs(sorted(rak_timeline[rak_id])):
                plt.plot([start / fps, end / fps], [i, i], linewidth=6)
            # Row label, placed left of t=0 because the y ticks are hidden.
            plt.text(-1, i, rak_id, verticalalignment='center', fontsize=8)

        plt.xlabel('Time (seconds)')
        plt.title('Timeline Interaksi per Rak')
        plt.yticks([])
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close()

    return output_path


In [None]:
# # CELL 6: Main video processing with all features + rak filter + display ID
# def full_video_analysis(video_path, output_dir):
#     vr = VideoReader(video_path, ctx=cpu(0))
#     fps = vr.get_avg_fps()
#     H, W, _ = vr[0].shape
#     out_path = os.path.join(output_dir, 'video_output.mp4')
#     vw = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (W, H))
#     tracker = person_model.track(source=video_path, persist=True, tracker='bytetrack.yaml', classes=[0], stream=True)

#     tracks, raw_actions, heatmap_grid = defaultdict(list), defaultdict(list), np.zeros((20, 20))
#     shelf_boxes_per_frame = {}

#     for idx, result in enumerate(tracker):
#         frame = vr[idx].asnumpy()
#         frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
#         res_shelf = shelf_model(frame)
#         shelf_boxes = [b.xyxy[0].cpu().numpy() for b in res_shelf[0].boxes] if res_shelf[0].boxes else []
#         shelf_boxes_per_frame[idx] = shelf_boxes

#         if result.boxes.id is not None:
#             boxes = result.boxes.xyxy.cpu().numpy()
#             ids = result.boxes.id.int().cpu().tolist()
#             for box, pid in zip(boxes, ids):
#                 tracks[pid].append({'frame': idx, 'bbox': box})
#                 x, y = (box[0] + box[2])/2, (box[1] + box[3])/2
#                 gx, gy = min(int(x / W * 20), 19), min(int(y / H * 20), 19)
#                 heatmap_grid[gy, gx] += 1

#     # === FILTER RAK: hanya simpan rak yang muncul di minimal N frame
#     MIN_FRAME_RAK = 0
#     rak_counter = defaultdict(int)
#     for f, shelf_boxes in shelf_boxes_per_frame.items():
#         for sb in shelf_boxes:
#             sx1, sy1, sx2, sy2 = map(int, sb)
#             rak_id = f"rak_{int(sx1)}_{int(sy1)}"
#             rak_counter[rak_id] += 1
#     valid_rak_ids = {rak_id for rak_id, count in rak_counter.items() if count >= MIN_FRAME_RAK}

#     # === Action recognition
#     for pid, dets in tracks.items():
#         if len(dets) < 16: continue
#         for i in range(0, len(dets)-16+1, 8):
#             frames = vr.get_batch([d['frame'] for d in dets[i:i+16]]).asnumpy()
#             crops = [f[int(d['bbox'][1]):int(d['bbox'][3]), int(d['bbox'][0]):int(d['bbox'][2])] for f, d in zip(frames, dets[i:i+16])]
#             if not crops: continue
#             inputs = image_processor(crops, return_tensors='pt').to(device)
#             with torch.no_grad():
#                 out = action_model(**inputs)
#             pred = out.logits.argmax(-1).item()
#             raw_actions[pid].append({'start': dets[i]['frame'], 'end': dets[i+15]['frame'], 'pred': pred})

#     action_preds = {pid: merge_consecutive_predictions(plist, int(fps*0.4)) for pid, plist in raw_actions.items()}

#     # === Hitung interaksi ke rak
#     rak_interaksi = defaultdict(int)
#     for pid, dets in tracks.items():
#         for d in dets:
#             f = d['frame']
#             x1, y1, x2, y2 = d['bbox']
#             px, py = (x1+x2)/2, (y1+y2)/2
#             for sb in shelf_boxes_per_frame.get(f, []):
#                 sx1, sy1, sx2, sy2 = sb
#                 rak_id = f"rak_{int(sx1)}_{int(sy1)}"
#                 if rak_id not in valid_rak_ids:
#                     continue
#                 if sx1 <= px <= sx2 and sy1 <= py <= sy2:
#                     rak_interaksi[rak_id] += 1

#     # === Simpan rekap interaksi rak
#     pd.DataFrame(list(rak_interaksi.items()), columns=['rak_id', 'interaksi']).to_csv(
#         os.path.join(output_dir, 'rak_interaksi.csv'), index=False)
#     pd.DataFrame(sorted(rak_interaksi.items(), key=lambda x: -x[1]), columns=['rak_id', 'interaksi']).to_csv(
#         os.path.join(output_dir, 'rekomendasi_layout.csv'), index=False)

#     # === Save heatmap
#     plt.imshow(heatmap_grid, cmap='hot', interpolation='nearest')
#     plt.title('Heatmap of Visitor Presence')
#     plt.colorbar()
#     plt.tight_layout()
#     plt.savefig(os.path.join(output_dir, 'heatmap.png'))
#     plt.close()

#     # === Simpan log aksi
#     all_actions = []
#     for pid, acts in action_preds.items():
#         for a in acts:
#             all_actions.append([pid, a['start'], a['end'], id2label[a['pred']]])
#     pd.DataFrame(all_actions, columns=['id', 'start', 'end', 'action']).to_csv(
#         os.path.join(output_dir, 'action_log.csv'), index=False)
#     pd.DataFrame(pd.Series([x[3] for x in all_actions]).value_counts()).to_csv(
#         os.path.join(output_dir, 'action_summary.csv'))

#     # === Render video dengan rak & heatmap overlay
#     heatmap_annotator = sv.HeatMapAnnotator(position=sv.Position.BOTTOM_CENTER, opacity=0.3, radius=20, kernel_size=25)

#     for idx in range(len(vr)):
#         frame = vr[idx].asnumpy()
#         frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

#         shelf_result = shelf_model(frame_bgr)
#         if hasattr(shelf_result[0], "masks") and shelf_result[0].masks is not None:
#             mask_img = shelf_result[0].masks.data.cpu().numpy().sum(axis=0)
#             mask_img = (mask_img > 0).astype(np.uint8) * 100
#             mask_img = cv2.resize(mask_img, (frame_bgr.shape[1], frame_bgr.shape[0]))  # resize to match frame
#             mask_img = cv2.applyColorMap(mask_img, cv2.COLORMAP_JET)
#             mask_img = cv2.cvtColor(mask_img, cv2.COLOR_BGR2RGB)  # pastikan sama format warna
#             frame_bgr = cv2.addWeighted(frame_bgr, 1.0, mask_img, 0.4, 0)


#         # Rak overlay (filtered + labeled)
#         for sb in shelf_boxes_per_frame.get(idx, []):
#             x1, y1, x2, y2 = map(int, sb)
#             rak_id = f"rak_{x1}_{y1}"
#             if rak_id not in valid_rak_ids:
#                 continue
#             cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (255, 0, 0), 2)
#             cv2.putText(frame_bgr, rak_id, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

#         # Tracking & action label
#         current_tracks = [t for pid, dets in tracks.items() for t in dets if t['frame'] == idx]
#         for t in current_tracks:
#             x1, y1, x2, y2 = map(int, t['bbox'])
#             t_frame, t_bbox = t['frame'], t['bbox']
#             pid = next(
#                 (pid for pid, dets in tracks.items()
#                  if any(d['frame'] == t_frame and np.allclose(d['bbox'], t_bbox) for d in dets)),
#                 None
#             )
#             label = f"ID {pid}"
#             for a in action_preds.get(pid, []):
#                 if a['start'] <= idx <= a['end']:
#                     label += f" | {id2label[a['pred']]}"
#                     break
#             cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (0, 255, 0), 2)
#             cv2.putText(frame_bgr, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

#         # Overlay heatmap
#         detections = sv.Detections(
#             xyxy=np.array([t['bbox'] for t in current_tracks]),
#             confidence=np.ones(len(current_tracks)),
#             class_id=np.zeros(len(current_tracks))
#         )
#         frame_bgr = heatmap_annotator.annotate(scene=frame_bgr.copy(), detections=detections)

#         vw.write(frame_bgr)

#     vw.release()

#         # === Generate timeline visual interaksi per rak
#     generate_rak_timeline(
#         interaction_csv_path=os.path.join(output_dir, 'rak_interaksi.csv'),
#         tracks=tracks,
#         shelf_boxes_per_frame=shelf_boxes_per_frame,
#         fps=fps,
#         output_path=os.path.join(output_dir, 'timeline_rak.png')
#     )

#     return out_path

In [None]:
# CELL 6 : Main video processing -- with stable shelf_id + display
from shapely.geometry import box as shp_box   # ← untuk IoU cepat

def iou_xyxy(a, b):
    """Intersection-over-union of two axis-aligned boxes.

    Computed in closed form instead of building polygons, because this is called
    for every detected shelf against every known shelf on every frame.

    Args:
        a, b: boxes as (x1, y1, x2, y2). Corner order is normalised, so swapped
            corners describe the same rectangle.

    Returns:
        IoU in the range 0-1; 0 when the boxes do not overlap or when both
        boxes have zero area.
    """
    ax1, ax2 = min(a[0], a[2]), max(a[0], a[2])
    ay1, ay2 = min(a[1], a[3]), max(a[1], a[3])
    bx1, bx2 = min(b[0], b[2]), max(b[0], b[2])
    by1, by2 = min(b[1], b[3]), max(b[1], b[3])

    # Overlap extents; non-positive means the boxes are disjoint or only touch.
    inter_w = min(ax2, bx2) - max(ax1, bx1)
    inter_h = min(ay2, by2) - max(ay1, by1)
    inter = inter_w * inter_h if inter_w > 0 and inter_h > 0 else 0

    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union else 0

def _bbox_center(bbox):
    """Return the (cx, cy) centre of an (x1, y1, x2, y2) box."""
    x1, y1, x2, y2 = bbox
    return (x1 + x2) / 2, (y1 + y2) / 2


def _shelves_at(cx, cy, shelves):
    """Yield the ids of the (shelf_id, box) entries whose box contains the point (cx, cy)."""
    for sid, (sx1, sy1, sx2, sy2) in shelves:
        if sx1 <= cx <= sx2 and sy1 <= cy <= sy2:
            yield sid


def _crop_to_frame(img, bbox):
    """Cut `bbox` out of `img` after clamping the box to the image bounds."""
    h, w = img.shape[:2]
    x1, y1, x2, y2 = (int(v) for v in bbox)
    # Without clamping, a negative coordinate would be read as a from-the-end index.
    x1, x2 = max(0, x1), min(w, x2)
    y1, y2 = max(0, y1), min(h, y2)
    return img[y1:y2, x1:x2]


def full_video_analysis(video_path, output_dir):
    """Analyse a store video end to end and write all artefacts to `output_dir`.

    Steps:
      1. Track people with `person_model` and detect shelves with `shelf_model`
         on every frame; shelves keep a stable id ("shelf_<n>") across frames by
         IoU matching against the last box seen for each id.
      2. Classify each person's actions with `action_model` on sliding windows
         of 16 detections (stride 8), then merge consecutive equal predictions.
      3. Count, per shelf, the detections whose box centre lies inside the shelf.
      4. Write CSV reports and a presence heatmap image.
      5. Render the video with shelf boxes, person boxes/labels and a heat overlay.

    Files written to `output_dir`: video_output.mp4, rak_interaksi.csv,
    rekomendasi_layout.csv, heatmap.png, action_log.csv, action_summary.csv,
    action_shelf_log.csv, action_shelf_summary.csv.

    Relies on the notebook-level globals `person_model`, `shelf_model`,
    `action_model`, `image_processor`, `device`, `id2label`, and on the helpers
    `iou_xyxy` and `merge_consecutive_predictions`.

    Args:
        video_path: path of the input video.
        output_dir: directory for all outputs (created if missing).

    Returns:
        Path of the rendered video (`<output_dir>/video_output.mp4`).
    """
    os.makedirs(output_dir, exist_ok=True)

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    print(f"FPS video = {fps:.2f}")
    n_frames = len(vr)
    H, W, _ = vr[0].shape
    out_path = os.path.join(output_dir, 'video_output.mp4')

    tracker = person_model.track(source=video_path, persist=True,
                                 tracker='bytetrack.yaml', classes=[0], stream=True)

    tracks, raw_actions = defaultdict(list), defaultdict(list)
    dets_by_frame = defaultdict(list)          # frame idx -> detections, for O(1) lookup when rendering
    heatmap_grid = np.zeros((20, 20))

    shelf_boxes_per_frame = {}                 # frame idx -> [(shelf_id, xyxy)]
    shelf_last_box = {}                        # shelf_id -> last xyxy seen
    next_shelf_idx = 1
    IOU_TH = 0.5                               # a box that only shifted slightly is treated as the same shelf

    # ---------- PASS 1 : detection + tracking ----------
    for f_idx, result in enumerate(tracker):
        if f_idx >= n_frames:
            # The tracker stream yielded more frames than the reader can serve;
            # later stages index frames through `vr`, so stop here.
            break

        # decord decodes RGB, while Ultralytics interprets numpy input as BGR.
        frame_bgr = cv2.cvtColor(vr[f_idx].asnumpy(), cv2.COLOR_RGB2BGR)
        res_shelf = shelf_model(frame_bgr)
        raw_boxes = [b.xyxy[0].cpu().numpy() for b in res_shelf[0].boxes] if res_shelf[0].boxes else []

        assigned = []
        for box in raw_boxes:
            cur = tuple(map(int, box))
            # Find the known shelf whose last box overlaps this detection the most.
            best_iou, best_id = 0, None
            for known_id, prev in shelf_last_box.items():
                val = iou_xyxy(cur, prev)
                if val > best_iou:
                    best_iou, best_id = val, known_id
            if best_iou >= IOU_TH:
                sid = best_id
            else:
                sid = f"shelf_{next_shelf_idx}"
                next_shelf_idx += 1
            shelf_last_box[sid] = cur
            assigned.append((sid, cur))

        shelf_boxes_per_frame[f_idx] = assigned

        # people tracks (boxes.id is None when nothing is being tracked on this frame)
        if result.boxes.id is not None:
            boxes = result.boxes.xyxy.cpu().numpy()
            ids = result.boxes.id.int().cpu().tolist()
            for box, pid in zip(boxes, ids):
                det = {'frame': f_idx, 'bbox': box, 'pid': pid}
                tracks[pid].append(det)
                dets_by_frame[f_idx].append(det)
                cx, cy = _bbox_center(box)
                # Clamp on both sides so out-of-frame centres land in an edge cell.
                gx = min(max(int(cx / W * 20), 0), 19)
                gy = min(max(int(cy / H * 20), 0), 19)
                heatmap_grid[gy, gx] += 1

    # ---------- Action recognition ----------
    for pid, dets in tracks.items():
        if len(dets) < 16:
            continue
        for i in range(0, len(dets) - 15, 8):
            clip = dets[i:i + 16]
            imgs = vr.get_batch([d['frame'] for d in clip]).asnumpy()
            crops = [_crop_to_frame(img, d['bbox']) for img, d in zip(imgs, clip)]
            if any(c.size == 0 for c in crops):
                # A zero-area crop has no pixels to feed the classifier; skip this window.
                continue
            inp = image_processor(crops, return_tensors='pt').to(device)
            with torch.no_grad():              # inference only: do not build an autograd graph
                pred = action_model(**inp).logits.argmax(-1).item()
            raw_actions[pid].append({'start': clip[0]['frame'], 'end': clip[-1]['frame'], 'pred': pred})

    action_preds = {pid: merge_consecutive_predictions(v, int(fps * 0.4))
                    for pid, v in raw_actions.items()}

    # ---------- Count interactions per shelf ----------
    shelf_interaksi = defaultdict(int)
    for dets in tracks.values():
        for d in dets:
            cx, cy = _bbox_center(d['bbox'])
            for sid in _shelves_at(cx, cy, shelf_boxes_per_frame.get(d['frame'], [])):
                shelf_interaksi[sid] += 1

    pd.DataFrame(list(shelf_interaksi.items()),
                 columns=['shelf_id', 'interaksi']).to_csv(
                 os.path.join(output_dir, 'rak_interaksi.csv'), index=False)

    # ---------- Heatmap image ----------
    plt.imshow(heatmap_grid, cmap='hot', interpolation='nearest')
    plt.title('Heatmap of Visitor Presence')
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'heatmap.png'))
    plt.close()

    # ---------- Action recap (log + summary) ----------
    all_actions = []
    for pid, acts in action_preds.items():
        for a in acts:
            all_actions.append([pid, a['start'], a['end'], id2label[a['pred']]])

    pd.DataFrame(all_actions,
                 columns=['id', 'start', 'end', 'action']).to_csv(
                 os.path.join(output_dir, 'action_log.csv'), index=False)

    pd.DataFrame(pd.Series([row[3] for row in all_actions])
                 .value_counts()).to_csv(
                 os.path.join(output_dir, 'action_summary.csv'))

    # ---------- Action <-> Shelf mapping ----------
    action_shelf = []                          # detail rows
    shelf_action_counter = defaultdict(int)

    for pid, acts in action_preds.items():
        # frame -> this person's detection (first one wins), built once per person
        det_at = {}
        for d in tracks[pid]:
            det_at.setdefault(d['frame'], d)

        for seg in acts:
            act_label = id2label[seg['pred']]
            for f in range(seg['start'], seg['end'] + 1):
                det = det_at.get(f)
                if det is None:
                    continue
                cx, cy = _bbox_center(det['bbox'])
                # Only the first shelf containing the person is recorded per frame.
                sid = next(_shelves_at(cx, cy, shelf_boxes_per_frame.get(f, [])), None)
                if sid is not None:
                    action_shelf.append([pid, f, sid, act_label])
                    shelf_action_counter[(sid, act_label)] += 1

    # detail log
    pd.DataFrame(action_shelf,
                 columns=['pid', 'frame', 'shelf_id', 'action']).to_csv(
                 os.path.join(output_dir, 'action_shelf_log.csv'), index=False)

    # count summary (explicit columns so an empty result still gets a header row)
    pd.DataFrame(
        [{'shelf_id': sid, 'action': act, 'count': n}
         for (sid, act), n in shelf_action_counter.items()],
        columns=['shelf_id', 'action', 'count'],
    ).to_csv(os.path.join(output_dir, 'action_shelf_summary.csv'), index=False)

    # ---------- Layout recommendation (busiest -> quietest shelf) ----------
    pd.DataFrame(sorted(shelf_interaksi.items(),
                        key=lambda x: -x[1]),
                 columns=['shelf_id', 'interaksi']).to_csv(
                 os.path.join(output_dir, 'rekomendasi_layout.csv'), index=False)

    # ---------- Video with overlays ----------
    heatmap_ann = sv.HeatMapAnnotator(position=sv.Position.BOTTOM_CENTER,
                                      opacity=0.3, radius=20, kernel_size=25)

    vw = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (W, H))
    try:
        for f_idx in range(n_frames):
            frame_bgr = cv2.cvtColor(vr[f_idx].asnumpy(), cv2.COLOR_RGB2BGR)

            # draw shelves
            for sid, (x1, y1, x2, y2) in shelf_boxes_per_frame.get(f_idx, []):
                cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (255, 0, 0), 2)
                cv2.putText(frame_bgr, sid, (x1, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

            # draw persons
            cur_tracks = dets_by_frame.get(f_idx, [])
            for t in cur_tracks:
                x1, y1, x2, y2 = map(int, t['bbox'])
                pid = t['pid']
                label = f"ID {pid}"
                for a in action_preds.get(pid, []):
                    if a['start'] <= f_idx <= a['end']:
                        label += f" | {id2label[a['pred']]}"
                        break
                cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame_bgr, label, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # reshape keeps xyxy two-dimensional, (n, 4), even when nobody is in the frame
            xyxy = np.array([t['bbox'] for t in cur_tracks], dtype=np.float32).reshape(-1, 4)
            dets = sv.Detections(xyxy=xyxy,
                                 confidence=np.ones(len(cur_tracks)),
                                 class_id=np.zeros(len(cur_tracks), dtype=int))
            frame_bgr = heatmap_ann.annotate(scene=frame_bgr, detections=dets)
            vw.write(frame_bgr)
    finally:
        # Release the writer even if rendering fails, so the file handle is not leaked.
        vw.release()

    return out_path

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input video path used below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input video (on the mounted Drive) and the local directory that receives all outputs.
video_path = "/content/drive/MyDrive/datathon_2025/videos/multiperson.mp4"
output_dir = "/content/output"

# Run the full analysis
os.makedirs(output_dir, exist_ok=True)
# video_result holds the path returned by full_video_analysis (the rendered video).
video_result = full_video_analysis(video_path, output_dir)

In [None]:
# df_log = pd.read_csv(os.path.join(output_dir, "action_shelf_log.csv"))

# dwell_rows = []
# for (pid, shelf), grp in df_log.groupby(['pid', 'shelf_id']):
#     frames = sorted(grp['frame'].tolist())
#     start = frames[0]
#     prev  = frames[0]
#     for fr in frames[1:]:
#         if fr != prev + 1:          # terputus → segmen selesai
#             dwell_rows.append([pid, shelf, start, prev,
#                                prev - start + 1, (prev - start + 1)/fps])
#             start = fr
#         prev = fr
#     # segmen terakhir
#     dwell_rows.append([pid, shelf, start, prev,
#                        prev - start + 1, (prev - start + 1)/fps])

# pd.DataFrame(dwell_rows,
#              columns=['pid','shelf_id','start_frame','end_frame',
#                       'frames','seconds']).to_csv(
#              os.path.join(output_dir, 'dwell_time.csv'), index=False)


In [None]:
# from google.colab import files
# from PIL import Image

# # Upload file gambar dari lokal
# uploaded = files.upload()  # kamu bisa pilih file dari komputer

# # Ambil nama file pertama yang di-upload
# image_path = list(uploaded.keys())[0]

# # Jalankan fungsi segmentasi
# hasil_path = infer_shelf_segmentation(image_path)

# # Tampilkan hasilnya
# Image.open(hasil_path).show()
