In [2]:
# Mount Google Drive at /content/drive; every dataset / feature / output path
# used later in this notebook lives under that mount point (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.video import r3d_18, R3D_18_Weights
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import cv2
from tqdm import tqdm
from pathlib import Path
import numpy as np

In [3]:
# Sanity check: count every file found anywhere below the dataset root.
folder = r"/content/drive/MyDrive/project/Shop DataSet"
count = 0
for _dirpath, _dirnames, filenames in os.walk(folder):
    count += len(filenames)
print("Total files:", count)

Total files: 855


# Data Cleaning

## Data Object

In [4]:
#  Video Dataset
# -------------------------
class VideoDataset(Dataset):
    """Folder-per-class video dataset that yields a fixed number of frames per clip.

    Expected layout: ``root_dir/<class name>/<video file>``. Class names are the
    sorted sub-directory names of ``root_dir`` and a video's label is the index
    of its class in that sorted list.

    Args:
        root_dir: Directory containing one sub-directory per class.
        num_frames: Number of frames sampled (evenly spaced) from each video.
        train: Stored on the instance as ``self.train``; not used by this class.

    Each item is a tuple ``(video_tensor, label, path)`` where ``video_tensor``
    is the stacked, transformed frames ``[T, C, H, W]``, ``label`` is a 0-dim
    tensor and ``path`` is the video's file path.
    """

    # Recognised video extensions, compared case-insensitively.
    VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov')

    def __init__(self, root_dir, num_frames=16, train=True):
        self.root_dir = root_dir
        # Only real, non-hidden directories are classes. Stray files would crash
        # the listing below, and hidden folders such as `.ipynb_checkpoints`
        # would silently shift every label index.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(root_dir, entry))
        )
        self.videos = []
        self.num_frames = num_frames
        self.train = train

        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        for label, class_name in enumerate(self.classes):
            class_path = os.path.join(root_dir, class_name)
            # Sorted so that item order does not depend on the filesystem's
            # directory enumeration order.
            for file in sorted(os.listdir(class_path)):
                if file.lower().endswith(self.VIDEO_EXTENSIONS):
                    self.videos.append({
                        'path': os.path.join(class_path, file),
                        'label': label
                    })

        # NOTE(review): 256x256 resizing with 0.5 mean/std is not the
        # preprocessing preset published with torchvision's R3D-18 weights;
        # confirm this is intentional before reusing the extracted features
        # for anything beyond near-duplicate detection.
        self.transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                 std=[0.5, 0.5, 0.5])
        ])

    def __len__(self):
        """Return the number of videos discovered under ``root_dir``."""
        return len(self.videos)

    def read_video(self, path):
        """Decode every frame of ``path`` and return them as a list of RGB arrays.

        Returns an empty list when no frame can be read.
        """
        cap = cv2.VideoCapture(path)
        frames = []
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV decodes to BGR; PIL / torchvision expect RGB.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        finally:
            # Always free the decoder, even if a frame conversion fails.
            cap.release()
        return frames

    def sample_frames(self, frames):
        """Pick ``self.num_frames`` evenly spaced frames, or ``None`` if there are none.

        Videos shorter than ``num_frames`` repeat frames, because the evenly
        spaced positions are truncated to integer indices.
        """
        total = len(frames)
        if total == 0:
            return None
        indices = np.linspace(0, total - 1, self.num_frames, dtype=int)
        return [frames[i] for i in indices]

    def __getitem__(self, idx):
        """Load, sample and transform the video at ``idx``.

        Raises:
            ValueError: If no frame could be decoded from the video.
        """
        video_info = self.videos[idx]
        frames = self.read_video(video_info['path'])
        frames = self.sample_frames(frames)
        if frames is None:
            raise ValueError(f"No frames found in {video_info['path']}")
        frames = [self.transform(Image.fromarray(f)) for f in frames]
        video_tensor = torch.stack(frames)  # [T, C, H, W]
        label = torch.tensor(video_info['label'])
        return video_tensor, label, video_info['path']


In [9]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained R3D-18 used purely as a feature extractor: the classification
# head is swapped for an identity so the network returns the pooled embedding.
weights = R3D_18_Weights.DEFAULT
model = r3d_18(weights=weights)
model.fc = nn.Identity()  # remove classification layer
# Move to the chosen device and switch to inference mode in one step.
model = model.to(device).eval()

In [14]:
@torch.no_grad()
def extract_and_save_features(dataset, save_dir, batch_size=4, num_workers=4):
    """Run every video of ``dataset`` through ``model`` and save one feature file per video.

    Each feature is L2-normalised and written to
    ``save_dir/<path relative to dataset.root_dir>`` with the video suffix
    replaced by ``.pt``. Relies on the notebook-level ``model`` and ``device``.

    Args:
        dataset: Dataset yielding ``(video [T, C, H, W], label, path)`` and
            exposing ``root_dir``.
        save_dir: Directory that receives the ``.pt`` files (created if missing).
        batch_size: Videos per forward pass.
        num_workers: DataLoader worker processes (``0`` loads in the main process).
    """
    os.makedirs(save_dir, exist_ok=True)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; requesting it on a
        # CPU-only runtime just triggers a warning.
        pin_memory=(device.type == "cuda"),
    )

    for videos, _labels, paths in tqdm(loader, desc="Extracting video features"):
        videos = videos.to(device)
        # [B, T, C, H, W] -> [B, C, T, H, W], the layout passed to the model.
        videos = videos.permute(0, 2, 1, 3, 4)
        feats = F.normalize(model(videos), dim=1).cpu()  # [B, 512]

        for feat, path in zip(feats, paths):
            rel_path = Path(path).relative_to(dataset.root_dir)
            save_path = Path(save_dir) / rel_path.with_suffix(".pt")
            save_path.parent.mkdir(parents=True, exist_ok=True)
            # `feat` is a row view of the batch tensor; torch.save serialises the
            # whole underlying storage of a view, so clone to write only this row.
            torch.save(feat.clone(), save_path)



In [15]:
# Build the dataset over the raw videos, then write one feature file per video.
dataset = VideoDataset(root_dir="/content/drive/MyDrive/project/Shop DataSet")
extract_and_save_features(
    dataset=dataset,
    save_dir="/content/drive/MyDrive/project/features",
)

Extracting video features: 100%|██████████| 214/214 [09:53<00:00,  2.77s/it]


855

In [4]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil
import torch
import torch.nn.functional as F

In [38]:
# Where the per-video feature files (.pt) are read from.
FEATURES_DIR = Path(r"/content/drive/MyDrive/project/features")
# Root of the raw videos; feature paths are mapped back to files under it.
ORIGINAL_DATASET = Path(r"/content/drive/MyDrive/project/Shop DataSet")
# Destination of the de-duplicated train/validation copy.
OUTPUT_DIR = Path(r"/content/drive/MyDrive/project/clean Shop Dataset")
THRESHOLD = 0.999  # similarity threshold for duplicates
VAL_RATIO = 0.2   # 20% validation split

# Class sub-folder names processed by the cleaning loop.
CLASSES = ["shop lifters", "non shop lifters"]

In [39]:
def load_features(class_name):
    """Load every saved feature of one class as a row-normalised matrix.

    Args:
        class_name: Sub-folder of ``FEATURES_DIR`` to read (searched recursively
            for ``*.pt`` files).

    Returns:
        ``(feats, paths)``: ``feats`` is the stacked, L2-normalised features and
        ``paths[i]`` is the file that produced ``feats[i]``.

    Raises:
        FileNotFoundError: If no ``.pt`` file exists for the class.
    """
    class_dir = FEATURES_DIR / class_name
    # Sorted: rglob order is filesystem-dependent, and both the duplicate that
    # survives and the seeded train/validation split depend on this order.
    paths = sorted(class_dir.rglob("*.pt"))
    if not paths:
        raise FileNotFoundError(f"No .pt feature files found under {class_dir}")
    # map_location keeps loading working on CPU-only runtimes regardless of the
    # device the tensors were saved from.
    feats = [torch.load(p, map_location="cpu") for p in paths]
    feats = torch.stack(feats)
    feats = F.normalize(feats, dim=1)
    return feats, paths

def find_duplicates(feats, paths, threshold=0.95):
    """Greedily drop near-duplicate items and return the indices worth keeping.

    Rows are visited in order; each row that has not been dropped yet drops
    every other row whose similarity to it exceeds ``threshold``.

    Args:
        feats: 2-D feature matrix, one row per item (assumed L2-normalised by
            the caller so that the dot product is the cosine similarity).
        paths: Sequence aligned with ``feats``; only its length is used.
        threshold: Similarity above which two items count as duplicates.

    Returns:
        Sorted list of the indices that were not dropped.
    """
    similarity = feats.matmul(feats.T)  # pairwise dot products
    similarity.fill_diagonal_(0)        # an item is never its own duplicate
    dropped = set()

    for row in range(similarity.size(0)):
        if row not in dropped:
            matches = (similarity[row] > threshold).nonzero(as_tuple=True)[0]
            dropped.update(matches.tolist())

    keep_indices = []
    for index in range(len(paths)):
        if index not in dropped:
            keep_indices.append(index)

    print(f"number of duplicates {len(dropped)}, number of to keep videos {len(keep_indices)} ")
    return keep_indices

def copy_videos(video_paths, output_dir):
    """Copy videos into ``output_dir``, mirroring their layout under ``ORIGINAL_DATASET``.

    Files that already exist at the destination are left untouched.

    Args:
        video_paths: Paths located under ``ORIGINAL_DATASET``.
        output_dir: Root of the destination tree.
    """
    for video in video_paths:
        target = output_dir / video.relative_to(ORIGINAL_DATASET)
        target.parent.mkdir(parents=True, exist_ok=True)
        if target.exists():
            continue  # avoid overwriting
        shutil.copy2(video, target)

In [40]:
OUTPUT_DIR.mkdir(parents = True, exist_ok = True)

# Video suffixes the feature extraction accepted, in order of preference when
# several videos share one stem (".mp4" first, matching the previous behavior).
_VIDEO_SUFFIXES = (".mp4", ".avi", ".mov")


def _index_class_videos(class_name):
    """Map each video's suffix-less path (relative to ORIGINAL_DATASET) to the real file."""
    index = {}
    for candidate in sorted((ORIGINAL_DATASET / class_name).rglob("*")):
        suffix = candidate.suffix.lower()
        if suffix not in _VIDEO_SUFFIXES or not candidate.is_file():
            continue
        key = candidate.relative_to(ORIGINAL_DATASET).with_suffix("")
        current = index.get(key)
        if current is None or (
            _VIDEO_SUFFIXES.index(suffix) < _VIDEO_SUFFIXES.index(current.suffix.lower())
        ):
            index[key] = candidate
    return index


for class_name in CLASSES:
    print(f"\nProcessing class: {class_name}")
    feats, feat_paths = load_features(class_name)

    # Feature files mirror the dataset layout with the suffix swapped for ".pt",
    # so look the real video up by its suffix-less relative path instead of
    # assuming every video is a top-level ".mp4".
    video_index = _index_class_videos(class_name)
    video_paths = []
    missing = []
    for feat_path in feat_paths:
        key = feat_path.relative_to(FEATURES_DIR).with_suffix("")
        if key in video_index:
            video_paths.append(video_index[key])
        else:
            missing.append(feat_path)
    if missing:
        # Fail before anything is copied rather than half-way through the copy.
        raise FileNotFoundError(
            f"{len(missing)} feature file(s) in class '{class_name}' have no matching "
            f"video under {ORIGINAL_DATASET}, e.g. {missing[0]}"
        )

    # Find duplicates
    keep_indices = find_duplicates(feats, video_paths, threshold=THRESHOLD)
    clean_paths = [video_paths[i] for i in keep_indices]

    # Split into train / validation
    train_paths, val_paths = train_test_split(clean_paths, test_size=VAL_RATIO, random_state=42)

    # Copy to new clean dataset
    # NOTE: copy_videos never removes or overwrites existing files, so start
    # from an empty OUTPUT_DIR when the threshold or split changes; otherwise
    # files from an earlier run stay in place.
    copy_videos(train_paths, OUTPUT_DIR / "train")
    copy_videos(val_paths, OUTPUT_DIR / "validation")

print("\n✅ Done! Clean dataset created at:", OUTPUT_DIR)


Processing class: shop lifters
number of duplicates 76, number of to keep videos 248 

Processing class: non shop lifters
number of duplicates 253, number of to keep videos 278 

✅ Done! Clean dataset created at: /content/drive/MyDrive/project/clean Shop Dataset


In [19]:
from pathlib import Path

# Diagnostic: confirm each class has a feature folder and report how many
# feature files it holds.
FEATURES_DIR = Path(r"/content/drive/MyDrive/project/features")
for class_name in ("shop lifters", "non shop lifters"):
    class_dir = FEATURES_DIR / class_name
    print(class_dir, "exists:", class_dir.exists())
    pt_files = list(class_dir.rglob("*.pt"))
    print("Number of .pt files:", len(pt_files))

/content/drive/MyDrive/project/features/shop lifters exists: True
Number of .pt files: 324
/content/drive/MyDrive/project/features/non shop lifters exists: True
Number of .pt files: 531
