<a href="https://colab.research.google.com/github/j00lee/SignLingo/blob/main/MS_Citizen_Data_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up Google Drive Access

In [None]:
# Mount Google Drive at /content/drive. The later cells read the source videos
# from, and write the extracted frames to, folders under /content/drive/MyDrive.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Processing the Videos

In [None]:
import cv2
import os
from multiprocessing import Pool
from tqdm import tqdm

In [None]:
# Source videos and the root folder that receives one sub-folder of frames per video
videos_path = '/content/drive/MyDrive/videos'
frames_output_path = '/content/drive/MyDrive/frames'

# Sampling stride: one frame is kept out of every `frame_skip` frames read
frame_skip = 20

# Make sure the output root exists before any worker writes into it
os.makedirs(frames_output_path, exist_ok=True)

# File names (not full paths) of the .mp4 entries directly inside videos_path
video_files = list(filter(lambda name: name.endswith('.mp4'), os.listdir(videos_path)))

# Process each video
def process_video(video_file):
    """Save every ``frame_skip``-th frame of one video as a JPEG.

    Frames are written to ``<frames_output_path>/<video name without extension>/``
    as ``frame_0000.jpg``, ``frame_0001.jpg``, ... numbered in the order they are
    saved. The function is meant to be used as a ``Pool`` worker: it never raises;
    any failure is reported on stdout as ``Failed processing <file>: <reason>``.

    Args:
        video_file: File name of the video, relative to the module-level
            ``videos_path``.

    Returns:
        None.
    """
    cap = None
    try:
        video_path = os.path.join(videos_path, video_file)

        # Create a folder for this video's frames
        video_name = os.path.splitext(video_file)[0]
        video_frames_folder = os.path.join(frames_output_path, video_name)
        os.makedirs(video_frames_folder, exist_ok=True)

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            # Previously this case silently produced an empty frames folder.
            print(f"Failed processing {video_file}: could not open video")
            return

        frame_idx = 0   # index of the frame just read from the video
        saved_idx = 0   # index used in the output file name

        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a read error): nothing more to extract
                break

            if frame_idx % frame_skip == 0:
                # Optional resize
                # frame = cv2.resize(frame, resize_shape)

                frame_filename = os.path.join(video_frames_folder, f'frame_{saved_idx:04d}.jpg')
                # imwrite signals failure through its return value, so turn a
                # falsy result into an error instead of silently losing the frame.
                if not cv2.imwrite(frame_filename, frame):
                    raise OSError(f"could not write {frame_filename}")
                saved_idx += 1

            frame_idx += 1

    except Exception as e:
        print(f"Failed processing {video_file}: {e}")

    finally:
        # Release the capture on every path; pool workers are reused for many
        # videos, so a handle leaked on error would accumulate.
        if cap is not None:
            cap.release()

# Size of the worker pool used for frame extraction
num_workers = 4

# Fan the videos out over the pool. Results arrive in completion order and are
# only consumed so that tqdm can advance once per finished video.
with Pool(processes=num_workers) as p:
    list(
        tqdm(
            p.imap_unordered(process_video, video_files),
            total=len(video_files),
        )
    )


100%|██████████| 83399/83399 [6:42:55<00:00,  3.45it/s]


# Making the train, val, test splits

In [None]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

# Where the extracted frames live and where the split dataset is assembled
frames_root = '/content/drive/MyDrive/frames'
dataset_root = '/content/drive/MyDrive/dataset'
splits = ['train', 'val', 'test']

# Ensure dataset/ exists, with one sub-folder per split
os.makedirs(dataset_root, exist_ok=True)
for split in splits:
    os.makedirs(os.path.join(dataset_root, split), exist_ok=True)

# Split definitions: one CSV per split, read from the 'Video file' column below
train_videos = pd.read_csv('/content/drive/MyDrive/splits/train.csv')
val_videos = pd.read_csv('/content/drive/MyDrive/splits/val.csv')
test_videos = pd.read_csv('/content/drive/MyDrive/splits/test.csv')

# A video's frames folder is named after the file name minus its extension
train_list = [os.path.splitext(name)[0] for name in train_videos['Video file'].tolist()]
val_list = [os.path.splitext(name)[0] for name in val_videos['Video file'].tolist()]
test_list = [os.path.splitext(name)[0] for name in test_videos['Video file'].tolist()]

# Flat work list of (video_name, split_name) pairs: train first, then val, then test
all_videos = [
    (video_name, split_name)
    for split_name, names in (('train', train_list), ('val', val_list), ('test', test_list))
    for video_name in names
]

# Function to move/copy a single video folder
def move_single_video(args):
    """Copy one video's frames folder into its split folder.

    Copies ``<frames_root>/<video_name>`` to
    ``<dataset_root>/<split_name>/<video_name>``. Designed as a ``Pool`` worker:
    it never raises; problems are reported on stdout.

    Args:
        args: ``(video_name, split_name)`` tuple.

    Returns:
        None.
    """
    video_name, split_name = args
    src = os.path.join(frames_root, video_name)
    dst = os.path.join(dataset_root, split_name, video_name)

    if not os.path.exists(src):
        print(f"⚠️ Warning: {src} does not exist!")
        return

    try:
        # dirs_exist_ok (Python 3.8+) makes the copy re-runnable: a destination
        # left behind by an earlier or interrupted run is completed/overwritten
        # instead of failing with FileExistsError.
        shutil.copytree(src, dst, dirs_exist_ok=True)  # or shutil.move(src, dst) if you want to move
    except Exception as e:
        print(f"❗ Error moving {video_name}: {e}")

# Pool size for the copy step
num_workers = 1  # You can tune this (4, 6, 8 depending on colab/mac resources)

# Hand every (video, split) pair to the pool; the results are consumed only so
# that tqdm advances once per finished folder.
with Pool(processes=num_workers) as p:
    list(
        tqdm(
            p.imap_unordered(move_single_video, all_videos),
            total=len(all_videos),
        )
    )

print("✅ Done splitting the dataset with multiprocessing!")


In [None]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

# === STEP 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# === STEP 2: Copy frames from Drive to Colab local disk (/content/frames_local)
print("📂 Copying frames from Drive to local disk...")

source_root = '/content/drive/MyDrive/frames'
dest_root = '/content/frames_local'

# Create destination folder if not exists (this also covers what the former
# `!mkdir -p` shell call did)
os.makedirs(dest_root, exist_ok=True)

# List all folders in frames/
frame_folders = os.listdir(source_root)

# Names of folders that were not (fully) copied, so a lost Drive mount cannot
# pass for a completed copy.
failed_folders = []

# Copy each folder one by one (with progress bar)
for position, folder in enumerate(tqdm(frame_folders, desc="Copying folders to local disk")):
    src_folder = os.path.join(source_root, folder)
    dst_folder = os.path.join(dest_root, folder)

    if not os.path.isdir(src_folder):
        if not os.path.isdir(source_root):
            # The source root itself is no longer reachable (e.g. the Drive
            # mount dropped). isdir() would return False for every remaining
            # entry, so stop instead of silently skipping them all.
            failed_folders.extend(frame_folders[position:])
            print(f"Drive mount lost while reaching {folder}; stopping early.")
            break
        # A plain file inside frames/: nothing to copy
        continue

    try:
        # dirs_exist_ok (Python 3.8+) lets a re-run complete folders that an
        # earlier, interrupted run copied only partially.
        shutil.copytree(src_folder, dst_folder, dirs_exist_ok=True)
    except Exception as e:
        failed_folders.append(folder)
        print(f"Error copying {folder}: {e}")

if failed_folders:
    print(
        f"{len(failed_folders)} of {len(frame_folders)} folders were not fully copied; "
        "remount Drive and re-run this cell to retry them."
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📂 Copying frames from Drive to local disk...


Copying folders to local disk:  29%|██▉       | 23989/83399 [01:44<00:46, 1268.48it/s]

Error copying 8035548352616286-SODA: [('/content/drive/MyDrive/frames/8035548352616286-SODA/frame_0003.jpg', '/content/frames_local/8035548352616286-SODA/frame_0003.jpg', '[Errno 107] Transport endpoint is not connected'), ('/content/drive/MyDrive/frames/8035548352616286-SODA/frame_0004.jpg', '/content/frames_local/8035548352616286-SODA/frame_0004.jpg', "[Errno 107] Transport endpoint is not connected: '/content/drive/MyDrive/frames/8035548352616286-SODA/frame_0004.jpg'"), ('/content/drive/MyDrive/frames/8035548352616286-SODA', '/content/frames_local/8035548352616286-SODA', "[Errno 107] Transport endpoint is not connected: '/content/drive/MyDrive/frames/8035548352616286-SODA'")]


Copying folders to local disk: 100%|██████████| 83399/83399 [01:45<00:00, 792.01it/s] 


In [None]:
# Reset the Drive mount: unmount it, then mount it again at the same path.
# NOTE(review): presumably added to recover from the "Transport endpoint is not
# connected" errors seen while copying frames from Drive — confirm.
drive.flush_and_unmount()
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# === STEP 3: Local working paths (frames copied from Drive, dataset built locally)
frames_root = '/content/frames_local'
dataset_root = '/content/dataset'  # Local directory for processing

# One destination folder per split
for split in ('train', 'val', 'test'):
    os.makedirs(os.path.join(dataset_root, split), exist_ok=True)

# === STEP 4: Read the split definitions from Drive
train_videos = pd.read_csv('/content/drive/MyDrive/splits/train.csv')
val_videos = pd.read_csv('/content/drive/MyDrive/splits/val.csv')
test_videos = pd.read_csv('/content/drive/MyDrive/splits/test.csv')

# A video's frames folder is named after the file name minus its extension
train_list = [os.path.splitext(name)[0] for name in train_videos['Video file'].tolist()]
val_list = [os.path.splitext(name)[0] for name in val_videos['Video file'].tolist()]
test_list = [os.path.splitext(name)[0] for name in test_videos['Video file'].tolist()]

# Flat work list of (video_name, split_name) pairs: train first, then val, then test
all_videos = [
    (video_name, split_name)
    for split_name, names in (('train', train_list), ('val', val_list), ('test', test_list))
    for video_name in names
]

# === STEP 5: Shared state for the copy workers
from multiprocessing import Value, Lock

# Integer counter of source folders that were not found, plus the lock the
# workers take before incrementing it
missing_counter = Value('i', 0)  # Shared counter across processes
counter_lock = Lock()

def move_single_video(args):
    """Copy one video's frames folder into its split folder and count misses.

    Copies ``<frames_root>/<video_name>`` to
    ``<dataset_root>/<split_name>/<video_name>``. When the source folder does
    not exist, a warning is printed and the module-level ``missing_counter`` is
    incremented while holding ``counter_lock``. Designed as a ``Pool`` worker:
    it never raises; copy errors are reported on stdout.

    Args:
        args: ``(video_name, split_name)`` tuple.

    Returns:
        None.
    """
    video_name, split_name = args
    src = os.path.join(frames_root, video_name)
    dst = os.path.join(dataset_root, split_name, video_name)

    if not os.path.exists(src):
        print(f"⚠️ Warning: {src} does not exist!")
        with counter_lock:
            missing_counter.value += 1
        return

    try:
        # dirs_exist_ok (Python 3.8+) makes the copy re-runnable: a destination
        # left behind by an earlier or interrupted run is completed/overwritten
        # instead of failing with FileExistsError.
        shutil.copytree(src, dst, dirs_exist_ok=True)
    except Exception as e:
        print(f"❗ Error moving {video_name}: {e}")

# Copy the folders into their splits with a small pool (source and destination
# are both on the local disk here)
print("🚀 Splitting dataset into train/val/test on local disk...")
num_workers = 2

# Results are consumed only so that tqdm advances once per finished folder
with Pool(processes=num_workers) as p:
    list(
        tqdm(
            p.imap_unordered(move_single_video, all_videos),
            total=len(all_videos),
        )
    )

# Report how many source folders the workers could not find
print(f"⚡ Total missing videos: {missing_counter.value}")

print("✅ Done splitting dataset locally!")

# === STEP 6: Zip each split folder
print("📦 Zipping train/val/test folders...")
zip_paths = {}
for split in ['train', 'val', 'test']:
    split_path = os.path.join(dataset_root, split)
    # make_archive appends ".zip" to base_name itself and returns the path of
    # the archive it created, so that returned path is what gets moved below.
    zip_path = shutil.make_archive(base_name=f'/content/{split}', format='zip', root_dir=split_path)
    zip_paths[split] = zip_path
    print(f"✅ Zipped {split}")

# === STEP 7: Create destination folder on Drive if needed
# Pure-Python calls are used instead of `!mkdir` / `!mv` so that a failure
# raises here rather than being followed by the success message below.
drive_zip_dir = '/content/drive/MyDrive/dataset_zips'
os.makedirs(drive_zip_dir, exist_ok=True)

# === STEP 8: Move zipped files back to Google Drive
print("⬆️ Uploading zipped splits back to Google Drive...")
for split, zip_path in zip_paths.items():
    shutil.move(zip_path, os.path.join(drive_zip_dir, f'{split}.zip'))

print("🎉 All done! Train/Val/Test splits are zipped and saved in Drive!")

🚀 Splitting dataset into train/val/test on local disk...


  0%|          | 0/83399 [00:00<?, ?it/s]








  0%|          | 11/83399 [00:00<12:39, 109.76it/s]









  0%|          | 32/83399 [00:00<08:25, 164.82it/s]







  0%|          | 49/83399 [00:00<09:20, 148.73it/s]









  0%|          | 65/83399 [00:00<09:18, 149.18it/s]









  0%|          | 81/83399 [00:00<09:16, 149.78it/s]









  0%|          | 98/83399 [00:00<08:56, 155.34it/s]









  0%|          | 114/83399 [00:00<09:22, 148.11it/s]








  0%|          | 130/83399 [00:00<09:22, 148.02it/s]












  0%|          | 148/83399 [00:00<09:09, 151.52it/s]








  0%|          | 164/83399 [00:01<09:11, 150.87it/s]








  0%|          | 183/83399 [00:01<08:44, 158.61it/s]










  0%|          | 202/83399 [00:01<08:22, 165.58it/s]









  0%|          | 221/83399 [00:01<08:04, 171.62it/s]












  0%|          | 239/83399 [00:01<08:03, 172.04it/s]









  0%|          | 257/83399 [00:01<08:05, 171.19it/s]









  0%|          | 275/83399 [00:01<07:59, 173.33it/s]










  0%|          | 293/83399 [00:01<08:59, 153.90it/s]









  0%|          | 310/83399 [00:01<08:49, 156.97it/s]








  0%|          | 332/83399 [00:02<08:03, 171.75it/s]










  0%|          | 353/83399 [00:02<07:41, 179.86it/s]












  0%|          | 373/83399 [00:02<07:42, 179.33it/s]










  0%|          | 396/83399 [00:02<07:17, 189.56it/s]









  0%|          | 416/83399 [00:02<07:42, 179.49it/s]









  1%|          | 435/83399 [00:02<07:44, 178.51it/s]











  1%|          | 454/83399 [00:02<07:51, 175.84it/s]








  1%|          | 472/83399 [00:02<08:00, 172.61it/s]









  1%|          | 492/83399 [00:02<07:42, 179.43it/s]









  1%|          | 511/83399 [00:03<08:18, 166.43it/s]








  1%|          | 529/83399 [00:03<08:07, 169.95it/s]










  1%|          | 551/83399 [00:03<07:35, 181.79it/s]










  1%|          | 570/83399 [00:03<07:40, 179.68it/s]









  1%|          | 589/83399 [00:03<08:39, 159.56it/s]








  1%|          | 606/83399 [00:03<08:35, 160.48it/s]









  1%|          | 627/83399 [00:03<08:10, 168.90it/s]










  1%|          | 645/83399 [00:03<08:48, 156.49it/s]








  1%|          | 666/83399 [00:04<08:06, 170.08it/s]












  1%|          | 684/83399 [00:04<08:38, 159.54it/s]









  1%|          | 701/83399 [00:04<08:43, 158.05it/s]










  1%|          | 718/83399 [00:04<08:38, 159.54it/s]











  1%|          | 735/83399 [00:04<08:37, 159.78it/s]









  1%|          | 752/83399 [00:04<09:19, 147.62it/s]










  1%|          | 771/83399 [00:04<08:51, 155.58it/s]








  1%|          | 793/83399 [00:04<08:00, 171.85it/s]









  1%|          | 811/83399 [00:04<08:15, 166.52it/s]









  1%|          | 831/83399 [00:05<08:01, 171.36it/s]








  1%|          | 850/83399 [00:05<07:51, 175.08it/s]









  1%|          | 868/83399 [00:05<08:01, 171.49it/s]











  1%|          | 886/83399 [00:05<07:55, 173.37it/s]










  1%|          | 910/83399 [00:05<07:08, 192.41it/s]










  1%|          | 933/83399 [00:05<06:46, 202.67it/s]










  1%|          | 954/83399 [00:05<07:23, 186.02it/s]










  1%|          | 973/83399 [00:05<07:36, 180.49it/s]









  1%|          | 992/83399 [00:05<08:12, 167.47it/s]










  1%|          | 1016/83399 [00:06<07:25, 184.86it/s]










  1%|          | 1036/83399 [00:06<07:23, 185.67it/s]










  1%|▏         | 1058/83399 [00:06<07:10, 191.41it/s]








  1%|▏         | 1078/83399 [00:06<07:05, 193.52it/s]








  1%|▏         | 1098/83399 [00:06<07:09, 191.45it/s]








  1%|▏         | 1118/83399 [00:06<07:28, 183.45it/s]










  1%|▏         | 1137/83399 [00:06<07:39, 179.20it/s]








  1%|▏         | 1156/83399 [00:06<08:09, 168.00it/s]










  1%|▏         | 1173/83399 [00:06<08:18, 165.09it/s]








  1%|▏         | 1190/83399 [00:07<08:16, 165.72it/s]








  1%|▏         | 1207/83399 [00:07<08:38, 158.47it/s]








  1%|▏         | 1224/83399 [00:07<08:30, 161.06it/s]









  1%|▏         | 1241/83399 [00:07<08:27, 161.74it/s]







  2%|▏         | 1260/83399 [00:07<08:05, 169.29it/s]









  2%|▏         | 1279/83399 [00:07<07:52, 173.80it/s]











  2%|▏         | 1297/83399 [00:07<07:54, 173.14it/s]









  2%|▏         | 1318/83399 [00:07<07:29, 182.79it/s]









  2%|▏         | 1337/83399 [00:07<08:10, 167.38it/s]










  2%|▏         | 1355/83399 [00:08<08:33, 159.75it/s]








  2%|▏         | 1373/83399 [00:08<08:23, 162.90it/s]









  2%|▏         | 1390/83399 [00:08<08:21, 163.57it/s]









  2%|▏         | 1407/83399 [00:08<08:25, 162.06it/s]








  2%|▏         | 1425/83399 [00:08<08:20, 163.72it/s]








  2%|▏         | 1442/83399 [00:08<08:47, 155.27it/s]








  2%|▏         | 1458/83399 [00:08<09:30, 143.59it/s]










  2%|▏         | 1476/83399 [00:08<09:11, 148.60it/s]






  2%|▏         | 1492/83399 [00:08<09:10, 148.74it/s]








  2%|▏         | 1510/83399 [00:09<08:45, 155.74it/s]









  2%|▏         | 1528/83399 [00:09<08:25, 161.91it/s]









  2%|▏         | 1545/83399 [00:09<08:24, 162.29it/s]










  2%|▏         | 1565/83399 [00:09<08:00, 170.32it/s]









  2%|▏         | 1583/83399 [00:09<07:57, 171.20it/s]









  2%|▏         | 1601/83399 [00:09<07:54, 172.24it/s]










  2%|▏         | 1619/83399 [00:09<08:06, 167.95it/s]







  2%|▏         | 1636/83399 [00:09<08:09, 167.18it/s]











  2%|▏         | 1655/83399 [00:09<07:54, 172.35it/s]









  2%|▏         | 1673/83399 [00:09<07:50, 173.61it/s]









  2%|▏         | 1693/83399 [00:10<07:32, 180.39it/s]









  2%|▏         | 1712/83399 [00:10<07:35, 179.41it/s]






  2%|▏         | 1731/83399 [00:10<07:27, 182.39it/s]











  2%|▏         | 1751/83399 [00:10<07:16, 186.94it/s]











  2%|▏         | 1770/83399 [00:10<07:22, 184.30it/s]







  2%|▏         | 1791/83399 [00:10<07:07, 191.05it/s]











  2%|▏         | 1811/83399 [00:10<07:39, 177.45it/s]









  2%|▏         | 1829/83399 [00:10<08:06, 167.62it/s]











  2%|▏         | 1846/83399 [00:10<08:15, 164.51it/s]










  2%|▏         | 1866/83399 [00:11<07:48, 174.07it/s]












  2%|▏         | 1886/83399 [00:11<07:42, 176.40it/s]








  2%|▏         | 1904/83399 [00:11<08:31, 159.43it/s]








  2%|▏         | 1923/83399 [00:11<08:18, 163.56it/s]










  2%|▏         | 1941/83399 [00:11<08:07, 167.01it/s]









  2%|▏         | 1958/83399 [00:11<08:11, 165.81it/s]









  2%|▏         | 1975/83399 [00:11<08:28, 160.10it/s]









  2%|▏         | 1992/83399 [00:11<09:00, 150.69it/s]











  2%|▏         | 2010/83399 [00:11<08:44, 155.22it/s]








  2%|▏         | 2031/83399 [00:12<08:06, 167.15it/s]











  2%|▏         | 2050/83399 [00:12<08:01, 169.04it/s]









  2%|▏         | 2070/83399 [00:12<07:42, 175.84it/s]










  3%|▎         | 2088/83399 [00:12<08:07, 166.73it/s]








  3%|▎         | 2106/83399 [00:12<08:02, 168.37it/s]








  3%|▎         | 2125/83399 [00:12<07:48, 173.54it/s]










  3%|▎         | 2143/83399 [00:12<08:02, 168.24it/s]









  3%|▎         | 2160/83399 [00:12<08:14, 164.26it/s]










  3%|▎         | 2177/83399 [00:12<08:16, 163.61it/s]








  3%|▎         | 2196/83399 [00:13<07:58, 169.63it/s]







  3%|▎         | 2214/83399 [00:13<08:01, 168.59it/s]









  3%|▎         | 2231/83399 [00:13<08:03, 168.04it/s]









  3%|▎         | 2252/83399 [00:13<07:36, 177.94it/s]










  3%|▎         | 2271/83399 [00:13<07:31, 179.62it/s]











  3%|▎         | 2289/83399 [00:13<08:02, 168.16it/s]







  3%|▎         | 2306/83399 [00:13<08:26, 160.05it/s]









  3%|▎         | 2323/83399 [00:13<08:31, 158.42it/s]









  3%|▎         | 2339/83399 [00:13<08:40, 155.84it/s]









  3%|▎         | 2359/83399 [00:14<08:12, 164.43it/s]









  3%|▎         | 2381/83399 [00:14<07:31, 179.41it/s]










  3%|▎         | 2401/83399 [00:14<07:29, 180.04it/s]








  3%|▎         | 2420/83399 [00:14<07:28, 180.73it/s]










  3%|▎         | 2439/83399 [00:14<07:50, 172.18it/s]










  3%|▎         | 2457/83399 [00:14<08:08, 165.61it/s]









  3%|▎         | 2474/83399 [00:14<08:05, 166.52it/s]








  3%|▎         | 2491/83399 [00:14<08:21, 161.33it/s]








  3%|▎         | 2508/83399 [00:14<08:17, 162.48it/s]











  3%|▎         | 2525/83399 [00:15<08:13, 163.87it/s]











  3%|▎         | 2547/83399 [00:15<07:36, 177.21it/s]










  3%|▎         | 2565/83399 [00:15<07:39, 176.04it/s]










  3%|▎         | 2583/83399 [00:15<08:15, 163.17it/s]








  3%|▎         | 2603/83399 [00:15<07:52, 171.13it/s]







  3%|▎         | 2621/83399 [00:15<07:59, 168.46it/s]








  3%|▎         | 2638/83399 [00:15<08:31, 157.85it/s]









  3%|▎         | 2655/83399 [00:15<08:31, 157.81it/s]







  3%|▎         | 2671/83399 [00:15<09:15, 145.32it/s]










  3%|▎         | 2687/83399 [00:16<09:12, 145.99it/s]









  3%|▎         | 2705/83399 [00:16<08:50, 152.00it/s]











  3%|▎         | 2727/83399 [00:16<08:04, 166.39it/s]









  3%|▎         | 2747/83399 [00:16<07:47, 172.46it/s]











  3%|▎         | 2765/83399 [00:16<07:43, 173.83it/s]







  3%|▎         | 2784/83399 [00:16<07:38, 175.72it/s]









  3%|▎         | 2802/83399 [00:16<08:12, 163.78it/s]











  3%|▎         | 2822/83399 [00:16<07:45, 172.97it/s]










  3%|▎         | 2840/83399 [00:16<08:02, 166.99it/s]









  3%|▎         | 2858/83399 [00:17<08:05, 165.85it/s]








  3%|▎         | 2877/83399 [00:17<07:55, 169.20it/s]










  3%|▎         | 2895/83399 [00:17<08:19, 161.17it/s]









  3%|▎         | 2913/83399 [00:17<08:11, 163.88it/s]








  4%|▎         | 2930/83399 [00:17<08:19, 160.96it/s]









  4%|▎         | 2947/83399 [00:17<08:35, 156.03it/s]












  4%|▎         | 2967/83399 [00:17<08:08, 164.54it/s]







  4%|▎         | 2985/83399 [00:17<07:57, 168.44it/s]











  4%|▎         | 3002/83399 [00:17<08:08, 164.50it/s]









  4%|▎         | 3024/83399 [00:18<07:32, 177.48it/s]










  4%|▎         | 3042/83399 [00:18<08:08, 164.50it/s]







  4%|▎         | 3059/83399 [00:18<08:14, 162.36it/s]









  4%|▎         | 3076/83399 [00:18<08:23, 159.54it/s]









  4%|▎         | 3093/83399 [00:18<08:33, 156.27it/s]









  4%|▎         | 3113/83399 [00:18<08:00, 166.94it/s]









  4%|▍         | 3130/83399 [00:18<08:27, 158.05it/s]









  4%|▍         | 3147/83399 [00:18<08:19, 160.73it/s]









  4%|▍         | 3166/83399 [00:18<07:56, 168.35it/s]










  4%|▍         | 3183/83399 [00:19<08:00, 167.02it/s]








  4%|▍         | 3200/83399 [00:19<09:07, 146.57it/s]








  4%|▍         | 3216/83399 [00:19<09:03, 147.54it/s]









  4%|▍         | 3232/83399 [00:19<08:59, 148.55it/s]








  4%|▍         | 3250/83399 [00:19<08:48, 151.56it/s]






  4%|▍         | 3267/83399 [00:19<08:32, 156.22it/s]









  4%|▍         | 3284/83399 [00:19<08:20, 160.09it/s]








  4%|▍         | 3302/83399 [00:19<08:04, 165.28it/s]








  4%|▍         | 3319/83399 [00:19<08:19, 160.20it/s]










  4%|▍         | 3336/83399 [00:20<08:20, 159.96it/s]








  4%|▍         | 3354/83399 [00:20<08:07, 164.10it/s]








  4%|▍         | 3371/83399 [00:20<08:15, 161.44it/s]








  4%|▍         | 3389/83399 [00:20<08:01, 166.18it/s]









  4%|▍         | 3406/83399 [00:20<07:58, 167.16it/s]








  4%|▍         | 3423/83399 [00:20<08:06, 164.51it/s]












  4%|▍         | 3440/83399 [00:20<08:36, 154.76it/s]









  4%|▍         | 3461/83399 [00:20<07:55, 168.00it/s]








  4%|▍         | 3479/83399 [00:20<07:49, 170.23it/s]










  4%|▍         | 3497/83399 [00:21<08:47, 151.50it/s]












  4%|▍         | 3519/83399 [00:21<07:55, 167.92it/s]






  4%|▍         | 3537/83399 [00:21<08:23, 158.60it/s]












  4%|▍         | 3555/83399 [00:21<08:11, 162.53it/s]








  4%|▍         | 3574/83399 [00:21<07:57, 167.22it/s]









  4%|▍         | 3591/83399 [00:21<08:35, 154.84it/s]









  4%|▍         | 3608/83399 [00:21<08:22, 158.68it/s]










  4%|▍         | 3625/83399 [00:21<09:01, 147.31it/s]








  4%|▍         | 3641/83399 [00:21<09:38, 137.83it/s]









  4%|▍         | 3656/83399 [00:22<09:48, 135.59it/s]









  4%|▍         | 3670/83399 [00:22<09:53, 134.44it/s]







  4%|▍         | 3685/83399 [00:22<09:39, 137.50it/s]









  4%|▍         | 3700/83399 [00:22<09:34, 138.75it/s]










  4%|▍         | 3716/83399 [00:22<09:12, 144.15it/s]







  4%|▍         | 3731/83399 [00:22<09:33, 138.95it/s]






  4%|▍         | 3746/83399 [00:22<09:37, 137.88it/s]












  5%|▍         | 3764/83399 [00:22<08:59, 147.57it/s]








  5%|▍         | 3779/83399 [00:22<09:25, 140.86it/s]








  5%|▍         | 3797/83399 [00:23<08:55, 148.58it/s]









  5%|▍         | 3815/83399 [00:23<08:44, 151.71it/s]






  5%|▍         | 3831/83399 [00:23<09:35, 138.27it/s]








  5%|▍         | 3850/83399 [00:23<08:45, 151.38it/s]







  5%|▍         | 3866/83399 [00:23<09:07, 145.33it/s]










  5%|▍         | 3883/83399 [00:23<08:44, 151.71it/s]









  5%|▍         | 3899/83399 [00:23<08:50, 149.77it/s]









  5%|▍         | 3915/83399 [00:23<09:22, 141.40it/s]








  5%|▍         | 3930/83399 [00:23<09:50, 134.50it/s]









  5%|▍         | 3949/83399 [00:24<08:59, 147.27it/s]










  5%|▍         | 3968/83399 [00:24<08:30, 155.52it/s]







  5%|▍         | 3990/83399 [00:24<07:47, 169.74it/s]










  5%|▍         | 4008/83399 [00:24<07:46, 170.35it/s]









  5%|▍         | 4026/83399 [00:24<08:15, 160.17it/s]










  5%|▍         | 4043/83399 [00:24<08:52, 149.03it/s]









  5%|▍         | 4059/83399 [00:24<09:02, 146.16it/s]








  5%|▍         | 4077/83399 [00:24<08:38, 153.10it/s]









  5%|▍         | 4095/83399 [00:24<08:14, 160.34it/s]








  5%|▍         | 4114/83399 [00:25<08:06, 163.07it/s]








  5%|▍         | 4131/83399 [00:25<08:41, 151.97it/s]









  5%|▍         | 4147/83399 [00:25<09:26, 139.94it/s]








  5%|▍         | 4162/83399 [00:25<09:45, 135.24it/s]










  5%|▌         | 4176/83399 [00:25<10:03, 131.34it/s]









  5%|▌         | 4190/83399 [00:25<10:43, 123.07it/s]






  5%|▌         | 4205/83399 [00:25<10:20, 127.58it/s]










  5%|▌         | 4222/83399 [00:25<09:31, 138.44it/s]










  5%|▌         | 4243/83399 [00:26<08:30, 155.05it/s]








  5%|▌         | 4265/83399 [00:26<07:40, 171.83it/s]










  5%|▌         | 4285/83399 [00:26<07:23, 178.34it/s]








  5%|▌         | 4304/83399 [00:26<07:22, 178.81it/s]










  5%|▌         | 4323/83399 [00:26<07:44, 170.27it/s]







  5%|▌         | 4341/83399 [00:26<07:40, 171.69it/s]








  5%|▌         | 4359/83399 [00:26<08:06, 162.37it/s]










  5%|▌         | 4376/83399 [00:26<09:43, 135.54it/s]






  5%|▌         | 4391/83399 [00:27<09:36, 137.13it/s]








  5%|▌         | 4407/83399 [00:27<09:12, 142.87it/s]








  5%|▌         | 4423/83399 [00:27<08:56, 147.08it/s]











  5%|▌         | 4442/83399 [00:27<08:30, 154.61it/s]







  5%|▌         | 4460/83399 [00:27<08:14, 159.53it/s]








  5%|▌         | 4477/83399 [00:27<09:30, 138.37it/s]









  5%|▌         | 4495/83399 [00:27<08:51, 148.36it/s]








  5%|▌         | 4511/83399 [00:27<10:01, 131.15it/s]








  5%|▌         | 4528/83399 [00:27<09:30, 138.27it/s]







  5%|▌         | 4547/83399 [00:28<08:49, 149.04it/s]









  5%|▌         | 4563/83399 [00:28<08:39, 151.81it/s]








  5%|▌         | 4579/83399 [00:28<08:54, 147.60it/s]







  6%|▌         | 4598/83399 [00:28<08:18, 158.04it/s]









  6%|▌         | 4615/83399 [00:28<08:08, 161.36it/s]








  6%|▌         | 4632/83399 [00:28<08:03, 163.01it/s]










  6%|▌         | 4649/83399 [00:28<08:04, 162.50it/s]









  6%|▌         | 4669/83399 [00:28<07:54, 165.99it/s]









  6%|▌         | 4689/83399 [00:28<07:30, 174.82it/s]









  6%|▌         | 4707/83399 [00:29<07:34, 172.96it/s]







  6%|▌         | 4725/83399 [00:29<07:52, 166.35it/s]









  6%|▌         | 4744/83399 [00:29<07:39, 171.05it/s]










  6%|▌         | 4763/83399 [00:29<07:34, 172.97it/s]







  6%|▌         | 4781/83399 [00:29<07:37, 171.76it/s]







  6%|▌         | 4799/83399 [00:29<08:01, 163.07it/s]








  6%|▌         | 4816/83399 [00:29<07:59, 163.83it/s]











  6%|▌         | 4833/83399 [00:29<08:23, 156.12it/s]







  6%|▌         | 4849/83399 [00:29<08:40, 150.92it/s]








  6%|▌         | 4870/83399 [00:30<07:50, 166.89it/s]











  6%|▌         | 4889/83399 [00:30<07:33, 173.04it/s]









  6%|▌         | 4907/83399 [00:30<07:53, 165.76it/s]










  6%|▌         | 4924/83399 [00:30<08:06, 161.32it/s]








  6%|▌         | 4945/83399 [00:30<07:38, 170.93it/s]









  6%|▌         | 4963/83399 [00:30<07:34, 172.52it/s]









  6%|▌         | 4981/83399 [00:30<08:08, 160.55it/s]








  6%|▌         | 5000/83399 [00:30<07:58, 163.93it/s]








  6%|▌         | 5017/83399 [00:30<07:58, 163.71it/s]







  6%|▌         | 5034/83399 [00:30<07:54, 165.21it/s]









  6%|▌         | 5051/83399 [00:31<07:55, 164.88it/s]










  6%|▌         | 5069/83399 [00:31<07:51, 166.07it/s]










  6%|▌         | 5086/83399 [00:31<07:50, 166.50it/s]








  6%|▌         | 5104/83399 [00:31<07:44, 168.47it/s]









  6%|▌         | 5121/83399 [00:31<08:01, 162.41it/s]








  6%|▌         | 5138/83399 [00:31<08:45, 148.92it/s]









  6%|▌         | 5159/83399 [00:31<08:01, 162.43it/s]












  6%|▌         | 5177/83399 [00:31<08:04, 161.46it/s]










  6%|▌         | 5198/83399 [00:31<07:41, 169.48it/s]







  6%|▋         | 5217/83399 [00:32<07:39, 170.23it/s]








  6%|▋         | 5235/83399 [00:32<08:20, 156.29it/s]













  6%|▋         | 5251/83399 [00:32<08:39, 150.32it/s]








  6%|▋         | 5270/83399 [00:32<08:14, 158.07it/s]










  6%|▋         | 5286/83399 [00:32<08:19, 156.29it/s]






  6%|▋         | 5303/83399 [00:32<08:22, 155.40it/s]









  6%|▋         | 5319/83399 [00:32<08:22, 155.33it/s]







  6%|▋         | 5335/83399 [00:32<08:43, 149.12it/s]








  6%|▋         | 5351/83399 [00:33<08:50, 147.23it/s]







  6%|▋         | 5366/83399 [00:33<09:51, 131.94it/s]









  6%|▋         | 5384/83399 [00:33<09:10, 141.79it/s]








  6%|▋         | 5402/83399 [00:33<08:40, 149.99it/s]









  6%|▋         | 5418/83399 [00:33<08:47, 147.93it/s]







  7%|▋         | 5433/83399 [00:33<08:58, 144.86it/s]









  7%|▋         | 5448/83399 [00:33<09:13, 140.91it/s]









  7%|▋         | 5464/83399 [00:33<08:58, 144.59it/s]









  7%|▋         | 5480/83399 [00:33<08:44, 148.68it/s]








  7%|▋         | 5495/83399 [00:34<08:53, 146.02it/s]








  7%|▋         | 5510/83399 [00:34<08:55, 145.43it/s]








  7%|▋         | 5527/83399 [00:34<08:41, 149.42it/s]









  7%|▋         | 5547/83399 [00:34<08:03, 160.91it/s]









  7%|▋         | 5564/83399 [00:34<09:03, 143.12it/s]






  7%|▋         | 5579/83399 [00:34<09:27, 137.11it/s]









  7%|▋         | 5593/83399 [00:34<10:18, 125.81it/s]









  7%|▋         | 5609/83399 [00:34<09:41, 133.75it/s]









  7%|▋         | 5623/83399 [00:34<10:28, 123.80it/s]










  7%|▋         | 5640/83399 [00:35<09:40, 133.95it/s]









  7%|▋         | 5656/83399 [00:35<09:14, 140.21it/s]








  7%|▋         | 5672/83399 [00:35<08:57, 144.73it/s]










  7%|▋         | 5690/83399 [00:35<08:26, 153.44it/s]









  7%|▋         | 5707/83399 [00:35<08:22, 154.50it/s]









  7%|▋         | 5723/83399 [00:35<08:59, 143.84it/s]








  7%|▋         | 5738/83399 [00:35<09:48, 131.91it/s]










  7%|▋         | 5754/83399 [00:35<09:31, 135.87it/s]







  7%|▋         | 5768/83399 [00:36<10:43, 120.58it/s]









  7%|▋         | 5786/83399 [00:36<09:35, 134.91it/s]










  7%|▋         | 5805/83399 [00:36<08:52, 145.69it/s]







  7%|▋         | 5821/83399 [00:36<09:28, 136.38it/s]






  7%|▋         | 5836/83399 [00:36<09:22, 137.83it/s]






  7%|▋         | 5852/83399 [00:36<09:15, 139.63it/s]





  7%|▋         | 5867/83399 [00:36<09:14, 139.87it/s]








  7%|▋         | 5882/83399 [00:36<10:39, 121.28it/s]







  7%|▋         | 5895/83399 [00:36<10:35, 121.93it/s]







  7%|▋         | 5908/83399 [00:37<10:59, 117.50it/s]










  7%|▋         | 5924/83399 [00:37<10:07, 127.52it/s]






  7%|▋         | 5938/83399 [00:37<10:20, 124.93it/s]








  7%|▋         | 5952/83399 [00:37<10:01, 128.82it/s]












  7%|▋         | 5976/83399 [00:37<08:10, 157.84it/s]









  7%|▋         | 5993/83399 [00:37<08:02, 160.58it/s]







  7%|▋         | 6010/83399 [00:37<08:22, 154.13it/s]










  7%|▋         | 6026/83399 [00:37<09:08, 141.14it/s]








  7%|▋         | 6043/83399 [00:37<08:41, 148.40it/s]







  7%|▋         | 6060/83399 [00:38<08:21, 154.07it/s]








  7%|▋         | 6078/83399 [00:38<08:00, 161.04it/s]









  7%|▋         | 6095/83399 [00:38<07:53, 163.26it/s]









  7%|▋         | 6112/83399 [00:38<07:49, 164.76it/s]












  7%|▋         | 6129/83399 [00:38<08:32, 150.82it/s]








  7%|▋         | 6145/83399 [00:38<08:55, 144.37it/s]







  7%|▋         | 6160/83399 [00:38<09:24, 136.93it/s]








  7%|▋         | 6174/83399 [00:38<09:32, 134.93it/s]










  7%|▋         | 6192/83399 [00:38<08:49, 145.69it/s]










  7%|▋         | 6207/83399 [00:39<08:57, 143.58it/s]









  7%|▋         | 6224/83399 [00:39<08:36, 149.39it/s]








  7%|▋         | 6240/83399 [00:39<08:41, 147.93it/s]









  8%|▊         | 6255/83399 [00:39<09:04, 141.73it/s]








  8%|▊         | 6270/83399 [00:39<09:18, 137.98it/s]







  8%|▊         | 6284/83399 [00:39<09:19, 137.78it/s]







  8%|▊         | 6300/83399 [00:39<09:14, 139.10it/s]








  8%|▊         | 6314/83399 [00:39<10:21, 123.99it/s]






  8%|▊         | 6327/83399 [00:39<10:16, 124.96it/s]








  8%|▊         | 6340/83399 [00:40<10:43, 119.77it/s]








  8%|▊         | 6356/83399 [00:40<10:04, 127.48it/s]





  8%|▊         | 6369/83399 [00:40<10:02, 127.84it/s]









  8%|▊         | 6386/83399 [00:40<09:22, 136.96it/s]










  8%|▊         | 6402/83399 [00:40<08:58, 142.92it/s]








  8%|▊         | 6420/83399 [00:40<08:37, 148.74it/s]







  8%|▊         | 6435/83399 [00:40<08:54, 144.00it/s]









  8%|▊         | 6450/83399 [00:40<09:23, 136.44it/s]







  8%|▊         | 6464/83399 [00:40<09:32, 134.35it/s]






  8%|▊         | 6478/83399 [00:41<09:27, 135.64it/s]










  8%|▊         | 6495/83399 [00:41<08:52, 144.55it/s]







  8%|▊         | 6510/83399 [00:41<09:31, 134.51it/s]







  8%|▊         | 6525/83399 [00:41<09:20, 137.27it/s]








  8%|▊         | 6541/83399 [00:41<09:02, 141.71it/s]






  8%|▊         | 6556/83399 [00:41<08:56, 143.13it/s]








  8%|▊         | 6572/83399 [00:41<08:49, 145.14it/s]












  8%|▊         | 6592/83399 [00:41<08:17, 154.26it/s]











  8%|▊         | 6611/83399 [00:41<08:01, 159.37it/s]







  8%|▊         | 6627/83399 [00:42<08:10, 156.52it/s]








  8%|▊         | 6643/83399 [00:42<09:20, 136.95it/s]








  8%|▊         | 6658/83399 [00:42<09:19, 137.14it/s]






  8%|▊         | 6672/83399 [00:42<09:26, 135.33it/s]










  8%|▊         | 6690/83399 [00:42<08:45, 145.85it/s]








  8%|▊         | 6709/83399 [00:42<08:11, 155.97it/s]










  8%|▊         | 6727/83399 [00:42<07:56, 160.94it/s]









  8%|▊         | 6744/83399 [00:42<07:51, 162.52it/s]









  8%|▊         | 6761/83399 [00:42<07:47, 163.86it/s]









  8%|▊         | 6778/83399 [00:43<08:00, 159.60it/s]









  8%|▊         | 6795/83399 [00:43<08:23, 152.16it/s]








  8%|▊         | 6811/83399 [00:43<08:31, 149.78it/s]









  8%|▊         | 6827/83399 [00:43<08:40, 147.23it/s]








  8%|▊         | 6845/83399 [00:43<08:09, 156.24it/s]







  8%|▊         | 6863/83399 [00:43<07:52, 161.86it/s]






  8%|▊         | 6881/83399 [00:43<07:41, 165.94it/s]









  8%|▊         | 6898/83399 [00:43<07:38, 166.86it/s]









  8%|▊         | 6915/83399 [00:43<08:02, 158.42it/s]








  8%|▊         | 6932/83399 [00:44<08:00, 159.26it/s]








  8%|▊         | 6950/83399 [00:44<07:52, 161.81it/s]





  8%|▊         | 6967/83399 [00:44<08:41, 146.61it/s]












  8%|▊         | 6987/83399 [00:44<08:02, 158.51it/s]








  8%|▊         | 7004/83399 [00:44<08:04, 157.78it/s]







  8%|▊         | 7020/83399 [00:44<08:36, 147.78it/s]







  8%|▊         | 7036/83399 [00:44<08:41, 146.36it/s]






  8%|▊         | 7051/83399 [00:44<08:44, 145.64it/s]







  8%|▊         | 7071/83399 [00:44<08:02, 158.14it/s]











  9%|▊         | 7091/83399 [00:45<07:36, 167.00it/s]








  9%|▊         | 7108/83399 [00:45<08:14, 154.42it/s]









  9%|▊         | 7124/83399 [00:45<08:22, 151.81it/s]







  9%|▊         | 7140/83399 [00:45<09:01, 140.95it/s]








  9%|▊         | 7156/83399 [00:45<09:01, 140.89it/s]







  9%|▊         | 7171/83399 [00:45<10:20, 122.84it/s]








  9%|▊         | 7184/83399 [00:45<10:21, 122.60it/s]









  9%|▊         | 7205/83399 [00:45<08:46, 144.81it/s]










  9%|▊         | 7221/83399 [00:46<08:48, 144.21it/s]








  9%|▊         | 7236/83399 [00:46<09:15, 137.15it/s]








  9%|▊         | 7251/83399 [00:46<09:34, 132.65it/s]











  9%|▊         | 7273/83399 [00:46<08:09, 155.68it/s]






  9%|▊         | 7290/83399 [00:46<08:44, 145.10it/s]











  9%|▉         | 7310/83399 [00:46<07:59, 158.61it/s]









  9%|▉         | 7327/83399 [00:46<08:14, 153.96it/s]









  9%|▉         | 7343/83399 [00:46<08:49, 143.58it/s]







  9%|▉         | 7358/83399 [00:47<09:52, 128.28it/s]








  9%|▉         | 7372/83399 [00:47<10:21, 122.30it/s]









  9%|▉         | 7385/83399 [00:47<11:05, 114.30it/s]






  9%|▉         | 7397/83399 [00:47<11:47, 107.35it/s]






  9%|▉         | 7410/83399 [00:47<11:15, 112.56it/s]









  9%|▉         | 7429/83399 [00:47<09:33, 132.35it/s]










  9%|▉         | 7447/83399 [00:47<08:54, 142.17it/s]











  9%|▉         | 7464/83399 [00:47<08:42, 145.39it/s]







  9%|▉         | 7479/83399 [00:47<09:00, 140.53it/s]







  9%|▉         | 7494/83399 [00:48<09:44, 129.83it/s]







  9%|▉         | 7508/83399 [00:48<10:35, 119.47it/s]






  9%|▉         | 7521/83399 [00:48<10:33, 119.82it/s]








  9%|▉         | 7539/83399 [00:48<09:24, 134.34it/s]








  9%|▉         | 7557/83399 [00:48<08:44, 144.59it/s]











  9%|▉         | 7576/83399 [00:48<08:21, 151.05it/s]








  9%|▉         | 7595/83399 [00:48<07:49, 161.52it/s]









  9%|▉         | 7615/83399 [00:48<07:36, 165.98it/s]










  9%|▉         | 7632/83399 [00:48<07:36, 165.95it/s]











  9%|▉         | 7649/83399 [00:49<07:40, 164.54it/s]











  9%|▉         | 7666/83399 [00:49<07:43, 163.23it/s]








  9%|▉         | 7684/83399 [00:49<07:39, 164.62it/s]








  9%|▉         | 7701/83399 [00:49<07:38, 165.11it/s]








  9%|▉         | 7719/83399 [00:49<07:30, 168.05it/s]








  9%|▉         | 7736/83399 [00:49<07:54, 159.51it/s]





  9%|▉         | 7742/83399 [00:49<08:05, 155.89it/s]



Process ForkPoolWorker-2:
Process ForkPoolWorker-1:



KeyboardInterrupt: 

In [None]:
# Unmount Drive (flushing it first, per the API name) and mount it again at the same path.
# NOTE(review): presumably done to get a fresh Drive session after the
# interrupted frame-extraction run above - confirm this is still needed.
drive.flush_and_unmount()
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

# === Locations on the mounted Drive
frames_root = '/content/drive/MyDrive/frames'    # extracted frames, one folder per video
dataset_root = '/content/drive/MyDrive/dataset'  # destination of the train/val/test split


def _video_stems(split_frame):
    """Return the 'Video file' column of *split_frame* as names without their extension."""
    return [os.path.splitext(file_name)[0] for file_name in split_frame['Video file'].tolist()]


# Make sure every split has its destination folder
for split in ('train', 'val', 'test'):
    os.makedirs(os.path.join(dataset_root, split), exist_ok=True)

# === Split manifests (one CSV per split)
train_videos = pd.read_csv('/content/drive/MyDrive/splits/train.csv')
val_videos = pd.read_csv('/content/drive/MyDrive/splits/val.csv')
test_videos = pd.read_csv('/content/drive/MyDrive/splits/test.csv')

# === Per-split folder names (video file name minus ".mp4")
train_list = _video_stems(train_videos)
val_list = _video_stems(val_videos)
test_list = _video_stems(test_videos)

# === Work items: (video_name, split_name) pairs, train first, then val, then test
all_videos = [
    (video, split_name)
    for split_name, videos in (('train', train_list), ('val', val_list), ('test', test_list))
    for video in videos
]

# === Copy each video folder directly on Drive
# (move_single_video uses shutil.copytree, so the source frames are copied, not moved.)
# Shared state used to count source folders that could not be found.
from multiprocessing import Value, Lock

# Shared integer ('i' typecode) starting at 0. It is created before the Pool so
# that forked workers can update the same value the parent reads at the end.
missing_counter = Value('i', 0)
# Held around `missing_counter.value += 1`, which is a read-modify-write.
counter_lock = Lock()

def move_single_video(args):
    """Copy one video's frame folder from ``frames_root`` into its split folder.

    Despite the name, the folder is copied with ``shutil.copytree`` rather than
    moved, so the original frames stay under ``frames_root``.

    Args:
        args: ``(video_name, split_name)`` tuple. The two values are packed into
            a single argument so the function can be mapped with
            ``Pool.imap_unordered``.

    Returns:
        None. A failed copy is reported with ``print``; a missing source folder
        is reported and also increments the shared ``missing_counter``.
    """
    video_name, split_name = args
    src = os.path.join(frames_root, video_name)
    dst = os.path.join(dataset_root, split_name, video_name)

    if not os.path.exists(src):
        print(f"⚠️ Warning: {src} does not exist!")
        with counter_lock:
            missing_counter.value += 1
        return

    try:
        # dirs_exist_ok=True keeps the cell re-runnable: without it, copytree
        # raises FileExistsError for every destination created by an earlier
        # (possibly interrupted) run, so half-copied folders were never
        # completed. Files already present are simply copied over again.
        shutil.copytree(src, dst, dirs_exist_ok=True)
    except Exception as e:
        # Best effort: report the failure and let the pool continue with the rest.
        print(f"❗ Error moving {video_name}: {e}")

# Run the copies in a small worker pool; the worker count is kept low on
# purpose to avoid Drive throttling.
print("🚀 Splitting dataset into train/val/test directly on Drive...")
num_workers = 2  # Safe for Drive access

with Pool(num_workers) as pool:
    # imap_unordered yields one result per finished video; draining it through
    # tqdm is what advances the progress bar.
    progress = tqdm(
        pool.imap_unordered(move_single_video, all_videos),
        total=len(all_videos),
    )
    list(progress)

print(f"⚡ Total missing videos: {missing_counter.value}")
print("✅ Done splitting dataset directly on Drive!")


🚀 Splitting dataset into train/val/test directly on Drive...


 91%|█████████ | 76088/83399 [11:53:27<1:51:07,  1.10it/s]