<a href="https://colab.research.google.com/github/j00lee/SignLingo/blob/main/Resizing_the_Gloss_Set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resizing the Gloss Vocab Size

In [None]:
import os
import re
from collections import defaultdict

# === Step 1: Make Google Drive visible to this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# === Step 2: Dataset root and one sub-folder per split
base_path = '/content/drive/MyDrive/ASL Project/final_dataset'
splits = ['train', 'val', 'test']
# Maps each split name to its folder under base_path, in the order of `splits`.
split_dirs = dict(
    (name, os.path.join(base_path, name))
    for name in splits
)

# === Step 3: Gloss extraction function
def extract_cleaned_gloss(filename):
    """Extract the upper-cased gloss label from an image filename.

    The gloss is taken from the second '-'-separated segment of the name
    (e.g. '<prefix>-GLASS 3.jpg' -> 'GLASS'). A trailing variant number and
    surrounding whitespace are removed.

    Args:
        filename: Bare file name (not a full path) of the image.

    Returns:
        The cleaned gloss in upper case, or None (after printing a warning)
        when the name contains no '-' separator. The result can be an empty
        string if the segment holds nothing but a number; callers that test
        truthiness skip that case as well.
    """
    # Drop a trailing image extension regardless of case ('.jpg', '.JPG',
    # '.jpeg', '.png'); anchoring at the end leaves the rest of the name intact.
    basename = re.sub(r'\.(?:jpe?g|png)$', '', filename, flags=re.IGNORECASE)

    parts = basename.split('-')
    if len(parts) < 2:
        print(f"⚠️ Could not parse gloss from: {filename}")
        return None

    # Strip first so a trailing space cannot hide the variant number from the
    # end-anchored pattern below ('GLASS 3 ' -> 'GLASS 3' -> 'GLASS').
    gloss_raw = parts[1].strip()
    gloss_clean = re.sub(r'\s*\d+$', '', gloss_raw)
    return gloss_clean.strip().upper()

# === Step 4: Count images per gloss across splits
# gloss -> number of image files carrying that gloss, summed over all splits.
gloss_counts = defaultdict(int)

for split, path in split_dirs.items():
    print(f"📁 Scanning {split} folder...")
    # os.scandir returns the entry type together with the name, which avoids
    # a separate os.path.isfile() lookup for every file in the folder.
    with os.scandir(path) as entries:
        for entry in entries:
            if not entry.is_file():
                continue  # skip sub-folders and other non-file entries
            gloss = extract_cleaned_gloss(entry.name)
            if gloss:  # None / empty string means the name could not be parsed
                gloss_counts[gloss] += 1

# === Step 5: Rank glosses from most to fewest images
# Sorting on the negated count gives descending order; the sort is stable, so
# glosses with equal counts keep the order in which they were first counted.
sorted_counts = sorted(gloss_counts.items(), key=lambda pair: -pair[1])

# === Step 6: Preview the head of the ranking
print("\n🔢 Top 20 glosses by total image count:")
top_entries = sorted_counts[:20]
for gloss, count in top_entries:
    print(f"{gloss}: {count} images")

# === Step 7: Save to file
import pandas as pd

# One row per gloss, already ordered by descending image count.
df = pd.DataFrame(sorted_counts, columns=['Gloss', 'ImageCount'])
output_path = '/content/drive/MyDrive/ASL Project/gloss_freq_exploration/total_gloss_image_counts.csv'
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n✅ Saved gloss image counts to: {output_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📁 Scanning train folder...
📁 Scanning val folder...
📁 Scanning test folder...

🔢 Top 20 glosses by total image count:
SHAVE: 68 images
ENVELOPE: 63 images
STRAWBERRY: 60 images
BANANA: 60 images
HANDSOME: 59 images
ERASE: 59 images
NECKLACE: 58 images
FORK: 57 images
SANDWICH: 55 images
COOL: 55 images
FOLLOW: 55 images
GLOVES: 51 images
SHOULDER: 50 images
POP: 50 images
SCARF: 49 images
DOG: 48 images
CRY: 48 images
STAND: 45 images
PATIENT: 42 images
TOY: 42 images

✅ Saved gloss image counts to: /content/drive/MyDrive/ASL Project/gloss_freq_exploration/total_gloss_image_counts.csv


# Moving the "clean" and "dirty" images back to one folder

In [None]:
import os
import shutil
from tqdm import tqdm

# === Step 1: Make Google Drive visible to this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# === Step 2: Source and destination roots
base_path = '/content/drive/MyDrive/ASL Project/'
# Clean images, one sub-folder per split.
final_clean_dir = os.path.join(base_path, 'final_dataset')
# Holds the dirty_<split> folders.
dirty_dir = os.path.join(base_path, 'best_frames')
# Merged total_<split> folders are written into the same parent folder.
output_base = dirty_dir

# === Step 3: Create merged folders and copy from both sources
def _is_up_to_date(src, dst):
    """Return True when dst already looks like a finished copy of src.

    shutil.copy2 carries the modification time over to the copy, so a
    destination with the same size and (nearly) the same mtime is treated as
    already copied. A missing or partially written destination fails the check
    and is copied again.
    """
    try:
        dst_stat = os.stat(dst)
    except FileNotFoundError:
        return False
    src_stat = os.stat(src)
    # Allow sub-second drift in case the target filesystem stores timestamps
    # with less precision than the source.
    return (dst_stat.st_size == src_stat.st_size
            and abs(dst_stat.st_mtime - src_stat.st_mtime) < 1)


def _copy_images(src_dir, dst_dir, desc):
    """Copy every regular file of src_dir into dst_dir and return how many were handled.

    Files that are already present and up to date in dst_dir are counted but
    not copied again, so an interrupted run can be resumed by re-running the
    cell instead of starting from scratch.
    """
    handled = 0
    for fname in tqdm(os.listdir(src_dir), desc=desc):
        src = os.path.join(src_dir, fname)
        if not os.path.isfile(src):
            continue  # ignore sub-folders
        dst = os.path.join(dst_dir, fname)
        if not _is_up_to_date(src, dst):
            shutil.copy2(src, dst)
        handled += 1
    return handled


splits = ['train', 'val', 'test']
image_counts = {}  # split name -> number of source files handled for that split

for split in splits:
    clean_path = os.path.join(final_clean_dir, split)
    dirty_path = os.path.join(dirty_dir, f'dirty_{split}')
    merged_path = os.path.join(output_base, f'total_{split}')
    os.makedirs(merged_path, exist_ok=True)

    print(f"\n📂 Combining clean + dirty into: total_{split}/")

    # Clean images first, then dirty ones. If both folders contain a file with
    # the same name, the dirty copy replaces the clean one and the file is
    # counted twice, so the count can exceed the number of files on disk.
    count = _copy_images(clean_path, merged_path, f'Copying from final_dataset/{split}')
    count += _copy_images(dirty_path, merged_path, f'Copying from dirty_{split}')

    image_counts[split] = count
    print(f"✅ total_{split}: {count} images")

# === Step 4: Summary
print("\n📊 Combined image counts:")
for split in splits:
    print(f"total_{split}: {image_counts[split]} images")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

📂 Combining clean + dirty into: total_train/


Copying from final_dataset/train:   1%|          | 121/12149 [01:19<2:11:42,  1.52it/s]


KeyboardInterrupt: 