# Data Preparation for Sign Language Detection

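This notebook prepares the dataset for YOLO training: it parses the XML annotations, creates a train/validation split, writes the YOLO-format dataset, and saves the class names for later use.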

In [None]:
import sys
import os
# Put the parent directory on the import path so the `src` package imported
# below can be resolved.
# NOTE(review): the relative '../' assumes the notebook is run from a direct
# subdirectory of the project root — confirm.
sys.path.append('../')

from src.data.yolo_preprocessing import YOLODataProcessor

In [None]:
# Resolve every location relative to the parent of the working directory
BASE_DIR = os.path.abspath('../')

# Processed data: annotation files and their images
DATA_DIR = os.path.join(BASE_DIR, 'data', 'processed')
IMAGES_DIR = os.path.join(DATA_DIR, 'images')
ANNOTATIONS_DIR = os.path.join(DATA_DIR, 'Annotations')

# Model artifacts, including the directory reserved for the YOLO dataset
MODELS_DIR = os.path.join(BASE_DIR, 'data', 'models')
YOLO_DIR = os.path.join(MODELS_DIR, 'yolo_dataset')

# Ensure both artifact directories exist (no error if already present)
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(YOLO_DIR, exist_ok=True)

# Echo the resolved paths as a quick sanity check
print(
    f"Annotations directory: {ANNOTATIONS_DIR}",
    f"Images directory: {IMAGES_DIR}",
    f"Models directory: {MODELS_DIR}",
    sep="\n",
)

In [None]:
# Build the processor from the annotation and image directories
processor = YOLODataProcessor(ANNOTATIONS_DIR, IMAGES_DIR)

# Read the XML annotations; the call returns the annotation table and the
# collection of class names found
df, all_classes = processor.parse_xml_annotations()

# Summarise what was parsed: table shape, classes, then per-class counts
print(
    f"Dataset shape: {df.shape}",
    f"Classes found: {all_classes}",
    "\nClass distribution:",
    sep="\n",
)
print(df['class'].value_counts())

In [None]:
# Split the annotations into training and validation frames
# (test_size=0.2 — presumably a 20% validation share; confirm in the processor)
train_df, val_df = processor.create_train_val_split(df, test_size=0.2)

# Report the annotation count and the number of distinct images per split
print(
    f"Training set: {len(train_df)} annotations, {train_df['image_name'].nunique()} images",
    f"Validation set: {len(val_df)} annotations, {val_df['image_name'].nunique()} images",
    sep="\n",
)

In [None]:
import json

# Hand both splits to the processor, targeting YOLO_DIR, and keep the
# class mapping it returns
class_mapping = processor.prepare_yolo_dataset(train_df, val_df, all_classes, YOLO_DIR)

# Persist the class names in the models directory for later use
with open(os.path.join(MODELS_DIR, 'class_names.json'), 'w') as names_file:
    json.dump(all_classes, names_file)

# Final status report
print(
    f"\nYOLO dataset prepared in: {YOLO_DIR}",
    f"Class mapping: {class_mapping}",
    'Data preparation complete!',
    sep="\n",
)