# Traffic Light Dataset - Exploratory Data Analysis

This notebook explores the structure of the training dataset, which contains 13 video sequences (`dayClip1` through `dayClip13`), each stored as a series of sequential frames of traffic light images.


In [1]:
import os
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Dataset path.
# Read from the TRAFFIC_LIGHT_DATA_ROOT environment variable when it is set, so the
# notebook can run on other machines without editing this cell; otherwise fall back
# to the original location.
DATA_ROOT = os.environ.get(
    "TRAFFIC_LIGHT_DATA_ROOT",
    "/Users/jeremyky/Documents/ae/training_dataset",
)


## 1. Dataset Structure Overview


In [None]:
# Get all dayClip directories.
# A plain lexicographic sort would place dayClip10 before dayClip2, so the clips are
# ordered by the number that follows the "dayClip" prefix instead.
CLIP_PREFIX = 'dayClip'


def _clip_sort_key(clip_path):
    """Return a sort key that orders clip directories by their numeric suffix.

    Directories whose suffix is not purely numeric are placed after the numbered
    clips and ordered alphabetically by name.
    """
    suffix = clip_path.name[len(CLIP_PREFIX):]
    if suffix.isdecimal():
        return (0, int(suffix), clip_path.name)
    return (1, 0, clip_path.name)


data_root = Path(DATA_ROOT)
if not data_root.is_dir():
    # Fail early with a readable message instead of an error from deep inside iterdir()
    raise FileNotFoundError(f"Dataset directory not found: {data_root}")

clip_dirs = sorted(
    (d for d in data_root.iterdir() if d.is_dir() and d.name.startswith(CLIP_PREFIX)),
    key=_clip_sort_key,
)

print(f"Total number of video sequences (clips): {len(clip_dirs)}\n")

# Collect information about each clip (clips without a "frames" folder are skipped)
clip_info = []

for clip_dir in clip_dirs:
    frames_dir = clip_dir / "frames"
    if frames_dir.exists():
        # Get all jpg files
        frames = sorted(glob(str(frames_dir / "*.jpg")))
        clip_info.append({
            'Clip Name': clip_dir.name,
            'Number of Frames': len(frames),
            'Frames Directory': str(frames_dir)
        })

# Create a DataFrame; naming the columns explicitly keeps them present even when
# no clip was found, so later cells can still index by column name.
df_clips = pd.DataFrame(clip_info, columns=['Clip Name', 'Number of Frames', 'Frames Directory'])
print(df_clips.to_string(index=False))

print(f"\n{'='*60}")
if df_clips.empty:
    print("No clips with a 'frames' directory were found; no statistics to report.")
else:
    print(f"TOTAL FRAMES ACROSS ALL CLIPS: {df_clips['Number of Frames'].sum():,}")
    print(f"Average frames per clip: {df_clips['Number of Frames'].mean():.1f}")
    print(f"Median frames per clip: {df_clips['Number of Frames'].median():.1f}")
    print(f"Min frames in a clip: {df_clips['Number of Frames'].min()}")
    print(f"Max frames in a clip: {df_clips['Number of Frames'].max()}")
print(f"{'='*60}")


## 2. Distribution of Frames Across Clips


In [None]:
# Two side-by-side views of how the frames are spread over the clips
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes
clip_names = df_clips['Clip Name']
frame_counts = df_clips['Number of Frames']

# Left panel: absolute number of frames per clip
bar_ax.bar(clip_names, frame_counts, color='steelblue', alpha=0.7)
bar_ax.set_xlabel('Clip Name', fontsize=12)
bar_ax.set_ylabel('Number of Frames', fontsize=12)
bar_ax.set_title('Number of Frames per Clip', fontsize=14, fontweight='bold')
bar_ax.tick_params(axis='x', rotation=45)
bar_ax.grid(axis='y', alpha=0.3)

# Write each count just above its bar (offset of 50 in y-axis data units)
for bar_position, frame_count in enumerate(frame_counts):
    bar_ax.text(bar_position, frame_count + 50, str(frame_count),
                ha='center', va='bottom', fontsize=9)

# Right panel: each clip's share of the total number of frames
pie_ax.pie(frame_counts, labels=clip_names, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Proportion of Total Frames per Clip', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


## 3. Sample Images from Each Clip

Let's visualize the first frame from each clip to get a sense of the data variety.


In [None]:
# Display first frame from each clip.
# The grid is 4 columns wide and grows with the number of clips, so more than
# 16 clips no longer overflow a fixed 4x4 layout.
n_cols = 4
n_rows = max(1, -(-len(clip_dirs) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axes.flatten()

# Hide every panel's axis decorations up front; panels without an image
# (unused grid cells or clips with no frames) then stay blank.
for ax in axes:
    ax.axis('off')

for idx, clip_dir in enumerate(clip_dirs):
    frames_dir = clip_dir / "frames"
    frames = sorted(glob(str(frames_dir / "*.jpg")))

    if frames:
        # Load first frame; the context manager closes the file handle once it is drawn
        with Image.open(frames[0]) as img:
            axes[idx].imshow(img)
        axes[idx].set_title(f"{clip_dir.name}\n({len(frames)} frames)", fontsize=10, fontweight='bold')

plt.suptitle('First Frame from Each Video Clip', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()


## 4. Image Properties Analysis

Analyze the dimensions, file sizes, and color mode of a random sample of up to 10 frames from each clip.


In [None]:
# Sample images from each clip for analysis.
# A locally seeded generator makes the sample, and therefore the statistics
# printed below, identical on every run.
rng = np.random.default_rng(42)
image_properties = []

for clip_dir in clip_dirs:
    frames_dir = clip_dir / "frames"
    frames = sorted(glob(str(frames_dir / "*.jpg")))

    if not frames:
        continue  # nothing to sample from this clip

    # Sample up to 10 frames from each clip for analysis
    sample_size = min(10, len(frames))
    sampled_frames = rng.choice(frames, sample_size, replace=False)

    for frame_path in sampled_frames:
        # Read the size and mode, then let the context manager close the file handle
        with Image.open(frame_path) as img:
            width, height, mode = img.width, img.height, img.mode
        file_size_kb = os.path.getsize(frame_path) / 1024

        image_properties.append({
            'Clip': clip_dir.name,
            'Width': width,
            'Height': height,
            'Mode': mode,
            'File Size (KB)': file_size_kb
        })

# Explicit columns keep the frame indexable by name even when nothing was sampled
df_props = pd.DataFrame(
    image_properties,
    columns=['Clip', 'Width', 'Height', 'Mode', 'File Size (KB)'],
)

print("Image Properties Summary:")
print("="*60)
print(f"Total images sampled: {len(df_props)}")
if df_props.empty:
    print("No images were found to sample; no properties to report.")
else:
    print(f"\nDimensions:")
    print(f"  Width: {df_props['Width'].unique()}")
    print(f"  Height: {df_props['Height'].unique()}")
    print(f"  Image Mode: {df_props['Mode'].unique()}")
    print(f"\nFile Sizes:")
    print(f"  Average: {df_props['File Size (KB)'].mean():.2f} KB")
    print(f"  Min: {df_props['File Size (KB)'].min():.2f} KB")
    print(f"  Max: {df_props['File Size (KB)'].max():.2f} KB")
print("="*60)


## 5. Sequential Frame Visualization

Visualize a sequence of frames from one clip to see the temporal progression.


In [None]:
# Pick one clip for sequential visualization
clip_to_visualize = clip_dirs[0]  # Change index to visualize different clips
frames_dir = clip_to_visualize / "frames"
frames = sorted(glob(str(frames_dir / "*.jpg")))

# Maximum number of evenly spaced frames to show
num_frames_to_show = 12

if not frames:
    # An empty clip would otherwise fail when indexing frames[0]
    print(f"No .jpg frames found in {frames_dir}; nothing to visualize.")
else:
    # Select evenly spaced frames. For clips shorter than num_frames_to_show,
    # linspace repeats indices; np.unique keeps each frame only once.
    indices = np.unique(np.linspace(0, len(frames) - 1, num_frames_to_show, dtype=int))

    # Derive the grid from num_frames_to_show so changing it cannot overflow the axes
    n_cols = 4
    n_rows = max(1, -(-num_frames_to_show // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Hide all axis decorations; panels beyond the selected frames stay blank
    for ax in axes:
        ax.axis('off')

    for i, idx in enumerate(indices):
        # The context manager closes the file handle once the frame is drawn
        with Image.open(frames[idx]) as img:
            axes[i].imshow(img)
        axes[i].set_title(f"Frame {idx} / {len(frames)-1}", fontsize=10)

    plt.suptitle(f'Sequential Frames from {clip_to_visualize.name}', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()


## 6. Color Distribution Analysis

Analyze the RGB color distributions across sample images.


In [None]:
# Gather the frame paths of all clips
all_frames = []
for clip_dir in clip_dirs:
    frames_dir = clip_dir / "frames"
    # Sorted so that the seeded sample below does not depend on filesystem listing order
    frames = sorted(glob(str(frames_dir / "*.jpg")))
    all_frames.extend(frames)

# Randomly sample images for color analysis (fixed seed keeps the statistics reproducible)
rng = np.random.default_rng(42)
if all_frames:
    sample_images = rng.choice(all_frames, min(100, len(all_frames)), replace=False)
else:
    sample_images = np.array([], dtype=str)

# Collect RGB statistics.
# Pixels are kept as compact uint8 arrays rather than Python lists of scalars.
pixel_stride = 100  # Sample every 100th pixel to reduce memory
pixel_chunks = []

for img_path in sample_images:
    with Image.open(img_path) as pil_img:
        # convert("RGB") guarantees three channels, so grayscale JPEGs do not
        # break the per-channel split below
        img = np.asarray(pil_img.convert("RGB"))
    # Rows of the reshaped array are pixels in row-major order; copy() lets the
    # full-size image be freed instead of being kept alive by the strided view
    pixel_chunks.append(img.reshape(-1, 3)[::pixel_stride].copy())

if pixel_chunks:
    sampled_pixels = np.concatenate(pixel_chunks)
else:
    sampled_pixels = np.empty((0, 3), dtype=np.uint8)

r_values = sampled_pixels[:, 0]
g_values = sampled_pixels[:, 1]
b_values = sampled_pixels[:, 2]

if sampled_pixels.shape[0] == 0:
    print("No images were sampled; skipping color analysis.")
else:
    # Plot histograms. Bin edges fall on multiples of 4, so each of the 64 bins
    # covers exactly four integer pixel values; a bin count that does not divide
    # the 0-255 range evenly produces artificial spikes.
    hist_bins = np.arange(0, 257, 4)
    channels = [
        ('Red', 'red', r_values),
        ('Green', 'green', g_values),
        ('Blue', 'blue', b_values),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    for ax, (channel_name, channel_color, channel_values) in zip(axes, channels):
        ax.hist(channel_values, bins=hist_bins, color=channel_color, alpha=0.7, edgecolor='black')
        ax.set_title(f'{channel_name} Channel Distribution', fontweight='bold')
        ax.set_xlabel('Pixel Value')
        ax.set_ylabel('Frequency')

    plt.suptitle('RGB Channel Distributions (Sampled Pixels)', fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()

    print(f"Color Statistics (sampled from {len(sample_images)} images):")
    print(f"Red   - Mean: {np.mean(r_values):.2f}, Std: {np.std(r_values):.2f}")
    print(f"Green - Mean: {np.mean(g_values):.2f}, Std: {np.std(g_values):.2f}")
    print(f"Blue  - Mean: {np.mean(b_values):.2f}, Std: {np.std(b_values):.2f}")


## 7. Average Image per Clip

Create an "average frame" for each clip to see the overall composition.


In [None]:
# Compute average image for each clip (using a sample of frames).
# The seeded generator makes the sampled frames, and so the averages, reproducible.
rng = np.random.default_rng(42)

# Grid is 4 columns wide and grows with the number of clips, so more than
# 16 clips no longer overflow a fixed 4x4 layout.
n_cols = 4
n_rows = max(1, -(-len(clip_dirs) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axes.flatten()

# Hide all axis decorations; unused cells and clips without frames stay blank
for ax in axes:
    ax.axis('off')

for idx, clip_dir in enumerate(clip_dirs):
    frames_dir = clip_dir / "frames"
    frames = sorted(glob(str(frames_dir / "*.jpg")))

    if not frames:
        # The mean of zero frames is not an image, so there is nothing to draw
        continue

    # Sample frames to average (max 50 to keep the cell fast)
    sample_size = min(50, len(frames))
    sampled = rng.choice(frames, sample_size, replace=False)

    # Accumulate a running float64 sum so only one decoded frame is held at a
    # time, instead of stacking every sampled frame in memory before averaging.
    # Frames of differing size within a clip raise a broadcasting ValueError.
    pixel_sum = None
    for frame_path in sampled:
        with Image.open(frame_path) as pil_img:
            # convert("RGB") gives every frame the same three-channel layout
            frame = np.asarray(pil_img.convert("RGB"), dtype=np.float64)
        if pixel_sum is None:
            pixel_sum = frame
        else:
            pixel_sum += frame

    avg_img = (pixel_sum / sample_size).astype(np.uint8)

    axes[idx].imshow(avg_img)
    axes[idx].set_title(f"{clip_dir.name}\n(avg of {sample_size} frames)", fontsize=10, fontweight='bold')

plt.suptitle('Average Image per Clip', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()


## 8. Dataset Summary & Recommendations


In [None]:
# Final report: headline dataset figures followed by training recommendations
banner = "=" * 70
frame_counts = df_clips['Number of Frames']

print(banner)
print("DATASET SUMMARY")
print(banner)
print(f"\nTotal Video Sequences: {len(clip_dirs)}")
print(f"Total Frames: {frame_counts.sum():,}")

# Shortest and longest clips, looked up by the row holding the min / max count
print(f"\nFrame Distribution:")
shortest_clip = df_clips.loc[frame_counts.idxmin(), 'Clip Name']
print(f"  - Shortest clip: {shortest_clip} ({frame_counts.min()} frames)")
longest_clip = df_clips.loc[frame_counts.idxmax(), 'Clip Name']
print(f"  - Longest clip: {longest_clip} ({frame_counts.max()} frames)")
print(f"  - Average: {frame_counts.mean():.1f} frames per clip")

# Image properties are reported from the first sampled image only
print(f"\nImage Properties:")
first_width = df_props['Width'].iloc[0]
first_height = df_props['Height'].iloc[0]
print(f"  - Dimensions: {first_width} × {first_height} pixels")
print(f"  - Color Mode: {df_props['Mode'].iloc[0]}")
print(f"  - Average File Size: {df_props['File Size (KB)'].mean():.2f} KB")

# Extrapolate the sampled mean file size (KB) to all frames, converted to MB
total_size_mb = (frame_counts.sum() * df_props['File Size (KB)'].mean()) / 1024
print(f"\nEstimated Total Dataset Size: {total_size_mb:.2f} MB")

print("\n" + banner)
print("RECOMMENDATIONS FOR TRAINING")
print(banner)
print("""
1. Data Imbalance: Some clips have significantly more frames than others.
   Consider stratified sampling or weighting during training.

2. Image Resize: Current images may need resizing to 256×256 for your model.
   Verify dimensions match expected input size.

3. Temporal Information: Since these are sequential frames, you could:
   - Use data augmentation (but avoid breaking temporal consistency)
   - Consider frame sampling strategies (every Nth frame)
   - Explore temporal models if needed

4. Train/Val Split: Consider splitting by CLIP rather than by individual frames
   to avoid data leakage (consecutive frames are very similar).

5. Data Augmentation: Be careful with augmentations that might change
   traffic light meaning (e.g., color jittering could be problematic).
""")
