In [2]:
from google.colab import userdata
import os

# Copy the Kaggle credentials returned by Colab's `userdata.get` into process
# environment variables of the same names, so the `kaggle` command launched
# below inherits them.
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')


# Download the dataset archive with the Kaggle CLI (the recorded output shows
# data-afterprocess-p2.zip being saved to /content).
! kaggle datasets download -d haitonthat/data-afterprocess-p2

Downloading data-afterprocess-p2.zip to /content
 99% 844M/857M [00:08<00:00, 108MB/s] 
100% 857M/857M [00:09<00:00, 98.9MB/s]


In [None]:
!unzip "/content/data-afterprocess-p2.zip"

In [5]:
import os
import csv

def count_videos(dataset_path):
    """Count the video files of every class in the 'train' and 'val' splits.

    The directory layout walked is ``<dataset_path>/<split>/<class_name>/<file>``.
    A split that is missing (or is not a directory) is skipped, and entries
    directly under a split that are not directories are ignored.

    A file counts as a video when its name ends in ``.mp4``, ``.mov`` or
    ``.avi``; the comparison ignores case, so ``clip.MP4`` and ``clip.Avi``
    are counted as well.

    Args:
        dataset_path: Root directory that contains the split sub-directories.

    Returns:
        A dict mapping each class name to ``{'train': int, 'val': int}``.
        A class found in only one split has 0 for the other split.
    """
    video_extensions = ('.mp4', '.mov', '.avi')
    stats = {}
    for split in ['train', 'val']:
        split_path = os.path.join(dataset_path, split)
        # isdir (not exists): a stray file named like the split would make
        # os.listdir raise.
        if not os.path.isdir(split_path):
            continue
        for class_name in os.listdir(split_path):
            class_path = os.path.join(split_path, class_name)
            if not os.path.isdir(class_path):
                continue
            # Match on the lower-cased name so the extension check does not
            # depend on how the recording device capitalised the suffix.
            video_count = sum(
                1
                for file_name in os.listdir(class_path)
                if file_name.lower().endswith(video_extensions)
            )
            class_counts = stats.setdefault(class_name, {'train': 0, 'val': 0})
            class_counts[split] += video_count
    return stats

def save_stats_to_csv(stats, output_csv):
    """Write the per-class video counts to ``output_csv``.

    The file starts with the header ``#, Class, Train, Val, Total`` and is
    followed by one row per class in sorted class-name order. ``#`` is a
    1-based row number and ``Total`` is the sum of the train and val counts.

    Args:
        stats: Dict mapping class name to ``{'train': int, 'val': int}``.
        output_csv: Path of the CSV file to create or overwrite.
    """
    header = ['#', 'Class', 'Train', 'Val', 'Total']
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for row_number, (label, counts) in enumerate(sorted(stats.items()), start=1):
            n_train, n_val = counts['train'], counts['val']
            writer.writerow([row_number, label, n_train, n_val, n_train + n_val])

# Root of the dataset tree passed to count_videos, which looks for 'train' and
# 'val' sub-directories beneath it.
dataset_path = '/content/content/output'
# Destination CSV; the path is relative, so the file is created in the current
# working directory.
output_csv = 'dataset_statistics.csv'

# Count videos and save the statistics
stats = count_videos(dataset_path)
save_stats_to_csv(stats, output_csv)

print(f'Statistics saved to {output_csv}')


Statistics saved to dataset_statistics.csv


In [4]:
import os
import csv

def count_videos(dataset_path):
    """Count the files of every class per split, separating mp4 from other formats.

    The directory layout walked is ``<dataset_path>/<split>/<class_name>/<file>``
    for the 'train' and 'val' splits. A split that is missing (or is not a
    directory) is skipped, and entries directly under a split that are not
    directories are ignored.

    Inside a class directory only regular files are counted; nested
    directories are skipped. A file is 'mp4' when its name ends in ``.mp4``
    (case-insensitive, so ``clip.MP4`` counts as mp4); every other regular
    file is counted as 'non-mp4', whatever its extension.

    Args:
        dataset_path: Root directory that contains the split sub-directories.

    Returns:
        A dict mapping each class name to
        ``{'train': {'mp4': int, 'non-mp4': int}, 'val': {'mp4': int, 'non-mp4': int}}``.
        A class found in only one split has zeros for the other split.
    """
    stats = {}
    for split in ['train', 'val']:
        split_path = os.path.join(dataset_path, split)
        # isdir (not exists): a stray file named like the split would make
        # os.listdir raise.
        if not os.path.isdir(split_path):
            continue
        for class_name in os.listdir(split_path):
            class_path = os.path.join(split_path, class_name)
            if not os.path.isdir(class_path):
                continue
            mp4_count = 0
            non_mp4_count = 0
            for file_name in os.listdir(class_path):
                # A nested directory is not a video in any format.
                if not os.path.isfile(os.path.join(class_path, file_name)):
                    continue
                # Lower-case first so '.MP4' is not misreported as non-mp4.
                if file_name.lower().endswith('.mp4'):
                    mp4_count += 1
                else:
                    non_mp4_count += 1
            class_stats = stats.setdefault(
                class_name,
                {'train': {'mp4': 0, 'non-mp4': 0}, 'val': {'mp4': 0, 'non-mp4': 0}},
            )
            class_stats[split]['mp4'] += mp4_count
            class_stats[split]['non-mp4'] += non_mp4_count
    return stats

def save_stats_to_csv(stats, output_csv):
    """Write the per-class mp4 / non-mp4 counts to ``output_csv``.

    The file starts with a header row and is followed by one row per class in
    sorted class-name order: a 1-based row number, the class name, the train
    and val counts for each format, and the per-format totals across both
    splits.

    Args:
        stats: Dict mapping class name to
            ``{'train': {'mp4': int, 'non-mp4': int}, 'val': {'mp4': int, 'non-mp4': int}}``.
        output_csv: Path of the CSV file to create or overwrite.
    """
    header = ['#', 'Class', 'Train MP4', 'Train Non-MP4', 'Val MP4', 'Val Non-MP4', 'Total MP4', 'Total Non-MP4']
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for row_number, (label, per_split) in enumerate(sorted(stats.items()), start=1):
            train_counts = per_split['train']
            mp4_train, other_train = train_counts['mp4'], train_counts['non-mp4']
            val_counts = per_split['val']
            mp4_val, other_val = val_counts['mp4'], val_counts['non-mp4']
            writer.writerow([
                row_number,
                label,
                mp4_train,
                other_train,
                mp4_val,
                other_val,
                mp4_train + mp4_val,
                other_train + other_val,
            ])

# Root of the dataset tree passed to count_videos, which looks for 'train' and
# 'val' sub-directories beneath it.
dataset_path = '/content/content/output'
# NOTE(review): this is the same relative path the earlier statistics cell
# writes, but with a different column layout (mp4 / non-mp4 breakdown instead
# of plain totals). The file is opened in 'w' mode, so whichever cell runs last
# replaces the other's table — confirm whether a separate file name is wanted.
output_csv = 'dataset_statistics.csv'

# Count videos and save the statistics
stats = count_videos(dataset_path)
save_stats_to_csv(stats, output_csv)

print(f'Statistics saved to {output_csv}')


Statistics saved to dataset_statistics.csv
