In [None]:
# Install required libraries
!pip install kagglehub pandas plotly

import os
import pandas as pd
import plotly.express as px
import kagglehub
from google.colab import drive

# Step 1: Mount Google Drive (consistent with saving outputs later)
drive.mount('/content/drive')

# Step 2: Download the dataset using kagglehub
# Replace with the appropriate Kaggle dataset identifier
dataset_path = kagglehub.dataset_download("toxicmender/20bn-jester")
print("Path to dataset files:", dataset_path)

# Step 3: Verify the directory structure.
# Only the top level is listed. Walking the whole tree prints one line per
# extracted frame image, which floods the cell output (Colab truncated it to
# the last 5000 lines) without telling us anything about the layout.
top_level_entries = sorted(os.listdir(dataset_path))
print(f"Directory: {dataset_path}")
for entry in top_level_entries:
    entry_kind = "Directory" if os.path.isdir(os.path.join(dataset_path, entry)) else "File"
    print(f"  {entry_kind}: {entry}")

# Step 4: Define dataset paths dynamically
root_directory = dataset_path  # Using the downloaded and extracted directory
labels_csv_path = os.path.join(root_directory, 'labels.csv')
train_csv_path = os.path.join(root_directory, 'Train.csv')
val_csv_path = os.path.join(root_directory, 'Validation.csv')
test_csv_path = os.path.join(root_directory, 'Test.csv')

# Print every path and flag the ones that are not present on disk, so a
# missing file is noticed here rather than as a read error further down.
# NOTE(review): the CSV listing produced later in this notebook shows only
# Train/Validation/Test CSVs under the download - confirm where labels.csv
# is supposed to come from.
for description, csv_path in (
    ("Labels", labels_csv_path),
    ("Train", train_csv_path),
    ("Validation", val_csv_path),
    ("Test", test_csv_path),
):
    missing_marker = "" if os.path.isfile(csv_path) else "  [MISSING]"
    print(f"{description} CSV Path: {csv_path}{missing_marker}")

# Step 5: Load DataLoader with dynamic paths
# `from lib.data_loader import DataLoader` fails in this runtime with
# "No module named 'lib.data_loader'". Instead, put the folder that holds
# data_loader.py on the import path and import the module directly.
import sys

lib_path = '/content/drive/My Drive/lib'  # Update this path to your actual lib folder location
# Guarded so that re-running the cell does not keep adding the same entry.
if lib_path not in sys.path:
    sys.path.append(lib_path)

from data_loader import DataLoader

data = DataLoader(labels_csv_path, train_csv_path, val_csv_path, test_csv_path)

# Step 6: Preview training data
print(data.train_df.head(10))

# Step 7: Summary of data
print('Training data: ', len(data.train_df))
print('Validation data: ', len(data.val_df))
print('Test data: ', len(data.test_df))

# Step 8: Plot class distribution
def plot_class_distribution(df):
    """Show a bar chart with the number of rows per gesture label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'label'`` column; each distinct value becomes one bar.

    Returns
    -------
    None
        The Plotly figure is displayed with ``show()`` and not returned.
    """
    # value_counts() sorts by frequency, so the bars come out from the most
    # common label to the least common one.
    per_label = df['label'].value_counts()
    chart_data = pd.DataFrame(
        {
            'Hand Gesture': per_label.index,
            'Number of Videos': per_label.values,
        }
    )
    bar_options = dict(x='Hand Gesture', y='Number of Videos', color='Hand Gesture')
    px.bar(chart_data, **bar_options).show()

# Plot distributions (consistent with original code logic):
# the training split first, then the validation split.
for _split_df in (data.train_df, data.val_df):
    plot_class_distribution(_split_df)

# Step 9: Filter and process chosen classes
# Gesture classes kept for the reduced dataset, in the order in which they
# are written to the extracted-labels file.
chosen_classes = [
    'Swiping Left', 'Swiping Right',
    'Swiping Down', 'Swiping Up',
    'Sliding Two Fingers Down', 'Sliding Two Fingers Up',
    'Thumb Down', 'Thumb Up',
    'Stop Sign',
    'No gesture',
]

# Save chosen classes (consistent with original logic):
# one label per line, with neither an index column nor a header row.
extracted_label_path = os.path.join(root_directory, 'labels_extracted.csv')
extracted_label_df = pd.DataFrame(chosen_classes)
extracted_label_df.to_csv(extracted_label_path, header=False, index=False)
print(f"Extracted labels saved to {extracted_label_path}")

# Reload DataLoader with extracted labels (no test CSV is passed this time)
data = DataLoader(extracted_label_path, train_csv_path, val_csv_path)

# Step 10: Extract specific subsets of data
def extract_data(original_df, labels, start, end):
    """Collect a positional slice of rows for each requested label.

    For every entry of ``labels`` (in the given order) the rows of
    ``original_df`` whose ``'label'`` equals that entry are selected, and the
    rows at positions ``start`` through ``end`` of that selection are kept
    (0-based positions, ``end`` inclusive).

    Parameters
    ----------
    original_df : pandas.DataFrame
        Source frame; must contain a ``'label'`` column.
    labels : iterable
        Label values to extract, in the order they should appear in the result.
    start : int
        Position of the first row to keep within each label's rows.
    end : int
        Position of the last row to keep within each label's rows (inclusive).

    Returns
    -------
    pandas.DataFrame
        The kept rows grouped by label; rows keep their index labels from
        ``original_df``. When ``labels`` is empty, an empty frame with the
        columns ``['video_id', 'label']`` is returned.
    """
    # NOTE(review): the callers in this notebook pass start=1, which leaves
    # out the first matching row of every label - confirm this is intended.

    # DataFrame.append was removed in pandas 2.0, so the per-label slices are
    # gathered first and concatenated in a single call.
    per_label_rows = [
        original_df[original_df['label'] == label].iloc[start:end + 1]
        for label in labels
    ]
    if not per_label_rows:
        # pd.concat refuses an empty list; mirror the empty result instead.
        return pd.DataFrame(columns=['video_id', 'label'])
    return pd.concat(per_label_rows)

# Extract train and validation subsets: the same classes for both splits,
# with a larger positional window for training than for validation.
extracted_train_df = extract_data(data.train_df, chosen_classes, 1, 800)
extracted_val_df = extract_data(data.val_df, chosen_classes, 1, 200)

# Save processed subsets to Google Drive (consistent with original saving workflow)
output_path = '/content/drive/My Drive/JesterDataset/processed/'
os.makedirs(output_path, exist_ok=True)
extracted_train_path = os.path.join(output_path, 'train_extracted.csv')
extracted_val_path = os.path.join(output_path, 'validation_extracted.csv')

# Both files are written before anything is reported.
for _subset_df, _subset_path in (
    (extracted_train_df, extracted_train_path),
    (extracted_val_df, extracted_val_path),
):
    _subset_df.to_csv(_subset_path, index=False)

for _subset_name, _subset_path in (
    ("train", extracted_train_path),
    ("validation", extracted_val_path),
):
    print(f"Processed {_subset_name} data saved to {_subset_path}")

# Step 11: Reload DataLoader with processed data
data = DataLoader(extracted_label_path, extracted_train_path, extracted_val_path)

# Plot distributions for the extracted subsets (consistent with your original flow)
for _split_df in (data.train_df, data.val_df):
    plot_class_distribution(_split_df)


Mounted at /content/drive
Downloading from https://www.kaggle.com/api/v1/datasets/download/toxicmender/20bn-jester?dataset_version_number=3...


100%|██████████| 11.9G/11.9G [02:23<00:00, 89.2MB/s]

Extracting files...





[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  File: 00003.jpg
  File: 00015.jpg
  File: 00010.jpg
  File: 00037.jpg
  File: 00017.jpg
  File: 00005.jpg
  File: 00021.jpg
  File: 00001.jpg
  File: 00020.jpg
  File: 00011.jpg
  File: 00025.jpg
  File: 00029.jpg
  File: 00034.jpg
  File: 00002.jpg
  File: 00024.jpg
  File: 00006.jpg
  File: 00012.jpg
  File: 00027.jpg
Directory: /root/.cache/kagglehub/datasets/toxicmender/20bn-jester/versions/3/Validation/107557
  File: 00014.jpg
  File: 00028.jpg
  File: 00023.jpg
  File: 00019.jpg
  File: 00036.jpg
  File: 00007.jpg
  File: 00013.jpg
  File: 00026.jpg
  File: 00031.jpg
  File: 00004.jpg
  File: 00022.jpg
  File: 00009.jpg
  File: 00008.jpg
  File: 00018.jpg
  File: 00035.jpg
  File: 00030.jpg
  File: 00033.jpg
  File: 00016.jpg
  File: 00032.jpg
  File: 00003.jpg
  File: 00015.jpg
  File: 00010.jpg
  File: 00037.jpg
  File: 00017.jpg
  File: 00005.jpg
  File: 00021.jpg
  File: 00001.jpg
  File: 00020.jpg
  File: 000

ModuleNotFoundError: No module named 'lib.data_loader'

In [None]:
# Mount Google Drive
from google.colab import drive
import sys
import os

# pandas and plotly are used further down in this cell. Importing them here
# keeps the cell runnable on its own instead of relying on another cell
# having been executed first.
import pandas as pd
import plotly.express as px

drive.mount('/content/drive')

# Define the path to the lib folder
lib_path = '/content/drive/My Drive/lib'  # Update this path to your actual lib folder location
# Guarded so that re-running the cell does not keep adding the same entry.
if lib_path not in sys.path:
    sys.path.append(lib_path)

# Import DataLoader (data_loader.py is expected directly inside lib_path)
from data_loader import DataLoader

# Define dataset paths: all four CSV files are looked up directly inside
# the dataset folder on Drive.
root_directory = '/content/drive/My Drive/dataset'
labels_csv_path, train_csv_path, val_csv_path, test_csv_path = (
    os.path.join(root_directory, csv_name)
    for csv_name in ('labels.csv', 'Train.csv', 'Validation.csv', 'Test.csv')
)

# Initialize DataLoader with the correct paths
data = DataLoader(labels_csv_path, train_csv_path, val_csv_path, test_csv_path)

# Preview the training data (first ten rows)
print("Training Data Sample:")
print(data.train_df.head(10))

# Summary of data: number of rows in each split
for _split_name, _split_df in (
    ("Training", data.train_df),
    ("Validation", data.val_df),
    ("Test", data.test_df),
):
    print(f"{_split_name} data: {len(_split_df)}")

# Function to plot class distribution
def plot_class_distribution(df):
    """Display a Plotly bar chart of how often each label occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'label'`` column; one bar is drawn per distinct value.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``; nothing is returned.
    """
    # Turn the frequency Series into a two-column table whose column names
    # double as the axis titles of the chart.
    counts_table = (
        df['label']
        .value_counts()
        .rename_axis('Hand Gesture')
        .reset_index(name='Number of Videos')
    )
    figure = px.bar(
        counts_table,
        x='Hand Gesture',
        y='Number of Videos',
        color='Hand Gesture',
    )
    figure.show()

# Plot class distributions: each chart is preceded by a caption line.
for _caption, _split_df in (
    ("Class Distribution in Training Data:", data.train_df),
    ("Class Distribution in Validation Data:", data.val_df),
):
    print(_caption)
    plot_class_distribution(_split_df)

# Define chosen classes for filtering.
# The order here is the order in which the labels are written to disk.
chosen_classes = [
    'Swiping Left', 'Swiping Right',
    'Swiping Down', 'Swiping Up',
    'Sliding Two Fingers Down', 'Sliding Two Fingers Up',
    'Thumb Down', 'Thumb Up',
    'Stop Sign',
    'No gesture',
]

# Save chosen classes to a CSV: one label per line, no index, no header.
extracted_label_path = os.path.join(root_directory, 'labels_extracted.csv')
extracted_label_df = pd.DataFrame(chosen_classes)
extracted_label_df.to_csv(extracted_label_path, header=False, index=False)
print(f"Extracted labels saved to: {extracted_label_path}")

# Reload DataLoader with filtered labels (no test CSV is passed this time)
data = DataLoader(extracted_label_path, train_csv_path, val_csv_path)

# Function to extract specific subsets of data
def extract_data(original_df, labels, start, end):
    """Collect a positional slice of rows for each requested label.

    For every entry of ``labels`` (in the given order) the rows of
    ``original_df`` whose ``'label'`` equals that entry are selected, and the
    rows at positions ``start`` through ``end`` of that selection are kept
    (0-based positions, ``end`` inclusive).

    Parameters
    ----------
    original_df : pandas.DataFrame
        Source frame; must contain a ``'label'`` column.
    labels : iterable
        Label values to extract, in the order they should appear in the result.
    start : int
        Position of the first row to keep within each label's rows.
    end : int
        Position of the last row to keep within each label's rows (inclusive).

    Returns
    -------
    pandas.DataFrame
        The kept rows grouped by label, re-indexed ``0..n-1``. When ``labels``
        is empty, an empty frame with the columns ``['video_id', 'label']``
        is returned.
    """
    # NOTE(review): the callers in this notebook pass start=1, which leaves
    # out the first matching row of every label - confirm this is intended.

    # Gather all slices and concatenate once. Growing a frame with pd.concat
    # inside the loop re-copies the accumulated rows on every iteration, and
    # seeding it with an empty frame triggers the pandas deprecation about
    # empty entries in concat.
    per_label_rows = [
        original_df[original_df['label'] == label].iloc[start:end + 1]
        for label in labels
    ]
    if not per_label_rows:
        # pd.concat refuses an empty list; mirror the empty result instead.
        return pd.DataFrame(columns=['video_id', 'label'])
    return pd.concat(per_label_rows, ignore_index=True)


# Extract subsets for training and validation: the same classes for both
# splits, with a larger positional window for training than for validation.
extracted_train_df = extract_data(data.train_df, chosen_classes, 1, 800)
extracted_val_df = extract_data(data.val_df, chosen_classes, 1, 200)

# Save extracted subsets
output_path = '/content/drive/My Drive/dataset/processed/'
os.makedirs(output_path, exist_ok=True)
extracted_train_path = os.path.join(output_path, 'train_extracted.csv')
extracted_val_path = os.path.join(output_path, 'validation_extracted.csv')

# Both files are written before anything is reported.
for _subset_df, _subset_path in (
    (extracted_train_df, extracted_train_path),
    (extracted_val_df, extracted_val_path),
):
    _subset_df.to_csv(_subset_path, index=False)

for _subset_name, _subset_path in (
    ("training", extracted_train_path),
    ("validation", extracted_val_path),
):
    print(f"Extracted {_subset_name} data saved to: {_subset_path}")


# Reload DataLoader with processed data
data = DataLoader(extracted_label_path, extracted_train_path, extracted_val_path)

# Plot distributions for the extracted subsets, each preceded by a caption.
for _caption, _split_df in (
    ("Class Distribution in Extracted Training Data:", data.train_df),
    ("Class Distribution in Extracted Validation Data:", data.val_df),
):
    print(_caption)
    plot_class_distribution(_split_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Data Sample:
   video_id                        label
0         1           Doing other things
1         3     Pushing Two Fingers Away
2         6             Drumming Fingers
3        11     Sliding Two Fingers Down
4        14            Pushing Hand Away
5        17                 Shaking Hand
6        20           Doing other things
7        28       Pulling Two Fingers In
8        31                    Stop Sign
9        34  Zooming In With Two Fingers
Training data: 50420
Validation data: 7047
Test data: 6981
Class Distribution in Training Data:


Class Distribution in Validation Data:


Extracted labels saved to: /content/drive/My Drive/dataset/labels_extracted.csv
Extracted training data saved to: /content/drive/My Drive/dataset/processed/train_extracted.csv
Extracted validation data saved to: /content/drive/My Drive/dataset/processed/validation_extracted.csv
Class Distribution in Extracted Training Data:


Class Distribution in Extracted Validation Data:


In [None]:
import os

# Define the dataset path (location of the kagglehub download cache)
dataset_path = '/root/.cache/kagglehub/datasets/toxicmender/20bn-jester/versions/3'

# Check for CSV files only: walk the whole tree but report nothing except
# files whose name ends in '.csv'.
print("CSV files in the dataset directory:")
for _folder, _subfolders, _filenames in os.walk(dataset_path):
    for _csv_name in (name for name in _filenames if name.endswith('.csv')):
        print(f"File: {os.path.join(_folder, _csv_name)}")


CSV files in the dataset directory:
File: /root/.cache/kagglehub/datasets/toxicmender/20bn-jester/versions/3/Validation.csv
File: /root/.cache/kagglehub/datasets/toxicmender/20bn-jester/versions/3/Train.csv
File: /root/.cache/kagglehub/datasets/toxicmender/20bn-jester/versions/3/Test.csv


In [None]:
import pandas as pd

In [None]:
# Install and import required libraries
!pip install plotly
import plotly.express as px


