In [4]:
import os
import zipfile
import urllib.request
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader

# Directory where archives are staged and datasets are extracted
DATA_DIR = 'datasets'
os.makedirs(DATA_DIR, exist_ok=True)

def download_dataset(dataset_name, url):
    """
    Downloads a zip archive and extracts it into DATA_DIR/<dataset_name>.

    The archive is staged as DATA_DIR/<dataset_name>.zip and is always deleted
    afterwards, even when the download or the extraction fails, so a broken
    attempt never leaves a stale or partial zip behind.

    Args:
        dataset_name (str): Name used for the staged zip and the extraction folder.
        url (str): Location of the zip archive.

    Returns:
        tuple: (extract_path, file_names) where extract_path is the extraction
        folder and file_names is the archive's member list (zip order).

    Raises:
        Any exception raised by urllib.request.urlretrieve or zipfile
        (e.g. zipfile.BadZipFile for an invalid archive) is propagated.
    """
    zip_path = os.path.join(DATA_DIR, f"{dataset_name}.zip")
    extract_path = os.path.join(DATA_DIR, dataset_name)

    try:
        # Download the dataset
        print(f"Downloading {dataset_name} from {url}...")
        urllib.request.urlretrieve(url, zip_path)

        # Extract the zip file
        print(f"Extracting {dataset_name}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
            file_names = zip_ref.namelist()
            print(f"Files in the zip: {file_names}")
    finally:
        # Remove the staged zip whether or not the steps above succeeded;
        # the existence check avoids masking a download error that happened
        # before the file was created.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    print(f"Dataset {dataset_name} extracted to {extract_path}.")
    return extract_path, file_names

def load_data(file_path, encoding='ISO-8859-1'):
    """
    Reads a CSV file into a pandas DataFrame.

    Rows that pandas cannot parse are skipped instead of raising.

    Args:
        file_path (str): Path of the CSV file to read.
        encoding (str): Text encoding used to decode the file.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    print(f"Loading data from {file_path}")
    read_options = {'encoding': encoding, 'on_bad_lines': 'skip'}
    return pd.read_csv(file_path, **read_options)

def preprocess_data(df, batch_size=64, time_steps=30):
    """
    Preprocesses the data:
    - Drops Date and Time columns.
    - Uses all remaining columns except Room_Occupancy_Count as features.
    - Truncates to a multiple of time_steps and reshapes into 3D arrays
      (samples, time_steps, features); each sequence takes the label of its
      first row.
    - Splits into training (60%), validation (20%), and test (20%) sets.
    - Normalizes the features with statistics fitted on the training set only,
      so no validation/test information leaks into training.
    - Converts to PyTorch tensors and builds DataLoaders (drop_last=True, so a
      trailing partial batch in any split is not yielded).

    Args:
        df (pd.DataFrame): Raw data containing 'Date', 'Time' and
            'Room_Occupancy_Count' columns plus the feature columns.
        batch_size (int): Batch size for all three DataLoaders.
        time_steps (int): Number of consecutive rows per sequence.

    Returns:
        tuple: (train_loader, valid_loader, test_loader,
                X_train, X_valid, X_test, y_train, y_valid, y_test)

    Raises:
        ValueError: If df has fewer than time_steps rows.
    """
    print("Starting preprocessing...")

    # Drop Date and Time columns
    df = df.drop(columns=['Date', 'Time'])
    print("Dropped Date and Time columns.")

    # Use all remaining columns as features
    X = df.drop(columns=['Room_Occupancy_Count'])
    y = df['Room_Occupancy_Count']
    features = X.shape[1]  # Set features dynamically based on remaining columns
    print(f"Total features used: {features}")

    # Ensure total samples are compatible with reshaping
    total_samples = (len(X) // time_steps) * time_steps
    if total_samples == 0:
        raise ValueError(
            f"Need at least {time_steps} rows to build one sequence, got {len(X)}."
        )
    X, y = X.iloc[:total_samples], y.iloc[:total_samples]
    print(f"Adjusted dataset size: {X.shape}, Labels size: {y.shape}")

    # Reshape X into (samples, time_steps, features)
    X = X.values.reshape(-1, time_steps, features)
    y = y.values.reshape(-1, time_steps)[:, 0]  # One label per sequence: the label of its first row
    print(f"Reshaped X to: {X.shape}, Reshaped y to: {y.shape}")

    # Split into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    print("Data split into training, validation, and test sets.")

    # Normalize features. The scaler works on 2D input, so each split is
    # flattened to (rows, features), scaled, and folded back into sequences.
    # It is fitted on the training split only to avoid data leakage.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, features)).reshape(-1, time_steps, features)
    X_valid = scaler.transform(X_valid.reshape(-1, features)).reshape(-1, time_steps, features)
    X_test = scaler.transform(X_test.reshape(-1, features)).reshape(-1, time_steps, features)
    print("Features normalized.")

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.int64)

    X_valid = torch.tensor(X_valid, dtype=torch.float32)
    y_valid = torch.tensor(y_valid, dtype=torch.int64)

    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.int64)

    # Output dataset shapes
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Create DataLoaders
    train_dataset = TensorDataset(X_train, y_train)
    valid_dataset = TensorDataset(X_valid, y_valid)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    print("Preprocessing complete.")
    return train_loader, valid_loader, test_loader, X_train, X_valid, X_test, y_train, y_valid, y_test

# Main script: download, load, and preprocess the Room Occupancy dataset
if __name__ == "__main__":
    dataset_name = 'RoomOccupancy'
    dataset_url = 'https://archive.ics.uci.edu/static/public/864/room+occupancy+estimation.zip'

    # Download and extract the dataset
    extract_path, file_names = download_dataset(dataset_name, dataset_url)

    # Load the data. Select the first CSV member explicitly instead of taking
    # entry 0 of the archive listing, which could be a directory or a README.
    csv_members = [name for name in file_names if name.lower().endswith('.csv')]
    if not csv_members:
        raise FileNotFoundError(f"No CSV file found in the archive; members: {file_names}")
    main_file_path = os.path.join(extract_path, csv_members[0])
    df = load_data(main_file_path)

    # Preprocess the data
    train_loader, valid_loader, test_loader, X_train, X_valid, X_test, y_train, y_valid, y_test = preprocess_data(df)

    # Output the number of classes (distinct labels present in the training split)
    n_classes = len(torch.unique(y_train))
    print(f"Number of classes: {n_classes}")


Downloading RoomOccupancy from https://archive.ics.uci.edu/static/public/864/room+occupancy+estimation.zip...
Extracting RoomOccupancy...
Files in the zip: ['Occupancy_Estimation.csv']
Dataset RoomOccupancy extracted to datasets/RoomOccupancy.
Loading data from datasets/RoomOccupancy/Occupancy_Estimation.csv
Starting preprocessing...
Dropped Date and Time columns.
Total features used: 16
Adjusted dataset size: (10110, 16), Labels size: (10110,)
Features normalized.
Reshaped X to: (337, 30, 16), Reshaped y to: (337,)
Data split into training, validation, and test sets.
X_train shape: torch.Size([202, 30, 16]), y_train shape: torch.Size([202])
X_valid shape: torch.Size([67, 30, 16]), y_valid shape: torch.Size([67])
X_test shape: torch.Size([68, 30, 16]), y_test shape: torch.Size([68])
Preprocessing complete.
Number of classes: 4


In [9]:
import os
import zipfile
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Directory where archives are staged and datasets are extracted
DATA_DIR = 'datasets'
os.makedirs(DATA_DIR, exist_ok=True)

def download_dataset(dataset_name, url):
    """
    Downloads a zip archive and extracts it into DATA_DIR/<dataset_name>.

    The archive is staged as DATA_DIR/<dataset_name>.zip and is always deleted
    afterwards, even when the download or the extraction fails, so a broken
    attempt never leaves a stale or partial zip behind.

    Args:
        dataset_name (str): Name used for the staged zip and the extraction folder.
        url (str): Location of the zip archive.

    Returns:
        str: Path of the folder the archive was extracted into.

    Raises:
        Any exception raised by urllib.request.urlretrieve or zipfile
        (e.g. zipfile.BadZipFile for an invalid archive) is propagated.
    """
    zip_path = os.path.join(DATA_DIR, f"{dataset_name}.zip")
    extract_path = os.path.join(DATA_DIR, dataset_name)

    try:
        # Download the dataset
        print("Starting download...")
        urllib.request.urlretrieve(url, zip_path)
        print("Download complete.")

        # Extract the zip file
        print("Extracting dataset...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Remove the staged zip whether or not the steps above succeeded;
        # the existence check avoids masking a download error that happened
        # before the file was created.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    print("Extraction complete.")
    return extract_path

def load_emg_data(directory):
    """
    Loads every '.txt' file under `directory` into one numeric DataFrame.

    Each file is parsed as whitespace-delimited text with no header after
    skipping its first three lines. A file is ignored when it cannot be
    parsed, has no rows left after the skip, has fewer than two columns, or
    has a non-numeric value in its second column. Remaining values that are
    not numeric become NaN.

    Directories and files are visited in sorted order so the row order of the
    result (and therefore every sequence built from it) is reproducible across
    machines and filesystems.

    NOTE(review): skiprows=3 drops the first three lines of every file;
    confirm this matches the header layout of the data files.
    NOTE(review): the last column is treated as the label; confirm against the
    dataset's column layout.

    Args:
        directory (str): Root folder searched recursively for '.txt' files.

    Returns:
        pd.DataFrame: All accepted files concatenated row-wise, with columns
        'feature_0' ... 'feature_{k-1}' followed by 'label' (last column).

    Raises:
        ValueError: If no file yields usable numeric data.
    """
    data_frames = []

    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for file_name in sorted(files):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(root, file_name)

            try:
                # Raw string: '\s' is a regex escape, not a Python string escape
                df = pd.read_csv(file_path, sep=r'\s+', header=None, skiprows=3)
            except (pd.errors.ParserError, pd.errors.EmptyDataError):
                # Unparseable file, or nothing left after skipping the header rows
                continue

            # The numeric check below inspects the second column, so a frame
            # with fewer than two columns cannot be validated and is skipped
            if df.empty or df.shape[1] < 2:
                continue
            if not pd.to_numeric(df.iloc[:, 1], errors='coerce').notna().all():
                continue

            # Convert all data to numeric (non-numeric cells become NaN)
            data_frames.append(df.apply(pd.to_numeric, errors='coerce'))

    if not data_frames:
        raise ValueError("No valid numeric data files found for concatenation.")

    # Concatenate and reset index
    full_df = pd.concat(data_frames, axis=0).reset_index(drop=True)
    full_df.columns = [f'feature_{i}' for i in range(full_df.shape[1] - 1)] + ['label']  # Assign last column as label

    print("Data loaded successfully.")
    return full_df

def preprocess_data(df, batch_size=64, time_steps=30):
    """
    Turns the EMG DataFrame into sequence tensors and DataLoaders.

    - Treats the last column as the label and all other columns as features.
    - Trims the rows to a multiple of time_steps and reshapes them into
      (num_sequences, time_steps, num_features); each sequence is labelled
      with the label of its first row.
    - Splits the sequences into train (60%), validation (20%), and test (20%).
    - Normalizes the features with a scaler fitted on the training set only.
    - Converts everything into PyTorch tensors.
    - Creates DataLoaders for supervised tasks (with labels); all of them use
      drop_last=True.

    Returns:
        tuple: (train_loader, valid_loader, test_loader,
                X_train, X_valid, X_test, y_train, y_valid, y_test)
    """
    print("Starting preprocessing...")

    feature_frame, label_series = df.iloc[:, :-1], df.iloc[:, -1]
    num_features = feature_frame.shape[1]

    # Keep only as many rows as fit into whole sequences
    usable_rows = (len(feature_frame) // time_steps) * time_steps
    sequences = feature_frame.iloc[:usable_rows].values.reshape(-1, time_steps, num_features)
    sequence_labels = label_series.iloc[:usable_rows].values.reshape(-1, time_steps)[:, 0]

    # 60% train, then the remaining 40% is halved into validation and test
    X_train, X_rest, y_train, y_rest = train_test_split(
        sequences, sequence_labels, test_size=0.4, random_state=42
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=42
    )

    scaler = StandardScaler()

    def _scaled(block, fit=False):
        # The scaler needs 2D input: flatten to rows, scale, fold back into sequences
        rows = block.reshape(-1, num_features)
        rows = scaler.fit_transform(rows) if fit else scaler.transform(rows)
        return rows.reshape(-1, time_steps, num_features)

    X_train = _scaled(X_train, fit=True)
    X_valid = _scaled(X_valid)
    X_test = _scaled(X_test)

    X_train, X_valid, X_test = (
        torch.tensor(block, dtype=torch.float32) for block in (X_train, X_valid, X_test)
    )
    y_train, y_valid, y_test = (
        torch.tensor(block, dtype=torch.long) for block in (y_train, y_valid, y_test)
    )

    # Report the shape of every split
    for tag, feats, labels in (
        ("train", X_train, y_train),
        ("valid", X_valid, y_valid),
        ("test", X_test, y_test),
    ):
        print(f"X_{tag} shape: {feats.shape}, y_{tag} shape: {labels.shape}")

    print(f"\nEach set details:")
    for title, feats in (("Training", X_train), ("Validation", X_valid), ("Test", X_test)):
        print(f"- {title} set: {feats.shape[0]} sequences, {time_steps} time steps, {num_features} features per step.")

    def _loader(feats, labels, shuffle):
        return DataLoader(TensorDataset(feats, labels), batch_size=batch_size, shuffle=shuffle, drop_last=True)

    train_loader = _loader(X_train, y_train, True)
    valid_loader = _loader(X_valid, y_valid, False)
    test_loader = _loader(X_test, y_test, False)

    return train_loader, valid_loader, test_loader, X_train, X_valid, X_test, y_train, y_valid, y_test

def main():
    """
    Runs the EMG pipeline end to end: download and extract the archive, load
    the text files into one DataFrame, build the train/validation/test
    tensors and loaders, and print how many distinct labels the training
    split contains.
    """
    name = 'EMG_Gestures'
    url = 'https://archive.ics.uci.edu/static/public/481/emg+data+for+gestures.zip'

    # Download + extract, then read everything into a single frame
    # (labels end up in the last column)
    emg_frame = load_emg_data(download_dataset(name, url))

    # Build loaders and tensors for the three splits
    (
        train_loader, valid_loader, test_loader,
        X_train, X_valid, X_test,
        y_train, y_valid, y_test,
    ) = preprocess_data(emg_frame)

    # Report the number of distinct labels seen in the training split
    print(f"Number of classes: {len(torch.unique(y_train))}")

if __name__ == "__main__":
    main()


Starting download...
Download complete.
Extracting dataset...
Extraction complete.
Data loaded successfully.
Starting preprocessing...
X_train shape: torch.Size([84754, 30, 9]), y_train shape: torch.Size([84754])
X_valid shape: torch.Size([28252, 30, 9]), y_valid shape: torch.Size([28252])
X_test shape: torch.Size([28252, 30, 9]), y_test shape: torch.Size([28252])

Each set details:
- Training set: 84754 sequences, 30 time steps, 9 features per step.
- Validation set: 28252 sequences, 30 time steps, 9 features per step.
- Test set: 28252 sequences, 30 time steps, 9 features per step.
Number of classes: 8


In [11]:
import os
import zipfile
import urllib.request
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader

# Directory where archives are staged and datasets are extracted
DATA_DIR = 'datasets'
os.makedirs(DATA_DIR, exist_ok=True)

def download_dataset(dataset_name, url):
    """
    Downloads a zip archive and extracts it into DATA_DIR/<dataset_name>.

    The archive is staged as DATA_DIR/<dataset_name>.zip and is always deleted
    afterwards, even when the download or the extraction fails, so a broken
    attempt never leaves a stale or partial zip behind.

    Args:
        dataset_name (str): Name used for the staged zip and the extraction folder.
        url (str): Location of the zip archive.

    Returns:
        str: Path of the folder the archive was extracted into.

    Raises:
        Any exception raised by urllib.request.urlretrieve or zipfile
        (e.g. zipfile.BadZipFile for an invalid archive) is propagated.
    """
    zip_path = os.path.join(DATA_DIR, f"{dataset_name}.zip")
    extract_path = os.path.join(DATA_DIR, dataset_name)

    try:
        # Download the dataset
        urllib.request.urlretrieve(url, zip_path)

        # Extract the zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    finally:
        # Remove the staged zip whether or not the steps above succeeded;
        # the existence check avoids masking a download error that happened
        # before the file was created.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    return extract_path

def load_data(directory):
    """
    Loads and cleans the dataset from text files in the specified directory.

    Every '.txt' file found recursively under `directory` is parsed as
    ';'-delimited text without a header. All values are coerced to numbers
    and any row containing a non-numeric or missing value is dropped (this
    also removes textual header rows). Files that cannot be parsed, or that
    have no rows left after cleaning, are ignored.

    Directories and files are visited in sorted order so the row order of the
    result (and therefore every sequence built from it) is reproducible across
    machines and filesystems.

    NOTE(review): the last column is returned as the label `y`; confirm that
    the data files really store the class label there.

    Args:
        directory (str): Root folder searched recursively for '.txt' files.

    Returns:
        tuple: (X, y) where X is a DataFrame with every column but the last
        and y is a Series holding the last column.

    Raises:
        ValueError: If no file yields any usable rows.
    """
    data_frames = []

    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for file_name in sorted(files):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(root, file_name)
            try:
                df = pd.read_csv(file_path, delimiter=';', header=None, on_bad_lines='skip')
                df = df.apply(pd.to_numeric, errors='coerce').dropna()
            except (pd.errors.ParserError, ValueError):
                continue
            # Skip frames with no surviving rows: they add nothing, and
            # concatenating empty frames is deprecated in recent pandas
            if not df.empty:
                data_frames.append(df)

    if not data_frames:
        raise ValueError("No valid data files found for concatenation.")

    full_df = pd.concat(data_frames, axis=0).reset_index(drop=True)
    X = full_df.iloc[:, :-1]
    y = full_df.iloc[:, -1]
    return X, y

def preprocess_data(X, y, batch_size=64, time_steps=30):
    """
    Builds sequence tensors and DataLoaders from row-wise features and labels.

    - Trims the rows to a multiple of time_steps and reshapes them into
      (num_sequences, time_steps, num_features); each sequence is labelled
      with the label of its first row.
    - Splits the sequences into training (60%), validation (20%), and test (20%).
    - Normalizes the features with statistics fitted on the training set only.
    - Converts everything into PyTorch tensors.
    - Creates DataLoaders for training, validation, and testing; all of them
      use drop_last=True.

    Returns:
        tuple: (train_loader, valid_loader, test_loader,
                X_train, X_valid, X_test, y_train, y_valid, y_test)
    """
    num_features = X.shape[1]
    window_shape = (-1, time_steps, num_features)

    # Discard the trailing rows that do not fill a whole sequence
    n_rows = len(X) - (len(X) % time_steps)
    X_seq = X.iloc[:n_rows].values.reshape(window_shape)
    y_seq = y.iloc[:n_rows].values.reshape(-1, time_steps)[:, 0]

    # 60/40 split first, then the 40% is divided evenly into validation and test
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X_seq, y_seq, test_size=0.4, random_state=42
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    # Fit the scaler on flattened training rows only, then apply it to every split
    scaler = StandardScaler().fit(X_train.reshape(-1, num_features))
    X_train, X_valid, X_test = [
        torch.tensor(
            scaler.transform(part.reshape(-1, num_features)).reshape(window_shape),
            dtype=torch.float32,
        )
        for part in (X_train, X_valid, X_test)
    ]
    y_train, y_valid, y_test = [
        torch.tensor(part, dtype=torch.long) for part in (y_train, y_valid, y_test)
    ]

    # Print the shapes and structured details
    named_splits = {
        "train": ("Training", X_train, y_train),
        "valid": ("Validation", X_valid, y_valid),
        "test": ("Test", X_test, y_test),
    }
    for key, (_, feats, labels) in named_splits.items():
        print(f"X_{key} shape: {feats.shape}, y_{key} shape: {labels.shape}")
    print(f"\nEach set details:")
    for title, feats, _ in named_splits.values():
        print(f"- {title} set: {feats.shape[0]} sequences, {time_steps} time steps, {num_features} features per step.")

    # Only the training loader shuffles; every loader drops a trailing partial batch
    loader_options = {"batch_size": batch_size, "drop_last": True}
    train_loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, **loader_options)
    valid_loader = DataLoader(TensorDataset(X_valid, y_valid), shuffle=False, **loader_options)
    test_loader = DataLoader(TensorDataset(X_test, y_test), shuffle=False, **loader_options)

    return train_loader, valid_loader, test_loader, X_train, X_valid, X_test, y_train, y_valid, y_test

# Script entry point: download, load, and preprocess the physical therapy dataset
if __name__ == "__main__":
    dataset_name, dataset_url = (
        'PhysicalTherapyExercises',
        'https://archive.ics.uci.edu/static/public/730/physical+therapy+exercises+dataset.zip',
    )

    # Fetch the archive and unpack it under the datasets folder
    extract_path = download_dataset(dataset_name, dataset_url)

    # Read the extracted text files into features and labels
    X, y = load_data(extract_path)

    # Build loaders and tensors for the three splits
    (
        train_loader, valid_loader, test_loader,
        X_train, X_valid, X_test,
        y_train, y_valid, y_test,
    ) = preprocess_data(X, y)

    # Report how many distinct labels the training split contains
    n_classes = torch.unique(y_train).numel()
    print(f"Number of classes: {n_classes}")


X_train shape: torch.Size([27562, 30, 9]), y_train shape: torch.Size([27562])
X_valid shape: torch.Size([9187, 30, 9]), y_valid shape: torch.Size([9187])
X_test shape: torch.Size([9188, 30, 9]), y_test shape: torch.Size([9188])

Each set details:
- Training set: 27562 sequences, 30 time steps, 9 features per step.
- Validation set: 9187 sequences, 30 time steps, 9 features per step.
- Test set: 9188 sequences, 30 time steps, 9 features per step.
Number of classes: 3
