In [1]:
import argparse
import os
import random
import shutil
import traceback
from pathlib import Path
from typing import Callable, Dict, List, Tuple

import numpy as np
import pandas as pd

from steps import (
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    BalanceToMinimumClassAndUser,
    FilterByCommonRows,
)

from pipelines import match_columns, pipelines

# Set the seed for reproducibility

# One seed shared by every random number generator used in this notebook, so
# the value only has to be changed in a single place.
SEED = 42

np.random.seed(SEED)  # NumPy's legacy global generator
random.seed(SEED)  # Python's standard-library generator


In [23]:
# Where the raw files of each dataset live, keyed by dataset name.
# The values are sub-folders that get appended to the "data/original" folder
# when a dataset is loaded. Folder names are kept exactly as they exist on disk.
dataset_paths: Dict[str, str] = dict(
    KuHar="KuHar/1.Raw_time_domian_data",
    MotionSense="MotionSense/A_DeviceMotion_data",
    WISDM="WISDM/wisdm-dataset/raw/phone",
    UCI="UCI/RawData",
    RealWorld="RealWorld/realworld2016_dataset",
)

In [25]:
def read_kuhar(kuhar_dir_path: str) -> pd.DataFrame:
    """Read the Kuhar dataset and return a DataFrame with the data (coming from all CSV files)

    The directory is searched recursively for ``*.csv`` files. Each file is
    expected to sit in an activity folder named ``<activity code>.<name>``
    (e.g. ``5.Lay``) and to be named ``<user>_<type>_<serial>.csv``
    (e.g. ``1052_F_1.csv``). A file that contains any missing value is skipped
    entirely.

    The returned dataframe has the following columns (in this order):
    - accel-x: Acceleration on the x axis
    - accel-y: Acceleration on the y axis
    - accel-z: Acceleration on the z axis
    - gyro-x: Angular velocity on the x axis
    - gyro-y: Angular velocity on the y axis
    - gyro-z: Angular velocity on the z axis
    - accel-start-time: Start time of the acceleration window
    - gyro-start-time: Start time of the gyroscope window
    - accel-end-time: End time of the acceleration window (equal to the start time)
    - gyro-end-time: End time of the gyroscope window (equal to the start time)
    - activity code: Activity code
    - index: Index of the sample coming from the csv
    - user: User code
    - serial: Serial number of the activity
    - csv: Name of the CSV file (``<activity folder>/<file name>``)

    Parameters
    ----------
    kuhar_dir_path : str
        Path to the Kuhar dataset (a ``pathlib.Path`` is accepted as well)

    Returns
    -------
    pd.DataFrame
        DataFrame with the data from the Kuhar dataset. The row labels are not
        reset after concatenation, so they restart at 0 for every CSV file.

    Raises
    ------
    ValueError
        If no usable CSV file is found under ``kuhar_dir_path`` (wrong path,
        empty directory, or every file contained missing values).
    """
    kuhar_dir_path = Path(kuhar_dir_path)

    # Data type of each column, listed in the order the columns appear in the
    # (header-less) CSV files
    feature_dtypes = {
        "accel-start-time": np.float32,
        "accel-x": np.float32,
        "accel-y": np.float32,
        "accel-z": np.float32,
        "gyro-start-time": np.float32,
        "gyro-x": np.float32,
        "gyro-y": np.float32,
        "gyro-z": np.float32,
    }

    # Order of the sensor columns in the returned dataframe
    column_order = [
        "accel-x",
        "accel-y",
        "accel-z",
        "gyro-x",
        "gyro-y",
        "gyro-z",
        "accel-start-time",
        "gyro-start-time",
    ]

    dfs = []
    for f in sorted(kuhar_dir_path.rglob("*.csv")):
        # The activity code is the part of the folder name before the first
        # dot (e.g. 5 for "5.Lay"); the rest of the name is not needed
        activity_no = int(f.parent.name.split(".", 1)[0])

        # Split the user code, the activity type and the serial number (ex.: [1055, 'G', 1])
        csv_splitted = f.stem.split("_")
        user = int(csv_splitted[0])
        serial = "_".join(csv_splitted[2:])

        # Read the CSV file
        df = pd.read_csv(
            f, names=list(feature_dtypes.keys()), dtype=feature_dtypes
        )

        # Remove dataframes that contain NaN
        if df.isnull().values.any():
            continue

        # Only reordering the columns (no column is removed). The explicit copy
        # makes the new columns below land on an independent frame instead of
        # on a selection of `df`, which avoids pandas' SettingWithCopyWarning.
        df = df[column_order].copy()

        # ----- Add auxiliary columns and metadata ------
        # Since it is a simple instant of time (without duration), the start and end time are the same
        df["accel-end-time"] = df["accel-start-time"]
        df["gyro-end-time"] = df["gyro-start-time"]
        # Add the activity code column
        df["activity code"] = activity_no
        # Add the index column (index of the sample in the CSV file)
        df["index"] = range(len(df))
        # Add the user column
        df["user"] = user
        # Add the serial column (the serial number of the activity)
        df["serial"] = serial
        # Add the csv column (the name of the CSV file)
        df["csv"] = "/".join(f.parts[-2:])
        # ----------------------------------------------------
        dfs.append(df)

    # pd.concat fails with an unhelpful "No objects to concatenate" on an empty
    # list, so report the actual problem (and the offending path) instead
    if not dfs:
        raise ValueError(
            f"No usable KuHar CSV file found under '{kuhar_dir_path}'"
        )
    return pd.concat(dfs)

In [26]:
# Reader function for each supported dataset, keyed by dataset name (the same
# key is used to look up the dataset folder in `dataset_paths`). A reader takes
# the dataset directory and returns its contents as a single DataFrame.
dataset_readers: Dict[str, Callable[..., pd.DataFrame]] = {
    "KuHar": read_kuhar,
}

In [27]:
# Dataset to load; the name is the lookup key for both the folder and the reader.
dataset = "KuHar"
# Raw files are looked up under data/original/<dataset folder>
path = Path(f"data/original/{dataset_paths[dataset]}")
reader = dataset_readers[dataset]
raw_dataset = reader(path)

In [35]:
# Sanity check of the load: (rows, columns) first, then the first five rows.
print(f"{raw_dataset.shape}")
raw_dataset.head(n=5)

(6299460, 15)


Unnamed: 0,accel-x,accel-y,accel-z,gyro-x,gyro-y,gyro-z,accel-start-time,gyro-start-time,accel-end-time,gyro-end-time,activity code,index,user,serial,csv
0,0.00423,0.005034,-0.020325,-4.3e-05,0.012474,-0.00088,0.003,0.005,0.003,0.005,0,0,1001,1,0.Stand/1001_A_1.csv
1,-0.00028,0.011153,-0.022526,-0.000654,0.011252,-0.001491,0.013,0.014,0.013,0.014,0,1,1001,1,0.Stand/1001_A_1.csv
2,-0.013218,0.026723,-0.023402,-0.002486,0.002089,-0.002101,0.021,0.023,0.021,0.023,0,2,1001,1,0.Stand/1001_A_1.csv
3,-0.006041,0.035018,-0.004268,-0.00493,0.005143,-0.00088,0.031,0.033,0.031,0.033,0,3,1001,1,0.Stand/1001_A_1.csv
4,0.019178,0.045794,-0.002172,-0.007373,0.005754,-0.00088,0.041,0.043,0.041,0.043,0,4,1001,1,0.Stand/1001_A_1.csv


In [36]:
# Column names of the loaded data, as a plain Python list
list(raw_dataset.columns)


['accel-x',
 'accel-y',
 'accel-z',
 'gyro-x',
 'gyro-y',
 'gyro-z',
 'accel-start-time',
 'gyro-start-time',
 'accel-end-time',
 'gyro-end-time',
 'activity code',
 'index',
 'user',
 'serial',
 'csv']