In [33]:
import os, glob, random
import numpy as np
import pandas as pd

# =====================================================
# Parameters
# =====================================================

# Folder that load_backup() searches recursively for replacement recordings.
backup_folder = r"C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\DS_AF"

# Expected rows per second, keyed by the part of the file name that precedes
# "_e" (e.g. "phone_gravity_e0.csv" -> "phone_gravity"). Files whose prefix is
# not listed here are ignored by repair_file().
expected_rows = {
    # glass_* sensors
    "glass_accelerometer": 5,
    "glass_gyroscope": 5,
    "glass_magnetometer": 5,
    # phone_* sensors
    # NOTE(review): "acceleromter" is spelled this way in the original table;
    # presumably it mirrors the real file names - confirm before correcting.
    "phone_acceleromter_calibrated": 100,
    "phone_accelerometer": 500,
    "phone_gravity": 200,
    "phone_linear_acceleration": 100,
    "phone_gyroscope_uncalibrated": 500,
    "phone_gyroscope": 500,
    "phone_magnetometer_uncalibrated": 100,
    "phone_magnetometer": 100,
    "phone_interrupt_gyroscope": 100,
    # watch_* sensors
    "watch_accelerometer": 100,
    "watch_gyroscope_uncalibrated": 200,
    "watch_linear_acceleration": 100,
    "watch_gyroscope": 100,
    "watch_magnetometer_uncalibrated": 100,
    "watch_magnetometer": 100,
    "watch_gravity": 100,
}

# Activity folders to process; process_dataset() compares the lower-cased
# folder name against this set and skips everything else.
selected_activities = {
    "bending",
    "standing_up_from_sitting",
    "sitting_down_from_standing",
    "squatting",
    "open_door",
    "close_door",
    "put_on_floor",
    "pick_from_floor",
    "laying_down_from_sitting",
    "standing_up_from_laying",
    "open_bag",
    "open_big_box",
    "close_lid_by_rotation",
    "plugin",
    "throw_out",
    "eat_small_things",
    "drink_water",
    "fall_forward",
    "fall_right",
    "fall_backward",
    "fall_left",
    "fall_forward_when_trying_to_sit_down",
    "fall_backward_while_trying_to_sit_down",
    "fall_forward_while_trying_to_stand_up",
    "fall_backward_while_trying_to_stand_up",
}

# One list per logged file; turned into repair_log.csv by the driver cell.
report_rows = []

# Seconds of data every file must contain (required rows = rate * MIN_FACTOR).
MIN_FACTOR = 4
# Files holding at least this fraction of the required rows are padded;
# anything shorter is replaced by a noisy backup recording.
THRESHOLD = 0.7
# Noise standard deviation as a fraction of each column's own standard deviation.
SIGMA_FACTOR = 0.001


# =====================================================
# Helper functions
# =====================================================
def load_backup(sensor_name, activity):
    """Return the path of one randomly chosen backup CSV, or None.

    Searches ``backup_folder`` recursively for files named
    ``<sensor_name>_e*.csv`` that sit directly inside a folder called
    ``activity``. When nothing matches, None is returned instead of raising.
    """
    candidates = glob.glob(
        os.path.join(backup_folder, "**", activity, f"{sensor_name}_e*.csv"),
        recursive=True,
    )
    if not candidates:
        return None
    return random.choice(candidates)


def add_noise(df, sigma_factor=SIGMA_FACTOR):
    """Return a copy of ``df`` with Gaussian noise added to the x/y/z columns.

    For every column among "x", "y", "z" that exists in ``df``, zero-mean
    normal noise with standard deviation ``df[col].std() * sigma_factor`` is
    added. The input frame is not modified; all other columns are copied
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sensor columns.
    sigma_factor : float
        Fraction of each column's standard deviation used as the noise scale.

    Returns
    -------
    pandas.DataFrame
        Noisy copy of ``df``.
    """
    noisy = df.copy()
    for col in ("x", "y", "z"):
        if col not in df.columns:
            continue
        sigma = df[col].std() * sigma_factor
        # Series.std() is NaN for fewer than two (non-NaN) values; drawing
        # noise with a NaN scale would overwrite the whole column with NaN,
        # so such columns are left as they are.
        if np.isnan(sigma):
            continue
        noisy[col] = df[col] + np.random.normal(0, sigma, size=len(df))
    return noisy


def _read_xyz_csv(csv_path):
    """Read a header-less CSV and return its first four columns as timestamp/x/y/z."""
    frame = pd.read_csv(csv_path, header=None)
    if frame.shape[1] < 4:
        raise ValueError(f"File {os.path.basename(csv_path)} has less than 4 columns")
    frame = frame.iloc[:, :4].copy()
    frame.columns = ["timestamp", "x", "y", "z"]
    return frame


def _fit_to_row_count(frame, n_rows):
    """Trim ``frame`` to ``n_rows`` rows, or repeat its rows cyclically to reach ``n_rows``."""
    if frame.empty:
        # Nothing to repeat; the old padding loop would have spun forever here.
        raise ValueError("cannot pad a file that has no rows")
    row_ids = np.arange(n_rows) % len(frame)
    return frame.iloc[row_ids].reset_index(drop=True)


def repair_file(path, activity):
    """Bring one sensor CSV to exactly ``expected_rows[sensor] * MIN_FACTOR`` rows, in place.

    The sensor name is the part of the file name before the first "_e"; files
    whose sensor is not in ``expected_rows`` are silently ignored. Depending on
    how many rows the file has relative to the required count:

    * at least the required count: rows beyond the required count are dropped;
    * at least ``THRESHOLD`` of the required count: the file is padded by
      repeating its own leading rows;
    * fewer: the content is replaced by a random backup recording of the same
      sensor and activity (see ``load_backup``) with noise added
      (see ``add_noise``).

    In all three cases the timestamp column is regenerated and the file at
    ``path`` is overwritten without a header. Padded and replaced files are
    appended to ``report_rows`` and printed; files that were already long
    enough are rewritten without being logged.

    Parameters
    ----------
    path : str
        CSV file to repair; it is overwritten.
    activity : str
        Activity folder name, used to locate a matching backup.

    Returns
    -------
    None
        Any exception is caught, printed and recorded in ``report_rows`` as
        an ``error:...`` row; the file is left untouched in that case.
    """
    fname = os.path.basename(path)
    sensor = fname.split("_e")[0]  # e.g. glass_accelerometer
    exp_rows = expected_rows.get(sensor)
    if exp_rows is None:
        return

    try:
        df = _read_xyz_csv(path)
        # Row count of the file as found on disk; this is what gets logged,
        # also when the content is later replaced by a backup.
        rows_before = len(df)

        target_rows = exp_rows * MIN_FACTOR
        threshold_rows = int(target_rows * THRESHOLD)

        sigma, backup_src = "NA", "NA"
        if rows_before >= target_rows:
            action = "valid_can_be_trimmed"
        elif rows_before >= threshold_rows:
            # The actual padding happens in _fit_to_row_count below.
            action = f"padded_{target_rows - rows_before}"
        else:
            backup_src = load_backup(sensor, activity)
            if not backup_src:
                raise FileNotFoundError(f"No backup found for {sensor} in {activity}")
            # The backup goes through the same column validation as the
            # original file, so a malformed backup yields a clear message.
            df = add_noise(_read_xyz_csv(backup_src))
            action, sigma = "backup_replaced", SIGMA_FACTOR

        # Ensure the exact number of rows (trim or cyclic padding).
        df = _fit_to_row_count(df, target_rows)

        # Regenerate evenly spaced timestamps starting at the first one.
        # NOTE(review): the span is 5000 while the row count corresponds to
        # MIN_FACTOR seconds of data - confirm the timestamp unit and whether
        # 5000 is the intended span.
        start = df["timestamp"].iloc[0]
        df["timestamp"] = np.linspace(start, start + 5000, target_rows)

        # Save back without header
        df.to_csv(path, index=False, header=False)

        # Logging (files that were already long enough are not reported)
        if action != "valid_can_be_trimmed":
            report_rows.append([path, rows_before, len(df), action, sigma, backup_src])
            print(f"✅ {action}: {path}")

    except Exception as e:
        report_rows.append([path, "NA", "NA", f"error:{e}", "NA", "NA"])
        print(f"❌ Error on {path}: {e}")

# =====================================================
# Main dataset processing
# =====================================================
def process_dataset(base):
    """Run ``repair_file`` on every ``*_e*.csv`` under ``base/<subject>/<activity>/``.

    Entries directly under ``base`` that are not directories are ignored.
    Inside each subject directory, every entry whose lower-cased name is not in
    ``selected_activities`` is reported with a "skipped activity" line; the
    remaining entries are processed only if they are directories.
    """
    for subject in os.listdir(base):
        subject_dir = os.path.join(base, subject)
        if os.path.isdir(subject_dir):
            for activity in os.listdir(subject_dir):
                if activity.lower() in selected_activities:
                    activity_dir = os.path.join(subject_dir, activity)
                    if os.path.isdir(activity_dir):
                        csv_pattern = os.path.join(activity_dir, "*_e*.csv")
                        for csv_path in glob.glob(csv_pattern):
                            # The folder name is passed with its original
                            # casing, as load_backup() uses it in a path.
                            repair_file(csv_path, activity)
                else:
                    print("skipped activity", activity)


In [35]:
# Dataset to repair in place; expected layout is <base>/<subject>/<activity>/*.csv
base = r"C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data"
if not os.path.exists(base):
    print("❌ Path does not exist.")
else:
    # Start from an empty log: report_rows is a kernel-wide list, so without
    # this a second run of the cell (or rows left behind by another cell that
    # uses the same name) would end up in the saved log as well.
    report_rows.clear()
    process_dataset(base)
    # Save log next to the data, derived from `base` so the two cannot drift apart
    repair_log_path = os.path.join(base, "repair_log.csv")
    pd.DataFrame(report_rows, columns=[
        "filepath","rows_before","rows_after","action","noise_sigma","backup_source"
    ]).to_csv(repair_log_path, index=False)
    print("\n📑 Repair log saved to repair_log.csv")

✅ padded_123: C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\areeba\bending\phone_gravity_e0.csv
✅ padded_134: C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\areeba\bending\phone_gravity_e5.csv
❌ Error on C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\areeba\bending\phone_interrupt_gyroscope_e0.csv: No backup found for phone_interrupt_gyroscope in bending
❌ Error on C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\areeba\bending\phone_interrupt_gyroscope_e1.csv: No backup found for phone_interrupt_gyroscope in bending
❌ Error on C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\areeba\bending\phone_interrupt_gyroscope_e2.csv: No backup found for phone_interrupt_gyroscope in bending
❌ Error on C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\areeba\bending\phone_interrupt_gyroscope_e3.csv: No backup found for phone_interrupt_gyroscope in bending
❌ Error on C:\Users\Malik Haider\Doc

In [31]:
import os, glob
import pandas as pd

# =====================================================
# Sensor expected rates (samples per second)
# =====================================================
# Keyed by the part of the file name that precedes "_e".
expected_rows = {
    # glass_* sensors
    "glass_accelerometer": 5,
    "glass_gyroscope": 5,
    "glass_magnetometer": 5,
    # phone_* sensors
    # NOTE(review): "acceleromter" is spelled this way in the original table;
    # presumably it mirrors the real file names - confirm before correcting.
    "phone_acceleromter_calibrated": 100,
    "phone_accelerometer": 500,
    "phone_gravity": 200,
    "phone_linear_acceleration": 100,
    "phone_gyroscope_uncalibrated": 500,
    "phone_gyroscope": 500,
    "phone_magnetometer_uncalibrated": 100,
    "phone_magnetometer": 100,
    "phone_interrupt_gyroscope": 100,
    # watch_* sensors
    "watch_accelerometer": 100,
    "watch_gyroscope_uncalibrated": 200,
    "watch_linear_acceleration": 100,
    "watch_gyroscope": 100,
    "watch_magnetometer_uncalibrated": 100,
    "watch_magnetometer": 100,
    "watch_gravity": 100,
}

# Minimum rows required = rate * 4 seconds
min_required = {
    sensor_name: rate * 4
    for sensor_name, rate in expected_rows.items()
}

# One list per flagged file; written to the log CSV after the scan.
report_rows = []

# =====================================================
# Function to check one file
# =====================================================
def check_file(path):
    """Delete a sensor CSV that is too short or unparseable, and log it.

    The sensor name is the part of the file name before the first "_e"; files
    whose sensor is not in ``min_required`` are ignored. A file is deleted when

    * it parses but has fewer than ``min_required[sensor]`` rows, or
    * pandas cannot parse it (empty file, tokenizer error, undecodable bytes).

    Any other failure while reading (for example an OS-level error or running
    out of memory) is logged but the file is kept, because such a failure says
    nothing about the file's content. Every flagged file gets a row in
    ``report_rows``.

    WARNING: this function permanently removes files from disk.

    Parameters
    ----------
    path : str
        CSV file to check.
    """
    fname = os.path.basename(path)
    sensor = fname.split("_e")[0]
    min_rows = min_required.get(sensor)
    if min_rows is None:
        return

    should_delete = False
    try:
        rows = len(pd.read_csv(path, header=None))
        if rows < min_rows:
            should_delete = True
            report_rows.append([path, sensor, rows, min_rows])
            print(f"⚠️ {path} has {rows} rows (needs at least {min_rows})")
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
        # The content itself is unusable, so the file is removed.
        should_delete = True
        report_rows.append([path, sensor, "error", str(e)])
        print(f"❌ Error reading {path}: {e}")
    except Exception as e:
        # Not evidence of a bad file: record it, but do not destroy data.
        report_rows.append([path, sensor, "error", str(e)])
        print(f"❌ Error reading {path}: {e}")

    if should_delete:
        try:
            os.remove(path)
        except OSError as e:
            # A failed deletion must not abort the scan of the remaining files.
            print(f"❌ Could not delete {path}: {e}")

# =====================================================
# Main loop
# =====================================================
# Root of the dataset to scan. check_file() deletes files, so point this at
# the right folder before running the cell.
base = r"C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\DS_AF"  # change to your dataset path

# glob.glob returns the complete list of matches before the loop starts.
csv_pattern = base + r"\**\*_e*.csv"
matching_files = glob.glob(csv_pattern, recursive=True)
for file in matching_files:
    check_file(file)

# Save log
log_path = r"C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\DS_AF\low_sample_log.csv"
log_frame = pd.DataFrame(
    report_rows,
    columns=["filepath","sensor","rows","min_required"],
)
log_frame.to_csv(log_path, index=False)

print(f"\n✅ Log saved to {log_path}")



✅ Log saved to C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\DS_AF\low_sample_log.csv
