"""
Implements the filling/synchronization plan exactly as specified.

Key behaviors:
 - processes CSVs that DO NOT contain "_e" in filename
 - timestamps are epoch milliseconds (int)
 - CSVs have no header; columns: timestamp,x,y,z
 - shift-forward alignment using reference sensors (accelerometer/gyroscope/magnetometer)
 - append copies of earliest data repeatedly until ~180s (180000 ms)
 - trim to exactly target (within tolerance)
 - sort non-monotonic timestamps, fix duplicates (+1 ms)
 - skip files < 10s, log them
 - trim files > 180s and log them in Already_OK.txt
 - write single report.csv with required columns
 - supports a dry-run mode (the `dry_run` flag) to preview without overwriting sensor files
"""

In [1]:
import os
import sys
import argparse
import math
import csv
from glob import glob
from collections import defaultdict
import pandas as pd

In [3]:
# Root folder of the dataset: subject folders sit directly below it and
# activity folders one level further down.
base_path = r"C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data"
dry_run = False  # set to True to preview without overwriting any sensor CSV

In [5]:
# -------------------------
# USER-CONFIG: update this dictionary with sensor names and their rates (Hz)
# Only sensors listed here will be synchronized/processed.
# Match is substring-based (case-insensitive) against filenames.
# NOTE(review): "phone_acceleromter_calibrated" is spelled without the second
# "e" of "accelerometer"; it only matches files spelled the same way on disk.
# Confirm against the real filenames before correcting it.
sensor_rates = {
    "glass_accelerometer": 5,
    "glass_gyroscope": 5,
    "glass_magnetometer": 5,
    "phone_acceleromter_calibrated" : 100,
    "phone_accelerometer": 500,
    "phone_gravity": 200,
    "phone_linear_acceleration": 100,
    "phone_gyroscope_uncalibrated": 500,
    "phone_gyroscope": 500,
    "phone_magnetometer_uncalibrated": 100,
    "phone_magnetometer":  100,
    "phone_interrupt_gyroscope": 100,
    "watch_accelerometer": 100,
    "watch_gyroscope_uncalibrated": 200,
    "watch_linear_acceleration": 100,
    "watch_gyroscope": 100,
    "watch_magnetometer_uncalibrated": 100,
    "watch_magnetometer": 100,
    "watch_gravity": 100,
}
# -------------------------

# Constants from the plan
# Target duration of every output file, in milliseconds.
# NOTE(review): the module docstring describes a 180 s (180000 ms) target,
# while this value is 170 s -- confirm which one is intended.
TARGET_MS = 170_000
MIN_DURATION_MS = 10_000  # files shorter than this are skipped and logged
# Acceptable deviation from the target, in milliseconds (currently ±2000 ms).
# NOTE(review): not referenced by the functions visible in this file.
TOLERANCE_MS = 2000
AVG_DELTA_N = 5  # number of trailing timestamp differences averaged for the sampling interval
ROWCOUNT_MISMATCH_THRESHOLD = 0.03  # 3%

# Reference categories for computing reference_start
REFERENCE_KEYWORDS = ("accelerometer", "gyroscope", "magnetometer")

# Log filenames
REPORT_FILENAME = "report.csv"
MIN_LENGTH_FILENAME = "min_length.txt"
ALREADY_OK_FILENAME = "Already_OK.txt"

# Report columns (header and column order of report.csv)
REPORT_COLS = [
    "file_path",
    "sensor_name",
    "original_rows",
    "final_rows",
    "original_start_ms",
    "original_end_ms",
    "final_start_ms",
    "final_end_ms",
    "original_duration_ms",
    "final_duration_ms",
    "sampling_rate_hz",
    "expected_final_rows",
    "action_taken",
    "warnings",
]

# Activity folder names to process; compared against the lower-cased folder
# name, so entries must be lower case. An empty list disables the filter.
ALLOWED_ACTIVITIES = [
    "quick_walk",
    "jogging",
    "laying",
    "reading",
    "sitting",
    "slow_walk",
    "standing",
    "talk_using_phone",
    "typing",
    "walking",
    "clean_the_table",
]  # only process these


In [7]:
def read_csv_no_header(path):
    """
    Read a headerless sensor CSV into a cleaned ``timestamp, x, y, z`` frame.

    Only the first four columns are read. Cleaning steps, in order:
    - rows that are entirely empty are dropped
    - the timestamp is coerced to numeric; rows where that fails are dropped
    - if the largest timestamp is below 1e11 the column is treated as seconds
      and multiplied by 1000; timestamps are then rounded to int64
    - x, y, z are coerced to numeric; rows where all three are NaN are dropped

    Parameters:
        path: path of the CSV file to read.

    Returns:
        (df, stats): the cleaned frame with a fresh 0..n-1 index, and a dict
        holding the row counts before/after each drop plus ``read_warnings``.

    Raises:
        RuntimeError: if reading or cleaning fails; the underlying exception
        is attached as ``__cause__``.
    """
    stats = {
        "rows_total": 0,
        "rows_after_drop_ts": 0,
        "rows_after_drop_axes": 0,
        "rows_dropped_invalid_timestamp": 0,
        "rows_dropped_all_axes": 0,
        "read_warnings": ""
    }
    try:
        df = pd.read_csv(
            path,
            header=None,
            usecols=[0, 1, 2, 3],
            names=["timestamp", "x", "y", "z"],
            skipinitialspace=True,
            engine="python",
            na_values=["", "NA", "nan", "NaN"]
        )

        df = df.dropna(how="all")
        stats["rows_total"] = len(df)
        if df.empty:
            stats["rows_after_drop_ts"] = 0
            stats["rows_after_drop_axes"] = 0
            return df.reset_index(drop=True), stats

        # Coerce timestamp to numeric (float), drop rows that fail
        df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")
        before = len(df)
        df = df[df["timestamp"].notna()].copy()
        stats["rows_dropped_invalid_timestamp"] = before - len(df)
        stats["rows_after_drop_ts"] = len(df)

        if df.empty:
            stats["read_warnings"] = "all_rows_invalid_timestamps"
            return df.reset_index(drop=True), stats

        # Heuristic: values below 1e11 are taken to be seconds, not milliseconds
        max_ts = float(df["timestamp"].max())
        if max_ts < 1e11:
            df["timestamp"] = df["timestamp"] * 1000.0

        df["timestamp"] = df["timestamp"].round().astype("int64")

        # Coerce axes to numeric
        for axis in ("x", "y", "z"):
            df[axis] = pd.to_numeric(df[axis], errors="coerce")

        before_axes = len(df)
        # drop rows where all x,y,z are NaN
        df = df.dropna(how="all", subset=["x", "y", "z"])
        stats["rows_dropped_all_axes"] = before_axes - len(df)
        stats["rows_after_drop_axes"] = len(df)

        return df.reset_index(drop=True), stats

    except Exception as e:
        # Chain the cause so the original traceback stays available to callers.
        raise RuntimeError(f"Failed to read {path}: {e}") from e


In [9]:
def list_activity_folders(base_path):
    """
    Yield every activity directory found at ``<base_path>/<subject>/<activity>``.

    Plain files at either level are ignored; only directories are yielded.
    """
    subject_dirs = [
        entry for entry in glob(os.path.join(base_path, "*")) if os.path.isdir(entry)
    ]
    for subject_dir in subject_dirs:
        yield from [
            entry
            for entry in glob(os.path.join(subject_dir, "*"))
            if os.path.isdir(entry)
        ]


def find_sensor_files_in_folder(folder, sensor_dict):
    """
    Map each configured sensor to the CSV file in ``folder`` that matches it.

    A file matches a sensor when the sensor key occurs (case-insensitively) as
    a substring of the file name. When several keys match the same file, the
    longest key wins, so ``x_gyroscope_uncalibrated.csv`` is assigned to
    ``x_gyroscope_uncalibrated`` rather than ``x_gyroscope`` regardless of the
    order of ``sensor_dict``. Files whose lower-cased name contains ``_e`` are
    skipped. If several files match the same sensor, the last one seen wins.

    Parameters:
        folder: directory to scan for ``*.csv`` files.
        sensor_dict: mapping whose keys are the sensor names to look for.

    Returns:
        dict mapping sensor key -> file path for the sensors found.
    """
    files = glob(os.path.join(folder, "*.csv"))
    lower_to_path = {os.path.basename(fp).lower(): fp for fp in files}

    # Longest keys first; sorted() is stable, so equal-length keys keep the
    # order they have in sensor_dict.
    keys_by_specificity = sorted(sensor_dict, key=len, reverse=True)

    mapping = {}
    for fname_lower, path in lower_to_path.items():
        if "_e" in fname_lower:
            continue  # skip files containing _e
        for sensor_key in keys_by_specificity:
            if sensor_key.lower() in fname_lower:
                mapping[sensor_key] = path
                break
    return mapping


def sort_and_fix_duplicates(df, actions, warnings):
    """
    Return ``df`` with strictly increasing timestamps.

    If the timestamps are not monotonically increasing, the rows are sorted
    with a stable sort so rows sharing a timestamp keep their file order.
    Any timestamp that is then not greater than its predecessor is moved to
    ``predecessor + epsilon``, where epsilon is one tenth of the sampling
    interval reported by ``compute_avg_delta_ms_from`` (at least 1 ms).

    Parameters:
        df: frame with an integer ``timestamp`` column.
        actions: list that receives ``"sorted_timestamps"`` and/or
            ``"duplicates_fixed"`` when the corresponding step changed data.
        warnings: list that receives the matching warning strings.

    Returns:
        The corrected frame. The frame passed in is never modified; it is
        returned unchanged when nothing had to be fixed.
    """
    if not df["timestamp"].is_monotonic_increasing:
        # mergesort is stable: ties stay in their original order instead of
        # being shuffled by the default quicksort.
        df = df.sort_values("timestamp", kind="mergesort").reset_index(drop=True)
        actions.append("sorted_timestamps")
        warnings.append("non_monotonic_sorted")

    # Compute adaptive epsilon based on average delta (at least 1 ms)
    avg_delta = compute_avg_delta_ms_from(df)
    epsilon = max(1, int(round(avg_delta / 10.0)))

    # astype() copies, so edits to ts do not touch df until assigned below.
    ts = df["timestamp"].values.astype("int64")
    changed = False
    for i in range(1, len(ts)):
        # ts[i - 1] is already corrected, so a run of duplicates cascades.
        if ts[i] <= ts[i - 1]:
            ts[i] = ts[i - 1] + epsilon
            changed = True

    if changed:
        df = df.copy()  # leave the caller's frame untouched
        df["timestamp"] = ts
        actions.append("duplicates_fixed")
        warnings.append(f"duplicates_fixed_epsilon_{epsilon}ms")
    return df



def compute_avg_delta_ms_from(df, n=AVG_DELTA_N):
    """
    Estimate the sampling interval of ``df`` in milliseconds.

    The estimate is the mean of the last ``n`` differences between
    consecutive timestamps, truncated to an integer. Returns 1 when ``df`` is
    None, has fewer than two rows, or the estimate would be below 1.
    """
    if df is None or len(df) < 2:
        return 1

    deltas = df["timestamp"].diff().dropna().astype("int64").values
    if len(deltas) == 0:
        return 1

    n = min(n, len(deltas))
    tail_mean = int(deltas[-n:].mean())
    return tail_mean if tail_mean > 1 else 1



def shift_forward_timestamps(df, shift_ms):
    """
    Return a copy of ``df`` whose timestamps are moved forward by ``shift_ms``.

    The shift is truncated to an integer. A shift of zero or less is a no-op
    and returns ``df`` itself rather than a copy.
    """
    if shift_ms > 0:
        shifted = df.copy()
        shifted["timestamp"] = shifted["timestamp"] + int(shift_ms)
        return shifted
    return df

In [11]:
def append_chunks_until_target(df, target_ms, actions, warnings):
    """
    Extend ``df`` by repeating its own rows until it spans ``target_ms``.

    The frame passed in is the source for every appended chunk. Each chunk is
    the leading part of the source (all of it, or only the rows whose offset
    from the first row is below the remaining gap) and is placed one sampling
    interval, as reported by ``compute_avg_delta_ms_from``, after the current
    last row. The result may overshoot ``target_ms``.

    Appends ``"appended_copied"`` to ``actions`` once appending starts, and a
    warning to ``warnings`` when the source is empty, spans zero time, or a
    chunk window selects no rows. Returns the extended frame.
    """
    df = df.copy().reset_index(drop=True)
    source = df.copy().reset_index(drop=True)
    if source.empty:
        warnings.append("orig_empty_no_append")
        return df

    source_start = int(source["timestamp"].iloc[0])
    offsets = (source["timestamp"].astype("int64") - source_start).astype("int64").values
    source_span = int(offsets[-1]) if len(offsets) > 0 else 0
    if source_span <= 0:
        warnings.append("zero_file_duration_can_not_append")
        return df

    actions.append("appended_copied")

    # Spacing between the current tail and each appended chunk
    step = compute_avg_delta_ms_from(source)
    # Fallback increment used only if a proposed timestamp would not advance
    nudge = max(1, int(round(step / 10.0)))

    while True:
        covered = int(df["timestamp"].iloc[-1]) - int(df["timestamp"].iloc[0])
        if not covered < target_ms:
            break

        remaining = target_ms - covered
        window = source_span if source_span <= remaining else remaining

        selected = offsets < window
        if not selected.any():
            warnings.append("no_rows_in_append_window")
            break

        piece = source.loc[selected].copy().reset_index(drop=True)
        tail = int(df["timestamp"].iloc[-1])
        anchor = tail + int(step)

        stamps = []
        for off in offsets[selected]:
            candidate = anchor + int(off)
            if candidate <= tail:
                candidate = tail + nudge
            stamps.append(candidate)
            tail = candidate

        piece["timestamp"] = stamps
        df = pd.concat([df, piece], ignore_index=True)

    return df


In [33]:
def trim_to_target(df, target_ms, actions, warnings):
    """
    Drop every row later than ``first_timestamp + target_ms``.

    When the last timestamp exceeds that cutoff, a re-indexed copy without
    the late rows is returned and ``"trimmed_to_target"`` is appended to
    ``actions``. Otherwise ``df`` itself is returned and ``actions`` is left
    alone. ``warnings`` is accepted but not used.
    """
    cutoff = int(df["timestamp"].iloc[0]) + target_ms
    if not df["timestamp"].iloc[-1] > cutoff:
        return df

    kept = df.loc[df["timestamp"] <= cutoff].copy().reset_index(drop=True)
    actions.append("trimmed_to_target")
    return kept



def finalize_and_check(df, sensor_name, rate_hz, original_rows, original_start, original_end, actions, warnings, read_stats=None):
    """
    Force ``df`` to span exactly TARGET_MS and build its report row.

    Rows later than ``first_timestamp + TARGET_MS`` are dropped. If the last
    remaining timestamp is not exactly ``first_timestamp + TARGET_MS`` it is
    overwritten with that value (a one-row frame gets a copy of its row
    appended at that timestamp instead), and
    ``"enforce_exact_target_last_ts"`` is appended to ``actions``.

    Parameters:
        df: non-empty frame with an integer ``timestamp`` column, sorted
            ascending.
        sensor_name: sensor key written to the report.
        rate_hz: configured sampling rate, or a falsy value if unknown.
        original_rows, original_start, original_end: statistics of the input
            file as determined by the caller.
        actions: list of action names so far; extended in place.
        warnings: list of warning strings so far.
        read_stats: optional stats dict returned by ``read_csv_no_header``.

    Returns:
        (df, report_row). ``report_row["file_path"]`` is None and must be
        filled in by the caller. The frame passed in is not modified.
    """
    # Fresh 0..n-1 index: makes the positional edits below safe and keeps the
    # caller's frame untouched.
    df = df.reset_index(drop=True)

    first_ts = int(df["timestamp"].iloc[0])
    desired_last = first_ts + TARGET_MS

    # If last timestamp is greater than desired, trim rows > desired_last
    if int(df["timestamp"].iloc[-1]) > desired_last:
        df = df[df["timestamp"] <= desired_last].copy().reset_index(drop=True)
        if "trimmed_to_target" not in actions:
            actions.append("trimmed_to_target")

    # Ensure the last timestamp equals desired_last exactly.
    last_ts = int(df["timestamp"].iloc[-1])
    if last_ts != desired_last:
        if len(df) >= 2:
            # Keep the series strictly increasing if the second-to-last row
            # already sits at or beyond the forced end.
            if int(df["timestamp"].iloc[-2]) >= desired_last:
                df.at[len(df) - 2, "timestamp"] = desired_last - 1
            df.at[len(df) - 1, "timestamp"] = desired_last
        else:
            # Single row: append a copy of it at desired_last. Slicing with
            # [[0]] keeps the column dtypes, so timestamps stay integers.
            tail = df.iloc[[0]].copy()
            tail["timestamp"] = desired_last
            df = pd.concat([df, tail], ignore_index=True)

        actions.append("enforce_exact_target_last_ts")

    # Recompute final stats
    final_rows = len(df)
    final_start = int(df["timestamp"].iloc[0])
    final_end = int(df["timestamp"].iloc[-1])
    final_duration = final_end - final_start

    # "is not None" rather than truthiness: a start of 0 ms is a valid value.
    have_bounds = original_start is not None and original_end is not None
    original_duration = (original_end - original_start) if have_bounds else "NA"

    # Rate estimated from the input file itself
    original_estimated_rate = None
    if have_bounds and original_duration > 0:
        original_estimated_rate = original_rows / (original_duration / 1000.0)

    expected_final_rows = round(rate_hz * (TARGET_MS / 1000.0)) if rate_hz else None

    # NOTE: no row-count mismatch warning is emitted here.

    stats = read_stats if read_stats else {}
    report_row = {
        "file_path": None,  # filled by caller
        "sensor_name": sensor_name,
        "original_rows": original_rows,
        "final_rows": final_rows,
        "original_start_ms": original_start,
        "original_end_ms": original_end,
        "final_start_ms": final_start,
        "final_end_ms": final_end,
        "original_duration_ms": original_duration,
        "final_duration_ms": final_duration,
        "sampling_rate_hz": rate_hz if rate_hz else "NA",
        "expected_final_rows": expected_final_rows if expected_final_rows is not None else "NA",
        "action_taken": ",".join(actions) if actions else "none",
        "warnings": ";".join(warnings) if warnings else "NA",
        # debug fields
        "rows_dropped_invalid_timestamp": stats.get("rows_dropped_invalid_timestamp", 0),
        "rows_dropped_all_axes": stats.get("rows_dropped_all_axes", 0),
        "original_estimated_rate_hz": round(original_estimated_rate, 2) if original_estimated_rate is not None else "NA",
        "read_warnings": stats.get("read_warnings", ""),
    }
    return df, report_row


In [19]:
def _expected_rows_at_target(rate_hz):
    """Return the row count expected over TARGET_MS at ``rate_hz``, or "NA" without a rate."""
    return round(rate_hz * (TARGET_MS / 1000.0)) if rate_hz else "NA"


def _unprocessed_report_row(file_path, sensor_name, rate_hz, outcome, warning=None,
                            original_rows=0, original_start="NA", original_end="NA",
                            original_duration="NA"):
    """Build the report row for a sensor file that produced no output data."""
    return {
        "file_path": file_path,
        "sensor_name": sensor_name,
        "original_rows": original_rows,
        "final_rows": 0,
        "original_start_ms": original_start,
        "original_end_ms": original_end,
        "final_start_ms": "NA",
        "final_end_ms": "NA",
        "original_duration_ms": original_duration,
        "final_duration_ms": "NA",
        "sampling_rate_hz": rate_hz if rate_hz else "NA",
        "expected_final_rows": _expected_rows_at_target(rate_hz),
        "action_taken": outcome,
        "warnings": outcome if warning is None else warning,
    }


def _earliest_timestamps(paths):
    """Return the earliest timestamp of every readable, non-empty file in ``paths``."""
    starts = []
    for path in paths:
        try:
            df_tmp, _ = read_csv_no_header(path)
        except RuntimeError:
            # Unreadable files are reported as read_failed by the main loop.
            continue
        if not df_tmp.empty:
            # min(), not the first row: the file is only sorted later on.
            starts.append(int(df_tmp["timestamp"].min()))
    return starts


def process_activity_folder(folder, sensor_rates_map, dry_run, report_rows, min_length_paths, already_ok_paths):
    """
    Synchronize and length-normalize every configured sensor file in ``folder``.

    Steps:
    1. Locate the sensor files and add a ``missing_sensor`` report row for
       each configured sensor without one.
    2. Take as reference start the latest of the earliest timestamps among
       the files matching REFERENCE_KEYWORDS (all present files if none match).
    3. For each file: read, sort and de-duplicate timestamps, skip it if it
       spans less than MIN_DURATION_MS, shift it forward to the reference
       start, trim or extend it to TARGET_MS, then overwrite it unless
       ``dry_run`` is set.

    Parameters:
        folder: activity directory to process.
        sensor_rates_map: mapping sensor key -> sampling rate in Hz.
        dry_run: when true, nothing is written; the intended action is printed.
        report_rows: list extended with one report dict per sensor.
        min_length_paths: list extended with ``(path, duration_ms)`` for
            files skipped as too short.
        already_ok_paths: list extended with ``(path, duration_ms)`` for
            files longer than TARGET_MS.

    Returns:
        None. Returns early when the folder has no configured sensor file or
        no reference start can be determined.
    """
    # Find relevant sensor files in folder (skip _e)
    present_mapping = find_sensor_files_in_folder(folder, sensor_rates_map)

    # Report configured sensors that have no file in this folder
    for s in sensor_rates_map:
        if s not in present_mapping:
            report_rows.append(_unprocessed_report_row(
                os.path.join(folder, f"**missing**/{s}"),
                s,
                sensor_rates_map.get(s),
                "missing_sensor",
            ))

    present_sensors = {k: v for k, v in present_mapping.items() if k in sensor_rates_map}
    if not present_sensors:
        # nothing to do in this folder
        return

    # Reference start: prefer the reference categories, else fall back to
    # every present sensor.
    reference_paths = [
        path
        for sensor_key, path in present_sensors.items()
        if any(
            kw in sensor_key.lower() or kw in os.path.basename(path).lower()
            for kw in REFERENCE_KEYWORDS
        )
    ]
    reference_candidates = _earliest_timestamps(reference_paths)
    if not reference_candidates:
        reference_candidates = _earliest_timestamps(present_sensors.values())
    if not reference_candidates:
        # can't compute reference start; skip
        return

    # The latest start is used so files are only ever shifted forward.
    reference_start = max(reference_candidates)

    for sensor_name, filepath in present_sensors.items():
        actions = []
        warnings = []
        rate_hz = sensor_rates_map.get(sensor_name)

        try:
            df, read_stats = read_csv_no_header(filepath)
        except Exception as e:
            report_rows.append(_unprocessed_report_row(
                filepath, sensor_name, rate_hz, "read_failed", warning=str(e)
            ))
            continue

        if df.empty:
            report_rows.append(_unprocessed_report_row(
                filepath, sensor_name, rate_hz, "empty_file"
            ))
            continue

        # Sort & fix duplicates BEFORE computing original stats
        df = sort_and_fix_duplicates(df, actions, warnings)

        original_rows = len(df)
        original_start = int(df["timestamp"].iloc[0])
        original_end = int(df["timestamp"].iloc[-1])
        original_duration = original_end - original_start

        # Minimum duration check: skip and log in min_length.txt
        if original_duration < MIN_DURATION_MS:
            min_length_paths.append((filepath, original_duration))
            report_rows.append(_unprocessed_report_row(
                filepath,
                sensor_name,
                rate_hz,
                "skipped_too_short",
                original_rows=original_rows,
                original_start=original_start,
                original_end=original_end,
                original_duration=original_duration,
            ))
            continue

        # Shift-forward to align start to reference_start
        shift_ms = reference_start - original_start
        if shift_ms > 0:
            df = shift_forward_timestamps(df, shift_ms)
            actions.append("shifted_start")
            # The report shows the shifted start/end from here on.
            original_start = int(df["timestamp"].iloc[0])
            original_end = int(df["timestamp"].iloc[-1])

        current_duration = int(df["timestamp"].iloc[-1]) - int(df["timestamp"].iloc[0])

        if current_duration > TARGET_MS:
            # Longer than the target: trim and log in Already_OK.txt
            df = trim_to_target(df, TARGET_MS, actions, warnings)
            already_ok_paths.append((filepath, current_duration))
            dry_run_note = "would trim (longer) and overwrite"
        elif current_duration < TARGET_MS:
            # Shorter than the target: repeat earliest data, then trim overshoot
            df = append_chunks_until_target(df, TARGET_MS, actions, warnings)
            df = trim_to_target(df, TARGET_MS, actions, warnings)
            dry_run_note = "would append/trim and overwrite"
        else:
            dry_run_note = "file already exactly target"

        df, report = finalize_and_check(
            df, sensor_name, rate_hz, original_rows, original_start, original_end,
            actions, warnings, read_stats=read_stats,
        )
        report["file_path"] = filepath
        report_rows.append(report)

        if dry_run:
            print(f"[dry-run] {dry_run_note}: {filepath}")
        else:
            # overwrite file
            df.to_csv(filepath, index=False, header=False, float_format="%.6f")


In [21]:
def write_logs(base_path, report_rows, min_length_paths, already_ok_paths):
    """
    Write the report CSV and the two duration logs into ``base_path``.

    The report contains exactly the REPORT_COLS columns: keys missing from a
    row are written as "NA" and keys outside REPORT_COLS are ignored. The two
    text logs hold one ``path,duration`` line per entry. Existing files are
    overwritten.
    """
    with open(os.path.join(base_path, REPORT_FILENAME), "w", newline="", encoding="utf-8") as report_file:
        report_writer = csv.DictWriter(
            report_file,
            fieldnames=REPORT_COLS,
            restval="NA",
            extrasaction="ignore",
        )
        report_writer.writeheader()
        report_writer.writerows(report_rows)

    duration_logs = (
        (MIN_LENGTH_FILENAME, min_length_paths),
        (ALREADY_OK_FILENAME, already_ok_paths),
    )
    for log_name, entries in duration_logs:
        with open(os.path.join(base_path, log_name), "w", encoding="utf-8") as log_file:
            log_file.writelines(f"{path},{duration}\n" for path, duration in entries)

In [29]:
def driver(base_path, dry_run=False):
    """
    Process every allowed activity folder under ``base_path`` and write the logs.

    Exits with status 1 if ``base_path`` is not a directory. Folders whose
    lower-cased name is not in ALLOWED_ACTIVITIES are skipped (when that list
    is non-empty). A folder that raises is recorded as a ``folder_error``
    report row and processing continues. The report and log files are written
    even when ``dry_run`` is set.
    """
    if not os.path.isdir(base_path):
        print(f"Base path not found: {base_path}", file=sys.stderr)
        sys.exit(1)

    report_rows, min_length_paths, already_ok_paths = [], [], []

    # Columns reported as "NA" when a whole folder fails
    unknown_columns = (
        "original_rows",
        "final_rows",
        "original_start_ms",
        "original_end_ms",
        "final_start_ms",
        "final_end_ms",
        "original_duration_ms",
        "final_duration_ms",
        "sampling_rate_hz",
        "expected_final_rows",
    )

    for folder in list_activity_folders(base_path):
        if ALLOWED_ACTIVITIES and os.path.basename(folder).lower() not in ALLOWED_ACTIVITIES:
            continue  # activity not selected for processing

        try:
            process_activity_folder(
                folder, sensor_rates, dry_run, report_rows, min_length_paths, already_ok_paths
            )
        except Exception as e:
            # Record the failure and carry on with the next folder
            failure_row = {"file_path": folder, "sensor_name": "folder_error"}
            failure_row.update(dict.fromkeys(unknown_columns, "NA"))
            failure_row["action_taken"] = "folder_processing_failed"
            failure_row["warnings"] = str(e)
            report_rows.append(failure_row)

    # Logs are produced in dry-run mode as well
    write_logs(base_path, report_rows, min_length_paths, already_ok_paths)

    print("Processing complete.")
    print(f"Report written to: {os.path.join(base_path, REPORT_FILENAME)}")
    print(f"Min length list: {os.path.join(base_path, MIN_LENGTH_FILENAME)}")
    print(f"Already long / trimmed: {os.path.join(base_path, ALREADY_OK_FILENAME)}")
    if dry_run:
        print("Dry-run mode: no files were overwritten.")

In [31]:
# Run the pipeline on base_path; sensor CSVs are overwritten unless dry_run is True.
driver(base_path, dry_run)





















































































































































































































Processing complete.
Report written to: C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\report.csv
Min length list: C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\min_length.txt
Already long / trimmed: C:\Users\Malik Haider\Documents\HUMCARE\DATASET_FILLING\test_data\Already_OK.txt
