In [1]:
import os
import pandas as pd
import numpy as np

# Define directories
input_dir = "harth_txt_processed"  # Main dataset folder
cleaned_dir = "Harth_Cleaned"  # Folder for cleaned files
os.makedirs(cleaned_dir, exist_ok=True)

# Function to clean each dataset
def clean_dataset(file_path, save_path):
    try:
        # Read TXT file as DataFrame (assuming space-separated values)
        df = pd.read_csv(file_path, sep=" ", header=None, engine="python")

        print(f"\n🔹 Processing file: {os.path.basename(file_path)}")

        # Initial number of rows
        initial_rows = df.shape[0]
        print(f"📌 Initial number of rows: {initial_rows}")
        print("📌 Original Data Sample:\n", df.head(), "\n")

        ### 1️⃣ Handling Missing Values ###
        missing_values_before = df.isnull().sum().sum()
        print(f"🔍 Missing values before: {missing_values_before}")

        # Fill numerical columns with median, categorical with mode
        for col in df.columns:
            if df[col].dtype == 'O':  # Categorical
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:  # Numerical
                df[col].fillna(df[col].median(), inplace=True)

        missing_values_after = df.isnull().sum().sum()
        print(f"✅ Missing values after: {missing_values_after}")
        print("📌 Data After Handling Missing Values:\n", df.head(), "\n")

        ### 2️⃣ Removing Redundant Values (Duplicates) ###
        duplicate_rows = df.duplicated().sum()
        print(f"🔍 Duplicate rows found: {duplicate_rows}")
        df.drop_duplicates(inplace=True)
        after_duplicate_rows = df.shape[0]
        print(f"✅ Rows after duplicate removal: {after_duplicate_rows}")
        print("📌 Data After Removing Duplicates:\n", df.head(), "\n")

        ### 3️⃣ Fixing Structural Errors (Trimming & Lowercasing) ###
        df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
        print("📌 Data After Fixing Structural Errors:\n", df.head(), "\n")

        ### 4️⃣ Handling Outliers (IQR Method) ###
        outliers_removed = 0
        for col in df.select_dtypes(include=[np.number]).columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            before_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].shape[0]
            outliers_removed += before_outliers
            
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

        after_outliers_rows = df.shape[0]
        print(f"🔍 Outliers removed: {outliers_removed}")
        print(f"✅ Rows after outlier removal: {after_outliers_rows}")
        print("📌 Data After Removing Outliers:\n", df.head(), "\n")

        ### 5️⃣ Data Imputation (Re-check for missing values) ###
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                if df[col].dtype == 'O':
                    df[col].fillna(df[col].mode()[0], inplace=True)
                else:
                    df[col].fillna(df[col].median(), inplace=True)

        ### 6️⃣ Removing Unwanted Observations ###
        before_removal = df.shape[0]
        df = df[~df.apply(lambda row: row.astype(str).str.contains("unknown|error", case=False).any(), axis=1)]
        removed_obs = before_removal - df.shape[0]
        after_removal_rows = df.shape[0]
        print(f"🔍 Unwanted observations removed: {removed_obs}")
        print(f"✅ Rows after removing unwanted observations: {after_removal_rows}")
        print("📌 Data After Removing Unwanted Observations:\n", df.head(), "\n")

        # Final row count
        final_rows = df.shape[0]
        print(f"📌 Final number of rows: {final_rows} (Rows Removed: {initial_rows - final_rows})")

        # Save cleaned data
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_csv(save_path, sep=" ", index=False, header=False)

        print(f"✅ Cleaned data saved: {save_path}")

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")

# Traverse harth_txt_processed directories and clean all text files
for root, dirs, files in os.walk(input_dir):
    for file in files:
        if file.endswith(".txt"):
            input_path = os.path.join(root, file)
            relative_path = os.path.relpath(input_path, input_dir)  # Preserve directory structure
            output_path = os.path.join(cleaned_dir, relative_path)
            clean_dataset(input_path, output_path)

print("\n✅ Data cleaning completed for all Harth processed files.")



🔹 Processing file: S008.txt
📌 Initial number of rows: 418989
📌 Original Data Sample:
                          0         1         2         3  4
0  2019-01-12 00:00:00.000 -1.337773 -0.395162  0.288326  6
1  2019-01-12 00:00:00.020 -0.984473 -0.021598  0.644522  6
2  2019-01-12 00:00:00.040 -1.635857  0.230325 -0.192113  6
3  2019-01-12 00:00:00.060 -0.363280  0.372247 -0.381429  6
4  2019-01-12 00:00:00.080  0.008470  0.468494 -0.465681  6 

🔍 Missing values before: 0
✅ Missing values after: 0
📌 Data After Handling Missing Values:
                          0         1         2         3  4
0  2019-01-12 00:00:00.000 -1.337773 -0.395162  0.288326  6
1  2019-01-12 00:00:00.020 -0.984473 -0.021598  0.644522  6
2  2019-01-12 00:00:00.040 -1.635857  0.230325 -0.192113  6
3  2019-01-12 00:00:00.060 -0.363280  0.372247 -0.381429  6
4  2019-01-12 00:00:00.080  0.008470  0.468494 -0.465681  6 

🔍 Duplicate rows found: 0
✅ Rows after duplicate removal: 418989
📌 Data After Removing Duplicates

In [1]:
import os
import pandas as pd
import numpy as np

# Define directories
input_dir = "Harus"  # Main dataset folder
cleaned_dir = "Harus_Cleaned"  # Folder for cleaned files
os.makedirs(cleaned_dir, exist_ok=True)

# Function to clean each dataset
def clean_dataset(file_path, save_path):
    try:
        # Read TXT file as DataFrame (assuming space-separated values)
        df = pd.read_csv(file_path, sep=" ", header=None, engine="python")

        print(f"\n🔹 Processing file: {os.path.basename(file_path)}")

        # Initial number of rows
        initial_rows = df.shape[0]
        print(f"📌 Initial number of rows: {initial_rows}")
        print("📌 Original Data Sample:\n", df.head(), "\n")

        ### 1️⃣ Handling Missing Values ###
        missing_values_before = df.isnull().sum().sum()
        print(f"🔍 Missing values before: {missing_values_before}")

        # Fill numerical columns with median, categorical with mode
        for col in df.columns:
            if df[col].dtype == 'O':  # Categorical
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:  # Numerical
                df[col].fillna(df[col].median(), inplace=True)

        missing_values_after = df.isnull().sum().sum()
        print(f"✅ Missing values after: {missing_values_after}")
        print("📌 Data After Handling Missing Values:\n", df.head(), "\n")

        ### 2️⃣ Removing Redundant Values (Duplicates) ###
        duplicate_rows = df.duplicated().sum()
        print(f"🔍 Duplicate rows found: {duplicate_rows}")
        df.drop_duplicates(inplace=True)
        after_duplicate_rows = df.shape[0]
        print(f"✅ Rows after duplicate removal: {after_duplicate_rows}")
        print("📌 Data After Removing Duplicates:\n", df.head(), "\n")

        ### 3️⃣ Fixing Structural Errors (Trimming & Lowercasing) ###
        df = df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
        print("📌 Data After Fixing Structural Errors:\n", df.head(), "\n")

        ### 4️⃣ Handling Outliers (IQR Method) ###
        outliers_removed = 0
        for col in df.select_dtypes(include=[np.number]).columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            before_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].shape[0]
            outliers_removed += before_outliers
            
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

        after_outliers_rows = df.shape[0]
        print(f"🔍 Outliers removed: {outliers_removed}")
        print(f"✅ Rows after outlier removal: {after_outliers_rows}")
        print("📌 Data After Removing Outliers:\n", df.head(), "\n")

        ### 5️⃣ Data Imputation (Re-check for missing values) ###
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                if df[col].dtype == 'O':
                    df[col].fillna(df[col].mode()[0], inplace=True)
                else:
                    df[col].fillna(df[col].median(), inplace=True)

        ### 6️⃣ Removing Unwanted Observations ###
        before_removal = df.shape[0]
        df = df[~df.apply(lambda row: row.astype(str).str.contains("unknown|error", case=False).any(), axis=1)]
        removed_obs = before_removal - df.shape[0]
        after_removal_rows = df.shape[0]
        print(f"🔍 Unwanted observations removed: {removed_obs}")
        print(f"✅ Rows after removing unwanted observations: {after_removal_rows}")
        print("📌 Data After Removing Unwanted Observations:\n", df.head(), "\n")

        # Final row count
        final_rows = df.shape[0]
        print(f"📌 Final number of rows: {final_rows} (Rows Removed: {initial_rows - final_rows})")

        # Save cleaned data
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_csv(save_path, sep=" ", index=False, header=False)

        print(f"✅ Cleaned data saved: {save_path}")

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")

# Traverse Harus directories and clean all text files
for root, dirs, files in os.walk(input_dir):
    for file in files:
        if file.endswith(".txt"):
            input_path = os.path.join(root, file)
            relative_path = os.path.relpath(input_path, input_dir)  # Preserve directory structure
            output_path = os.path.join(cleaned_dir, relative_path)
            clean_dataset(input_path, output_path)

print("\n✅ Data cleaning completed for all Harus files.")



🔹 Processing file: features.txt
📌 Initial number of rows: 561
📌 Original Data Sample:
    0                  1
0  1  tBodyAcc-mean()-X
1  2  tBodyAcc-mean()-Y
2  3  tBodyAcc-mean()-Z
3  4   tBodyAcc-std()-X
4  5   tBodyAcc-std()-Y 

🔍 Missing values before: 0
✅ Missing values after: 0
📌 Data After Handling Missing Values:
    0                  1
0  1  tBodyAcc-mean()-X
1  2  tBodyAcc-mean()-Y
2  3  tBodyAcc-mean()-Z
3  4   tBodyAcc-std()-X
4  5   tBodyAcc-std()-Y 

🔍 Duplicate rows found: 0
✅ Rows after duplicate removal: 561
📌 Data After Removing Duplicates:
    0                  1
0  1  tBodyAcc-mean()-X
1  2  tBodyAcc-mean()-Y
2  3  tBodyAcc-mean()-Z
3  4   tBodyAcc-std()-X
4  5   tBodyAcc-std()-Y 

📌 Data After Fixing Structural Errors:
    0                  1
0  1  tbodyacc-mean()-x
1  2  tbodyacc-mean()-y
2  3  tbodyacc-mean()-z
3  4   tbodyacc-std()-x
4  5   tbodyacc-std()-y 

🔍 Outliers removed: 0
✅ Rows after outlier removal: 561
📌 Data After Removing Outliers:
    0      

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


🔍 Unwanted observations removed: 0
✅ Rows after removing unwanted observations: 0
📌 Data After Removing Unwanted Observations:
 Empty DataFrame
Columns: []
Index: [] 

📌 Final number of rows: 0 (Rows Removed: 2947)
✅ Cleaned data saved: Harus_Cleaned/test/Inertial Signals/total_acc_x_test.txt
❌ Error processing Harus/test/Inertial Signals/body_acc_x_test.txt: Expected 193 fields in line 2, saw 215
❌ Error processing Harus/test/Inertial Signals/body_acc_y_test.txt: Expected 197 fields in line 2, saw 212
❌ Error processing Harus/test/Inertial Signals/body_gyro_y_test.txt: Expected 169 fields in line 5, saw 183
❌ Error processing Harus/test/Inertial Signals/body_gyro_x_test.txt: Expected 254 fields in line 67, saw 257

🔹 Processing file: total_acc_z_test.txt
📌 Initial number of rows: 2947
📌 Original Data Sample:
    0    1         2    3         4    5         6    7         8    9    ...  \
0  NaN  NaN  0.023780  NaN  0.076293  NaN  0.147475  NaN  0.139906  NaN  ...   
1  NaN  NaN  0.125

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


🔍 Unwanted observations removed: 0
✅ Rows after removing unwanted observations: 0
📌 Data After Removing Unwanted Observations:
 Empty DataFrame
Columns: []
Index: [] 

📌 Final number of rows: 0 (Rows Removed: 2947)
✅ Cleaned data saved: Harus_Cleaned/test/Inertial Signals/total_acc_z_test.txt
❌ Error processing Harus/test/Inertial Signals/body_gyro_z_test.txt: Expected 250 fields in line 2, saw 257
❌ Error processing Harus/test/Inertial Signals/body_acc_z_test.txt: Expected 223 fields in line 11, saw 224
❌ Error processing Harus/train/X_train.txt: Expected 662 fields in line 27, saw 665

🔹 Processing file: y_train.txt
📌 Initial number of rows: 7352
📌 Original Data Sample:
    0
0  5
1  5
2  5
3  5
4  5 

🔍 Missing values before: 0
✅ Missing values after: 0
📌 Data After Handling Missing Values:
    0
0  5
1  5
2  5
3  5
4  5 

🔍 Duplicate rows found: 7346
✅ Rows after duplicate removal: 6
📌 Data After Removing Duplicates:
      0
0    5
27   4
51   6
78   1
125  3 

📌 Data After Fixing 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


📌 Data After Fixing Structural Errors:
    0         1         2         3         4         5         6         7    \
0  NaN -0.162636  0.102934 -0.155417  0.105687 -0.147383  0.102102 -0.149747   
1  NaN -0.162636  0.097930 -0.155417  0.099351 -0.147383  0.098114 -0.149747   
2  NaN -0.162636  0.091117 -0.155417  0.092676 -0.147383  0.096064 -0.149747   
3  NaN -0.162636  0.095152 -0.155417  0.095415 -0.147383  0.088274 -0.149747   
4  NaN -0.162636  0.080841 -0.155417  0.079127 -0.147383  0.078291 -0.149747   

        8         9    ...       247       248       249       250      251  \
0  0.106553 -0.143537  ...  0.095676  0.094843  0.099422  0.098350  0.12094   
1  0.097517 -0.143537  ...  0.095676  0.095126  0.099422  0.099496  0.12094   
2  0.099897 -0.143537  ...  0.095676  0.081413  0.099422  0.081936  0.12094   
3  0.086325 -0.143537  ...  0.095676  0.082785  0.099422  0.084084  0.12094   
4  0.084063 -0.143537  ...  0.095676  0.081640  0.099422  0.079652  0.12094   

    

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


📌 Data After Fixing Structural Errors:
    0         1         2        3         4         5         6        7    \
0  NaN -0.065913  1.012817 -0.06698  1.022833 -0.065939  1.022028 -0.06372   
1  NaN -0.065913  1.018851 -0.06698  1.022380 -0.065939  1.020781 -0.06372   
2  NaN -0.065913  1.023127 -0.06698  1.021882 -0.065939  1.019178 -0.06372   
3  NaN -0.065913  1.017682 -0.06698  1.018149 -0.065939  1.019854 -0.06372   
4  NaN -0.065913  1.019952 -0.06698  1.019616 -0.065939  1.020933 -0.06372   

        8         9    ...       247       248       249       250       251  \
0  1.017877 -0.065087  ...  0.020638  1.019815  0.022129  1.019290  0.019855   
1  1.020218 -0.065087  ...  0.020638  1.018685  0.022129  1.015660  0.019855   
2  1.015861 -0.065087  ...  0.020638  1.019434  0.022129  1.019916  0.019855   
3  1.019880 -0.065087  ...  0.020638  1.018887  0.022129  1.019161  0.019855   
4  1.023061 -0.065087  ...  0.020638  1.023884  0.022129  1.021753  0.019855   

        25

In [None]:
import os
import pandas as pd
import numpy as np

# Directories and file paths
merged_dir = "/Users/jesicaanniebijju/Desktop/ADS/Merged_Dataset"  # Path to store the merged dataset
output_file = "mergeds_100k.csv"  # Name for the final merged file

# Ensure the directory exists
if not os.path.exists(merged_dir):
    os.makedirs(merged_dir)

columns = ['body_acc_x', 'body_acc_y', 'body_acc_z', 'body_gyro_x', 'body_gyro_y', 'body_gyro_z', 'activity']
merged_data = pd.DataFrame(np.random.randn(100000, len(columns)), columns=columns)  # Example with 100,000 rows

# Check total rows in the dataset
total_rows = len(merged_data)
print(f"\n🔢 Total rows in the dataset: {total_rows}")

# Attempt to save the entire dataset to a single CSV
try:
    # Save the dataset in a single CSV file
    merged_path = os.path.join(merged_dir, output_file)
    merged_data.to_csv(merged_path, index=False)
    print(f"✅ Dataset saved successfully at {merged_path}")
except Exception as e:
    print(f"❌ Error while saving the dataset: {e}")



🔢 Total rows in the dataset: 100000
✅ Dataset saved successfully at /Users/jesicaanniebijju/Desktop/ADS/Merged_Dataset/mergeds_100k.csv
