In [None]:
import pandas as pd
import os


In [None]:
class DataPreprocessor:
    """Load, clean, and persist the equipment anomaly dataset.

    Args:
        raw_file_path: Path of the raw CSV file read by ``load_data``.
        processed_dir: Directory that ``save_processed_data`` writes into;
            it is created on demand.
    """

    # Columns cast to float32 and imputed with the column median.
    NUMERIC_COLS = ('temperature', 'pressure', 'vibration', 'humidity')
    # Columns cast to pandas 'category' and imputed with the column mode.
    CATEGORICAL_COLS = ('equipment', 'location')
    # Label column, imputed with the mode and then cast to int8.
    TARGET_COL = 'faulty'

    def __init__(self, raw_file_path, processed_dir):
        self.raw_file_path = raw_file_path
        self.processed_dir = processed_dir

    def load_data(self):
        """Load the raw dataset from ``self.raw_file_path``.

        Returns:
            pandas.DataFrame: The parsed CSV contents.

        Raises:
            FileNotFoundError: If ``self.raw_file_path`` does not exist.
            Exception: Any error raised by ``pandas.read_csv`` is printed and
                re-raised unchanged.
        """
        if not os.path.exists(self.raw_file_path):
            raise FileNotFoundError(f"File not found: {self.raw_file_path}")
        try:
            df = pd.read_csv(self.raw_file_path)
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            raise
        print(f"✅ Loaded raw data with {df.shape[0]} rows and {df.shape[1]} columns.")
        return df

    @staticmethod
    def _cast_columns(df, columns, dtype):
        """Cast each of ``columns`` in ``df`` to ``dtype``; print and re-raise on failure."""
        try:
            for col in columns:
                df[col] = df[col].astype(dtype)
        except Exception as e:
            print(f"❌ Error converting data types: {e}")
            raise

    @staticmethod
    def _fill_missing(df, col, strategy):
        """Fill NaNs in ``df[col]`` with its 'median' or 'mode'; no-op when nothing is missing."""
        if not df[col].isnull().any():
            return
        fill_val = df[col].median() if strategy == 'median' else df[col].mode()[0]
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form is deprecated and leaves
        # the frame unchanged when pandas Copy-on-Write is enabled.
        df[col] = df[col].fillna(fill_val)
        print(f"   Imputed missing values in '{col}' with {strategy} value {fill_val}")

    def preprocess(self, df):
        """Clean the dataset and return the result as a new DataFrame.

        Steps:
          - Removes duplicate rows.
          - Casts the numeric feature columns to float32 and the categorical
            columns to 'category' for memory efficiency.
          - Imputes missing values (if any) using the median for numeric
            features and the mode for categorical features and the target.
          - Casts the target column to int8 once it is free of NaNs.

        Args:
            df: Raw DataFrame containing the columns listed in
                ``NUMERIC_COLS``, ``CATEGORICAL_COLS`` and ``TARGET_COL``.
                It is not modified.

        Returns:
            pandas.DataFrame: A cleaned copy of ``df`` (original index labels
            of the surviving rows are kept).

        Raises:
            Exception: Any dtype-conversion error (for example a missing
                column) is printed and re-raised unchanged.
        """
        # Work on a de-duplicated copy so the caller's frame is never mutated
        # and re-running the calling cell always starts from the same raw data.
        initial_count = df.shape[0]
        df = df.drop_duplicates().copy()
        print(f"✅ Dropped {initial_count - df.shape[0]} duplicate rows.")

        # Optimize the feature dtypes first; float32 and category both tolerate NaN.
        self._cast_columns(df, self.NUMERIC_COLS, 'float32')
        self._cast_columns(df, self.CATEGORICAL_COLS, 'category')

        # Impute missing values if any
        if df.isnull().values.any():
            print("⚠️ Missing values detected. Imputing missing values...")
            for col in self.NUMERIC_COLS:
                self._fill_missing(df, col, 'median')
            for col in self.CATEGORICAL_COLS:
                self._fill_missing(df, col, 'mode')
            # In case the target has missing values (unlikely)
            self._fill_missing(df, self.TARGET_COL, 'mode')
        else:
            print("✅ No missing values found.")

        # The target is cast last: int8 cannot represent NaN, so converting it
        # before imputation would raise whenever a label is missing.
        self._cast_columns(df, (self.TARGET_COL,), 'int8')
        print("✅ Optimized data types for memory efficiency.")

        return df

    def save_processed_data(self, df, filename="equipment_anomaly_data_processed.csv"):
        """Save the processed dataset as CSV inside ``self.processed_dir``.

        Args:
            df: DataFrame to write (the index is not written).
            filename: Name of the CSV file created inside ``self.processed_dir``.

        Raises:
            Exception: Any error raised while writing the CSV is printed and
                re-raised unchanged.
        """
        if not os.path.exists(self.processed_dir):
            # exist_ok guards against the directory appearing between the
            # existence check and the creation call.
            os.makedirs(self.processed_dir, exist_ok=True)
            print(f"✅ Created processed data directory: {self.processed_dir}")
        processed_file_path = os.path.join(self.processed_dir, filename)
        try:
            df.to_csv(processed_file_path, index=False)
            print(f"✅ Processed data saved to: {processed_file_path}")
        except Exception as e:
            print(f"❌ Error saving processed data: {e}")
            raise


In [None]:
# Project root. Set the ANOMALY_PROJECT_DIR environment variable to run this
# notebook from another checkout; the default is the original hard-coded location.
PROJECT_DIR = os.environ.get(
    "ANOMALY_PROJECT_DIR",
    r"C:\Users\Ken Ira Talingting\Desktop\anomaly-detection-project",
)
# Input CSV and output directory, both derived from the project root.
raw_file_path = os.path.join(PROJECT_DIR, "data", "raw", "equipment_anomaly_data.csv")
processed_dir = os.path.join(PROJECT_DIR, "data", "processed")


In [None]:
# Bind the preprocessor to the raw CSV location and the output directory.
preprocessor = DataPreprocessor(
    raw_file_path=raw_file_path,
    processed_dir=processed_dir,
)


In [None]:
# Read the raw CSV at raw_file_path; raises FileNotFoundError if the file is missing.
raw_df = preprocessor.load_data()


In [None]:
# Drop duplicate rows, downcast dtypes, and impute any missing values.
processed_df = preprocessor.preprocess(raw_df)


In [None]:
# Write the cleaned frame into processed_dir under the default filename
# "equipment_anomaly_data_processed.csv", creating the directory if needed.
preprocessor.save_processed_data(processed_df)


In [None]:
# Preview the first five cleaned rows. preprocess() always returns a DataFrame
# (or raises), so no None guard is needed; as the cell's last expression the
# frame is rendered by Jupyter's rich display.
processed_df.head()
