In [None]:
"""
### Step 1: Import Dependencies
This cell imports all required libraries for:
- File path handling (`os`)
- Data manipulation (`pandas`, `numpy`)
- Feature scaling (`StandardScaler` from `sklearn.preprocessing`)
"""

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [None]:
"""
### Step 2: Define File Paths
- `input_file`: Path to the processed dataset from the preprocessing step.
- `output_dir`: Directory where the feature-engineered dataset will be saved.
- `output_file`: Full path to the output CSV file.
"""

# Directory that holds the processed data; the engineered CSV is written here too.
# Raw strings keep the Windows backslashes literal.
output_dir = r"C:\Users\Ken Ira Talingting\Desktop\anomaly-detection-project\data\processed"

# CSV produced by the preprocessing step.
input_file = r"C:\Users\Ken Ira Talingting\Desktop\anomaly-detection-project\data\processed\equipment_anomaly_data_processed.csv"

# Destination of the feature-engineered dataset, placed inside `output_dir`.
output_file = os.path.join(
    output_dir,
    "equipment_anomaly_data_feature_engineered.csv",
)


In [None]:
"""
### Step 3: Load the Processed Dataset
- Reads the preprocessed dataset from CSV.
- Displays basic information about the dataset, such as shape and sample records.
"""

# Read the preprocessed CSV at `input_file` into the working DataFrame `df`.
df = pd.read_csv(input_file)

# Report the row and column counts taken from df.shape.
print(f"✅ Dataset loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns.")
# NOTE(review): display() is not imported in the visible cells; presumably it is
# the built-in supplied by IPython/Jupyter, so this line needs a notebook kernel.
display(df.head())  # preview of the first rows (DataFrame.head default count)


In [None]:
"""
### Step 4: Check for Missing Values
- Identifies any missing values in the dataset.
- If missing values are found, they should be handled accordingly.
"""

# isna() flags every null cell; summing the boolean frame yields one count per column.
missing_values = df.isna().sum()
print("🔍 Missing values per column:\n", missing_values)

# Optional handling, left disabled: forward-fill each gap from the last valid row.
# DataFrame.ffill is the current spelling of the deprecated fillna(method='ffill').
# df.ffill(inplace=True)


In [None]:
"""
### Step 5: Feature Engineering
This step creates new features to enhance the dataset:

1. **Ratio Features**:
   - `temp_pressure_ratio` = Temperature / Pressure
   - `vib_humidity_ratio` = Vibration / Humidity

2. **Interaction Features**:
   - `pressure_vibration_prod` = Pressure * Vibration
   - `temp_humidity_prod` = Temperature * Humidity

3. **Logarithmic Transformation**:
   - `log_vibration` = Log-transformed Vibration (to reduce skewness)

A small constant (`epsilon`) is added to avoid division by zero issues.
"""

epsilon = 1e-6  # offset added to the ratio denominators and to the log argument

# Ratio features: the offset keeps a zero-valued denominator from dividing by zero.
df['temp_pressure_ratio'] = df['temperature'].div(df['pressure'].add(epsilon))
df['vib_humidity_ratio'] = df['vibration'].div(df['humidity'].add(epsilon))

# Interaction features: element-wise products of sensor pairs.
df['pressure_vibration_prod'] = df['pressure'].mul(df['vibration'])
df['temp_humidity_prod'] = df['temperature'].mul(df['humidity'])

# Natural log of vibration; the offset keeps log(0) from producing -inf.
# NOTE(review): np.log yields NaN for negative arguments - confirm that
# 'vibration' is non-negative in the processed data.
df['log_vibration'] = np.log(df['vibration'].add(epsilon))

print("✅ Feature engineering completed. Sample of new features:")
display(df.head())


In [None]:
"""
### Step 6: Categorical Encoding
- Converts categorical variables into numerical format using **one-hot encoding**.
- Drops the first category to avoid multicollinearity (`drop_first=True`).

Categorical columns:
- `equipment`
- `location`
"""

# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each column so the remaining indicators are not perfectly collinear.
#
# dtype=int pins the indicator columns to 0/1 integers. Without it the dtype
# depends on the installed pandas (bool since 2.0, uint8 before), so the saved
# CSV could contain True/False instead of numbers.
df_encoded = pd.get_dummies(
    df,
    columns=['equipment', 'location'],
    drop_first=True,
    dtype=int,
)

print(f"✅ Categorical encoding completed. New dataset shape: {df_encoded.shape}")
display(df_encoded.head())


In [None]:
"""
### Step 7: Scale Numerical Features
- Standardizes numerical features using `StandardScaler` to improve model performance.
- Ensures that each scaled column has **zero mean** and **unit variance** (the one-hot encoded columns are left unscaled).

Columns being scaled:
- Original sensor readings (`temperature`, `pressure`, `vibration`, `humidity`)
- Newly engineered features (`temp_pressure_ratio`, `vib_humidity_ratio`, etc.)
"""

# Columns to standardize: the four raw sensor readings first ...
numeric_cols = ['temperature', 'pressure', 'vibration', 'humidity']
# ... followed by the engineered ratio, product and log features.
numeric_cols.extend([
    'temp_pressure_ratio', 'vib_humidity_ratio',
    'pressure_vibration_prod', 'temp_humidity_prod', 'log_vibration',
])

# Fit the scaler on these columns and overwrite them in place with the scaled
# values; every other column of df_encoded is left untouched.
scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

print("✅ Feature scaling completed. Sample of scaled data:")
display(df_encoded.head())


In [None]:
"""
### Step 8: Save the Feature-Engineered Dataset
- Saves the processed dataset to CSV for further analysis or model training.
"""

# Create the destination folder first (a no-op when it already exists) so that
# to_csv does not fail because the directory is missing.
os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the written file.
df_encoded.to_csv(output_file, index=False)
print(f"✅ Feature engineered dataset saved to: {output_file}")
