In [1]:
import sys
import os

# Absolute path of the directory one level above the current working directory.
# It is added to the import search path so that later cells can import from `src`.
PROJECT_ROOT = os.path.abspath(os.pardir)

# Register the path only once, so re-running this cell leaves sys.path unchanged.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

In [2]:
from src.data_prep.preprocess import preprocess_pipeline

# Raw sensor CSV, addressed relative to the project root: data/raw/sensor_data.csv
raw_csv_parts = ("data", "raw", "sensor_data.csv")
DATA_PATH = os.path.join(PROJECT_ROOT, *raw_csv_parts)

# Run the project's preprocessing pipeline on the raw file; the result is kept in `df`.
df = preprocess_pipeline(DATA_PATH)

In [3]:
# ---- Fix Python path so 'src' can be imported ----
import sys
import os

PROJECT_ROOT = os.path.abspath("..")
# Guarded append (same check as the first cell): without it every re-run of
# this cell would add another duplicate entry to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# ---- Imports ----
from src.data_prep.preprocess import preprocess_pipeline
from src.features.lag_features import create_lag_features

# ---- Correct path to raw data ----
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "sensor_data.csv")

# ---- Run preprocessing pipeline ----
df = preprocess_pipeline(DATA_PATH)

# ---- Create lag features ----
# Produces <col>_lag_1 and <col>_lag_2 columns for each of the three sensor
# columns listed below.
df = create_lag_features(
    df,
    cols=["temperature", "vibration", "pressure"],
    lags=[1, 2]
)

# ---- Drop rows with NaNs caused by lagging ----
df = df.dropna()

# ---- View output ----
df.head()


Unnamed: 0,timestamp,temperature,vibration,pressure,failure,hour,day,day_of_week,month,temp_mean_5,vib_std_10,pressure_max_30,failure_future,temperature_lag_1,temperature_lag_2,vibration_lag_1,vibration_lag_2,pressure_lag_1,pressure_lag_2
31,2025-01-01 00:31:00,67.967938,0.406418,5.962341,0,0,1,2,1,69.987873,0.083503,6.79153,0.0,70.463264,69.846496,0.417281,0.356693,6.027011,6.187963
32,2025-01-01 00:32:00,68.910997,0.491264,5.511033,0,0,1,2,1,70.138149,0.048965,6.79153,0.0,67.967938,70.463264,0.406418,0.417281,5.962341,6.027011
33,2025-01-01 00:33:00,71.519409,0.277239,5.913773,0,0,1,2,1,69.741621,0.066479,6.79153,0.0,68.910997,67.967938,0.491264,0.406418,5.511033,5.962341
34,2025-01-01 00:34:00,68.522247,0.472536,6.349727,0,0,1,2,1,69.476771,0.068204,6.79153,0.0,71.519409,68.910997,0.277239,0.491264,5.913773,5.511033
35,2025-01-01 00:35:00,68.058649,0.549104,5.871583,0,0,1,2,1,68.995848,0.077404,6.79153,0.0,68.522247,71.519409,0.472536,0.277239,6.349727,5.913773


In [4]:
from src.features.rolling_features import create_rolling_features


# Add the rolling-window / EMA feature columns computed by the project helper.
df = create_rolling_features(df)

# Columns with the most missing values after feature creation.
# A notebook only renders the LAST expression of a cell, so as a bare
# expression in the middle of the cell this summary was computed and then
# silently discarded; print it explicitly so it is actually shown.
print(df.isna().sum().sort_values(ascending=False).head())

# Keep only the rows that have no missing values.
df = df.dropna()

df.head()


Unnamed: 0,timestamp,temperature,vibration,pressure,failure,hour,day,day_of_week,month,temp_mean_5,...,press_ema_2,temp_roll_mean_3,vib_roll_mean_3,press_roll_mean_3,temp_roll_std_3,vib_roll_std_3,press_roll_std_3,temp_ema_3,vib_ema_3,press_ema_3
33,2025-01-01 00:33:00,71.519409,0.277239,5.913773,0,0,1,2,1,69.741621,...,5.829672,69.466115,0.39164,5.795716,1.839661,0.107775,0.247736,69.979438,0.36304,5.82523
34,2025-01-01 00:34:00,68.522247,0.472536,6.349727,0,0,1,2,1,69.476771,...,6.176375,69.650884,0.413679,5.924844,1.629822,0.118532,0.419457,69.250843,0.417788,6.087479
35,2025-01-01 00:35:00,68.058649,0.549104,5.871583,0,0,1,2,1,68.995848,...,5.97318,69.366768,0.432959,6.045028,1.878597,0.140187,0.264719,68.654746,0.483446,5.979531
36,2025-01-01 00:36:00,66.226812,0.742907,6.176097,0,0,1,2,1,68.647623,...,6.108458,67.60257,0.588182,6.132469,1.21378,0.139357,0.242039,67.440779,0.613176,6.077814
37,2025-01-01 00:37:00,70.223407,0.40124,5.980518,0,0,1,2,1,68.910105,...,6.023165,68.169623,0.564417,6.009399,2.000607,0.171347,0.154298,68.832093,0.507208,6.029166


In [5]:
import pandas as pd
import numpy as np

# ---- Synthetic sensor data generator ----
# NOTE(review): earlier cells read this CSV, so on a fresh checkout this cell
# has to be executed before them.

# Number of rows to generate (one row per minute).
N_SAMPLES = 2000
# Fixed seed so that regenerating the file yields the same data on every run.
SEED = 42
# Destination of the generated file (the same file the preprocessing cells load).
RAW_DATA_CSV = "../data/raw/sensor_data.csv"

rng = np.random.default_rng(SEED)

timestamps = pd.date_range(
    start="2025-01-01 00:00:00",
    periods=N_SAMPLES,
    freq="min"  # minute frequency; the legacy alias "T" is deprecated in recent pandas
)

data = {
    "timestamp": timestamps,
    "temperature": rng.normal(70, 3, size=N_SAMPLES),
    "vibration": rng.normal(0.5, 0.1, size=N_SAMPLES),
    "pressure": rng.normal(6, 0.5, size=N_SAMPLES),
    # Binary label: 1 is drawn with probability 0.03
    "failure": rng.choice([0, 1], size=N_SAMPLES, p=[0.97, 0.03])
}

df_raw = pd.DataFrame(data)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(RAW_DATA_CSV), exist_ok=True)
df_raw.to_csv(RAW_DATA_CSV, index=False)

df_raw.head()


  timestamps = pd.date_range(


Unnamed: 0,timestamp,temperature,vibration,pressure,failure
0,2025-01-01 00:00:00,69.667978,0.350152,6.34705,0
1,2025-01-01 00:01:00,68.237696,0.491281,7.025172,0
2,2025-01-01 00:02:00,69.337224,0.362496,5.788535,0
3,2025-01-01 00:03:00,69.073075,0.451931,6.60431,0
4,2025-01-01 00:04:00,73.266209,0.570082,5.797754,0


In [6]:
from src.data_prep.preprocess import preprocess_pipeline
from src.features.rolling_features import create_rolling_features

# Rebuild the feature table from the raw CSV on disk:
# preprocess -> rolling features -> discard rows with missing values.
# NOTE(review): no lag features are created in this cell, unlike the earlier
# cell that calls create_lag_features; confirm that this is intended.
preprocessed = preprocess_pipeline("../data/raw/sensor_data.csv")
df = create_rolling_features(preprocessed).dropna()

df.head()

Unnamed: 0,timestamp,temperature,vibration,pressure,failure,hour,day,day_of_week,month,temp_mean_5,...,press_ema_2,temp_roll_mean_3,vib_roll_mean_3,press_roll_mean_3,temp_roll_std_3,vib_roll_std_3,press_roll_std_3,temp_ema_3,vib_ema_3,press_ema_3
31,2025-01-01 00:31:00,66.564328,0.603721,6.660972,0,0,1,2,1,67.093964,...,6.295914,66.322891,0.484387,6.025823,1.386487,0.115411,0.696555,66.383251,0.514221,6.18461
32,2025-01-01 00:32:00,72.787254,0.427336,5.155399,0,0,1,2,1,67.842962,...,5.535571,68.061042,0.468135,5.69909,4.183712,0.120485,0.835375,69.585252,0.470778,5.670005
33,2025-01-01 00:33:00,68.837286,0.364811,6.313129,0,0,1,2,1,68.118643,...,6.053943,69.396289,0.465289,6.043167,3.148899,0.123895,0.788256,69.211269,0.417795,5.991567
34,2025-01-01 00:34:00,71.40877,0.68031,6.172707,0,0,1,2,1,68.885836,...,6.133119,71.011103,0.490819,5.880411,2.004786,0.167055,0.631793,70.31002,0.549052,6.082137
35,2025-01-01 00:35:00,70.080927,0.431266,5.839191,0,0,1,2,1,69.935713,...,5.937167,70.108994,0.492129,6.108342,1.285972,0.166322,0.243437,70.195473,0.490159,5.960664


In [7]:
# Discard rows containing missing values, then renumber the index from 0
# (the old index is thrown away rather than kept as a column).
complete_rows = df.dropna()
df = complete_rows.reset_index(drop=True)

df.shape


(529, 31)

In [8]:


# Safety: order the rows chronologically and renumber the index before splitting
chronological = df.sort_values(by="timestamp")
df = chronological.reset_index(drop=True)

# Cut-off = 0.8 quantile of the timestamps (80% past, 20% future)
ts = df["timestamp"]
split_time = ts.quantile(0.8)

# Time-based split: rows strictly before the cut-off train, the rest test
train = df.loc[ts < split_time]
test = df.loc[ts >= split_time]

# Sanity checks (VERY IMPORTANT): partition sizes ...
for label, part in (("Train shape:", train), ("Test shape:", test)):
    print(label, part.shape)

# ... and the timestamps on either side of the split boundary
last_train_ts = train["timestamp"].max()
first_test_ts = test["timestamp"].min()
print("Train last timestamp:", last_train_ts)
print("Test first timestamp:", first_test_ts)

Train shape: (423, 31)
Test shape: (106, 31)
Train last timestamp: 2025-01-01 07:33:00
Test first timestamp: 2025-01-01 07:34:00


In [9]:
# Persist the two partitions as CSV files without the index column.
# to_csv does not create missing directories, so create the output folder
# first; otherwise the save fails on a checkout where it does not exist yet.
os.makedirs("../data/processed", exist_ok=True)

train.to_csv("../data/processed/train.csv", index=False)
test.to_csv("../data/processed/test.csv", index=False)

print("Train and test datasets saved successfully")

Train and test datasets saved successfully
