In [6]:
import sys
import os

# Absolute path of the directory one level above the current working
# directory; it is put on the import path so that `src` can be imported.
PROJECT_ROOT = os.path.abspath("..")

# Only register the path once, even if this cell is executed repeatedly.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

In [7]:
from src.data_prep.preprocess import preprocess_pipeline

# Raw sensor CSV, addressed relative to the project root.
DATA_PATH = os.path.join(
    PROJECT_ROOT,
    "data",
    "raw",
    "sensor_data.csv",
)

# Hand the CSV path to the project's preprocessing pipeline and keep its result.
df = preprocess_pipeline(DATA_PATH)

In [8]:
# ---- Fix Python path so 'src' can be imported ----
import sys
import os

PROJECT_ROOT = os.path.abspath("..")
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# ---- Imports ----
from src.data_prep.preprocess import preprocess_pipeline
from src.features.lag_features import create_lag_features

# ---- Path to the raw data, relative to the project root ----
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "sensor_data.csv")

# ---- Run preprocessing pipeline ----
df = preprocess_pipeline(DATA_PATH)

# ---- Create lag features ----
# Request lags 1 and 2 for each of the three sensor columns.
df = create_lag_features(
    df,
    cols=["temperature", "vibration", "pressure"],
    lags=[1, 2]
)

# ---- Drop rows with NaNs caused by lagging ----
# dropna() removes every row that contains a NaN in any column.
df = df.dropna()

# ---- View output ----
df.head()


Unnamed: 0,timestamp,temperature,vibration,pressure,failure,hour,day,day_of_week,month,temp_mean_5,vib_std_10,pressure_max_30,failure_future,temperature_lag_1,temperature_lag_2,vibration_lag_1,vibration_lag_2,pressure_lag_1,pressure_lag_2
31,2025-01-01 00:31:00,69.957459,0.56219,6.479047,0,0,1,2,1,70.628883,0.085567,6.968048,0.0,74.421078,73.030997,0.62532,0.551207,5.699033,6.100145
32,2025-01-01 00:32:00,68.829985,0.369622,6.723833,0,0,1,2,1,70.84421,0.097328,6.968048,1.0,69.957459,74.421078,0.56219,0.62532,6.479047,5.699033
33,2025-01-01 00:33:00,72.13897,0.659513,6.38158,0,0,1,2,1,71.675698,0.087087,6.968048,0.0,68.829985,69.957459,0.369622,0.56219,6.723833,6.479047
34,2025-01-01 00:34:00,72.919509,0.496602,6.010932,0,0,1,2,1,71.6534,0.088147,6.968048,0.0,72.13897,68.829985,0.659513,0.369622,6.38158,6.723833
35,2025-01-01 00:35:00,64.760157,0.478547,6.672967,0,0,1,2,1,69.721216,0.088139,6.968048,0.0,72.919509,72.13897,0.496602,0.659513,6.010932,6.38158


In [9]:
from src.features.rolling_features import create_rolling_features


# Add the rolling-window features to the current frame.
df = create_rolling_features(df)

# Report the five columns with the most missing values before dropping rows.
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the summary has to be printed explicitly to show up in the output.
print(df.isna().sum().sort_values(ascending=False).head())

# Remove every row that still contains a NaN in any column.
df = df.dropna()

df.head()


Unnamed: 0,timestamp,temperature,vibration,pressure,failure,hour,day,day_of_week,month,temp_mean_5,...,press_ema_2,temp_roll_mean_3,vib_roll_mean_3,press_roll_mean_3,temp_roll_std_3,vib_roll_std_3,press_roll_std_3,temp_ema_3,vib_ema_3,press_ema_3
33,2025-01-01 00:33:00,72.13897,0.659513,6.38158,0,0,1,2,1,71.675698,...,6.468466,70.308805,0.530442,6.528153,1.682239,0.14753,0.176332,70.766346,0.562709,6.49151
34,2025-01-01 00:34:00,72.919509,0.496602,6.010932,0,0,1,2,1,71.6534,...,6.163443,71.296155,0.508579,6.372115,2.17113,0.145316,0.356545,71.842928,0.529656,6.251221
35,2025-01-01 00:35:00,64.760157,0.478547,6.672967,0,0,1,2,1,69.721216,...,6.503126,69.939545,0.544887,6.35516,4.502428,0.099678,0.331807,68.301542,0.504101,6.462094
36,2025-01-01 00:36:00,71.316254,0.32604,5.519527,0,0,1,2,1,69.992975,...,5.847394,69.665307,0.43373,6.067809,4.322959,0.093698,0.57882,69.808898,0.415071,5.990811
37,2025-01-01 00:37:00,73.71571,0.399188,5.648883,0,0,1,2,1,70.97012,...,5.715053,69.930707,0.401258,5.947126,4.635762,0.076274,0.631916,71.762304,0.40713,5.819847


In [10]:
import os

import pandas as pd
import numpy as np

# ---- Configuration for the synthetic sensor dataset ----
SEED = 42                                   # fixed seed -> identical data on every run
N_SAMPLES = 2000                            # one reading per minute
RAW_CSV_PATH = "../data/raw/sensor_data.csv"

# A dedicated generator keeps the draws reproducible and independent of any
# global NumPy random state touched elsewhere in the session.
rng = np.random.default_rng(SEED)

timestamps = pd.date_range(
    start="2025-01-01 00:00:00",
    periods=N_SAMPLES,
    freq="min"
)

# Sensor channels are independent Gaussian draws; `failure` is a rare
# (3% probability) binary flag.
data = {
    "timestamp": timestamps,
    "temperature": rng.normal(70, 3, size=N_SAMPLES),
    "vibration": rng.normal(0.5, 0.1, size=N_SAMPLES),
    "pressure": rng.normal(6, 0.5, size=N_SAMPLES),
    "failure": rng.choice([0, 1], size=N_SAMPLES, p=[0.97, 0.03])
}

df_raw = pd.DataFrame(data)

# to_csv does not create missing parent directories, so make sure the
# target directory exists first.
os.makedirs(os.path.dirname(RAW_CSV_PATH), exist_ok=True)
df_raw.to_csv(RAW_CSV_PATH, index=False)

df_raw.head()


Unnamed: 0,timestamp,temperature,vibration,pressure,failure
0,2025-01-01 00:00:00,75.18842,0.527152,5.437993,0
1,2025-01-01 00:01:00,71.724186,0.388242,6.614212,0
2,2025-01-01 00:02:00,73.44006,0.468771,6.490857,0
3,2025-01-01 00:03:00,69.744325,0.445689,6.077975,0
4,2025-01-01 00:04:00,67.252522,0.319292,6.074631,0


In [11]:
from src.data_prep.preprocess import preprocess_pipeline
from src.features.rolling_features import create_rolling_features

# Rebuild the feature table from the raw CSV in one pass: preprocess,
# add rolling features, then discard every row containing a NaN.
df = create_rolling_features(
    preprocess_pipeline("../data/raw/sensor_data.csv")
).dropna()

df.head()

Unnamed: 0,timestamp,temperature,vibration,pressure,failure,hour,day,day_of_week,month,temp_mean_5,...,press_ema_2,temp_roll_mean_3,vib_roll_mean_3,press_roll_mean_3,temp_roll_std_3,vib_roll_std_3,press_roll_std_3,temp_ema_3,vib_ema_3,press_ema_3
31,2025-01-01 00:31:00,67.22568,0.389307,5.806099,0,0,1,2,1,69.143023,...,5.935116,68.488118,0.552841,6.103817,2.588185,0.159232,0.313638,68.172508,0.511958,6.029388
32,2025-01-01 00:32:00,72.525865,0.507792,5.295546,0,0,1,2,1,69.657961,...,5.508736,68.841651,0.53483,5.725246,3.198627,0.160756,0.395522,70.349187,0.509875,5.662467
33,2025-01-01 00:33:00,70.708685,0.433499,5.918997,0,0,1,2,1,69.739781,...,5.782243,70.15341,0.443533,5.673547,2.693369,0.059876,0.33219,70.528936,0.471687,5.790732
34,2025-01-01 00:34:00,70.069285,0.662447,5.978643,0,0,1,2,1,69.460584,...,5.913176,71.101278,0.534579,5.731062,1.274478,0.116801,0.378345,70.29911,0.567067,5.884687
35,2025-01-01 00:35:00,72.578711,0.446831,6.193875,0,0,1,2,1,70.621645,...,6.100309,71.118894,0.514259,6.030505,1.304035,0.128508,0.144592,71.438911,0.506949,6.039281


In [12]:
# Discard any rows that still contain a NaN ...
df = df.dropna()
# ... and renumber the remaining rows from zero, throwing away the old index.
df = df.reset_index(drop=True)

# (rows, columns) of the cleaned frame
df.shape


(529, 31)

In [13]:


# Quantile of the timestamp column used as the train/test boundary
# (0.8 -> roughly the earliest 80% of the timeline goes to train).
TRAIN_FRACTION = 0.8

# Safety: ensure chronological order
df = df.sort_values("timestamp").reset_index(drop=True)

# Choose split point based on time (80% past, 20% future)
split_time = df["timestamp"].quantile(TRAIN_FRACTION)

# Time-based split: everything strictly before the boundary is training data,
# everything at or after it is held out.
train = df[df["timestamp"] < split_time]
test  = df[df["timestamp"] >= split_time]

# Sanity checks (VERY IMPORTANT) -- enforced, not just printed, so that a
# broken split stops the notebook instead of silently leaking future data.
assert len(train) + len(test) == len(df), "split lost or duplicated rows"
assert not train.empty and not test.empty, "split produced an empty partition"
assert train["timestamp"].max() < test["timestamp"].min(), "train and test overlap in time"

print("Train shape:", train.shape)
print("Test shape:", test.shape)

print("Train last timestamp:", train["timestamp"].max())
print("Test first timestamp:", test["timestamp"].min())

Train shape: (423, 31)
Test shape: (106, 31)
Train last timestamp: 2025-01-01 07:33:00
Test first timestamp: 2025-01-01 07:34:00


In [14]:
# Directory that receives the processed train/test splits.
PROCESSED_DIR = "../data/processed"

# to_csv does not create missing parent directories, so create the target
# directory up front (a no-op when it already exists).
os.makedirs(PROCESSED_DIR, exist_ok=True)

# index=False keeps the DataFrame index out of the files.
train.to_csv(f"{PROCESSED_DIR}/train.csv", index=False)
test.to_csv(f"{PROCESSED_DIR}/test.csv", index=False)

print("Train and test datasets saved successfully")

Train and test datasets saved successfully
