# Data Preparation
## Getting the System Ready and Loading the Data

In [13]:
# 01_prepare_data.ipynb
import pandas as pd, os

RAW_DIR       = "../data/raw"
PROCESSED_CSV = "../data/processed/hydraulic_cycles.csv"
os.makedirs(os.path.dirname(PROCESSED_CSV), exist_ok=True)

# Sensor files grouped by how many values each file stores per row.
# "pts" is the expected number of columns in every file of the group; it is
# used both to validate the file layout and to name the columns.
sensor_groups = {
  "PS":   {"files":[f"PS{i}.txt" for i in range(1,7)], "pts":6000},
  "EPS1": {"files":["EPS1.txt"], "pts":6000},
  "FS":   {"files":["FS1.txt","FS2.txt"], "pts":600},
  "LOW":  {"files":["TS1.txt","TS2.txt","TS3.txt","TS4.txt","VS1.txt","CE.txt","CP.txt","SE.txt"], "pts":60}
}

# Read each tab-separated sensor file into its own wide frame and name the
# columns "<file stem>_<position>", e.g. "PS1_0" ... "PS1_5999".
dfs = []
expected_rows = None  # row count of the first file read; every other file must match
for grp in sensor_groups.values():
    for fn in grp["files"]:
        sensor_df = pd.read_csv(os.path.join(RAW_DIR, fn), sep="\t", header=None)

        # Fail with the file name instead of pandas' generic "Length mismatch"
        # when a file does not have the declared number of columns.
        if sensor_df.shape[1] != grp["pts"]:
            raise ValueError(
                f"{fn}: expected {grp['pts']} columns, found {sensor_df.shape[1]}"
            )

        # pd.concat(axis=1) outer-joins on the index, so a file with a different
        # row count would silently introduce NaN rows; reject it up front.
        if expected_rows is None:
            expected_rows = len(sensor_df)
        elif len(sensor_df) != expected_rows:
            raise ValueError(
                f"{fn}: expected {expected_rows} rows, found {len(sensor_df)}"
            )

        base = os.path.splitext(fn)[0]
        sensor_df.columns = [f"{base}_{i}" for i in range(grp["pts"])]
        dfs.append(sensor_df)
sensors = pd.concat(dfs, axis=1)

# profile.txt supplies the five per-row label columns appended after the sensors.
profile_cols = ["cooler_pct","valve_pct","pump_leak","acc_pressure","stable_flag"]
profile = pd.read_csv(os.path.join(RAW_DIR, "profile.txt"), sep="\t", header=None, names=profile_cols)
if len(profile) != len(sensors):
    raise ValueError(
        f"profile.txt: expected {len(sensors)} rows to match the sensor files, found {len(profile)}"
    )

# Side-by-side merge: rows are matched by position (default integer index).
df = pd.concat([sensors, profile], axis=1)
df.to_csv(PROCESSED_CSV, index=False)
print("Processed:", df.shape)
df.head()


Processed: (2205, 43685)


Unnamed: 0,PS1_0,PS1_1,PS1_2,PS1_3,PS1_4,PS1_5,PS1_6,PS1_7,PS1_8,PS1_9,...,SE_55,SE_56,SE_57,SE_58,SE_59,cooler_pct,valve_pct,pump_leak,acc_pressure,stable_flag
0,151.47,151.45,151.52,151.27,150.8,150.69,153.89,154.67,152.88,153.82,...,68.223,68.223,68.159,68.159,68.264,3,100,0,130,1
1,151.11,151.12,151.16,150.92,150.7,150.62,152.4,153.21,152.81,153.53,...,68.491,68.491,68.528,68.528,68.595,3,100,0,130,1
2,150.81,150.79,150.84,150.65,150.35,150.23,152.03,152.81,152.44,153.27,...,68.456,68.456,68.758,68.758,68.628,3,100,0,130,1
3,150.48,150.47,150.52,150.31,150.04,149.98,151.63,152.48,152.24,152.94,...,69.021,69.021,68.851,68.851,68.868,3,100,0,130,1
4,150.41,150.35,150.24,150.12,149.87,149.71,151.64,152.37,151.78,152.68,...,68.862,68.862,69.036,69.036,68.972,3,100,0,130,1


## Understanding the Data

In [3]:
# Structural overview of the merged frame: index range, column count, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2205 entries, 0 to 2204
Columns: 43685 entries, PS1_0 to stable_flag
dtypes: float64(43646), int64(39)
memory usage: 734.9 MB


In [5]:
# Summary statistics for every numeric column, transposed so each column of df becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PS1_0,2205.0,148.978467,3.684421,144.92,147.06,148.56,150.98,188.80
PS1_1,2205.0,148.977220,3.685462,144.92,147.10,148.56,150.98,188.88
PS1_2,2205.0,148.966916,3.685930,144.56,147.10,148.56,150.98,188.97
PS1_3,2205.0,148.647410,3.790893,144.44,146.18,148.34,150.87,188.84
PS1_4,2205.0,147.968898,4.023828,141.19,145.06,148.13,150.74,186.11
...,...,...,...,...,...,...,...,...
cooler_pct,2205.0,41.240816,42.383143,3.00,3.00,20.00,100.00,100.00
valve_pct,2205.0,90.693878,10.681802,73.00,80.00,100.00,100.00,100.00
pump_leak,2205.0,0.669388,0.817233,0.00,0.00,0.00,1.00,2.00
acc_pressure,2205.0,107.199546,16.435848,90.00,90.00,100.00,130.00,130.00


## Missing Value and Outlier Treatment