In [6]:
import numpy as np
import pandas as pd
import multiprocessing
from numba import jit
from joblib import Parallel, delayed
import os  # Import os to manage directories

# Load dataset
file_path = "C:/Users/gutti/OneDrive/Documents/Project/SWaT_Dataset_NoHeaders.csv"

try:
    # header=None: the CSV has no header row, so columns get integer labels.
    sensor_data = pd.read_csv(file_path, header=None)
except FileNotFoundError:
    print(" File not found! Please check the file path.")
    # Stop with a non-zero status so a missing dataset is reported as a
    # failure. The bare `exit()` helper is injected by the `site` module for
    # interactive use and, called without arguments, exits with status 0.
    raise SystemExit(1) from None
else:
    print(f" Dataset loaded successfully! Total samples: {len(sensor_data)}")

# Keep only numeric columns; everything below does arithmetic on the data.
sensor_data = sensor_data.select_dtypes(include=[np.number])

# Define thresholds
ACF_THRESHOLD = 0.3  # |autocorrelation| below this marks a candidate index
VARIANCE_WINDOW = 5000  # Moving window size (rows) for variance detection
VARIANCE_THRESHOLD = 0.05  # Mean rolling variance above this flags a row
MIN_SUBSEQUENCE_SIZE = 5000  # Minimum spacing (rows) between accepted splits


def fast_autocorrelation(series, nlags=50):
    """Return the normalised sample autocorrelation of ``series`` for lags 0..nlags.

    For each lag ``k`` the mean-centred series is multiplied element-wise
    with itself shifted by ``k`` samples; the sum is divided by the number of
    overlapping samples (``n - k``) and then by the lag-0 value, so the first
    element of the result is 1.0.

    Only the requested lags are evaluated (one dot product per lag) rather
    than the full correlation over every possible lag, and the computation
    runs on a private float copy so the caller's array is never modified and
    integer input is accepted. The function is plain NumPy and therefore
    carries no JIT decorator.

    Args:
        series: 1-D array-like of numeric samples.
        nlags: Largest lag to evaluate. Values above ``len(series) - 1`` are
            clipped to it; negative values are treated as 0.

    Returns:
        1-D float64 array of length ``min(nlags, len(series) - 1) + 1``.
        Empty for empty input; all-NaN when the series has zero variance.
    """
    # np.array(...) copies, so the in-place centring below cannot leak into
    # the caller's data, and the float dtype makes integer columns valid.
    centred = np.array(series, dtype=np.float64)
    n = centred.size
    if n == 0:
        return np.empty(0, dtype=np.float64)
    centred -= centred.mean()

    max_lag = max(0, min(int(nlags), n - 1))
    acf = np.empty(max_lag + 1, dtype=np.float64)
    for lag in range(max_lag + 1):
        overlap = n - lag  # number of sample pairs available at this lag
        acf[lag] = np.dot(centred[:overlap], centred[lag:]) / overlap

    if acf[0] == 0.0:
        # Zero variance: the normalisation is 0/0, so report NaN explicitly
        # instead of triggering a divide-by-zero runtime warning.
        acf.fill(np.nan)
        return acf
    return acf / acf[0]

def compute_autocorrelation(col_data):
    """Return autocorrelation-based split candidates for one sensor column.

    The column's missing values are dropped, its autocorrelation is computed
    for lags 0..50 via ``fast_autocorrelation``, and every index whose
    absolute autocorrelation is below ``ACF_THRESHOLD`` becomes a candidate.
    Candidates are then thinned so that consecutive accepted indices are at
    least ``MIN_SUBSEQUENCE_SIZE`` apart (measured from 0 for the first one).

    Args:
        col_data: pandas Series holding one numeric column.

    Returns:
        List of accepted indices (possibly empty). An empty list is also
        returned when the column has fewer than 50 valid samples, has zero
        standard deviation, or the autocorrelation computation fails.
    """
    import warnings  # local import keeps this function self-contained

    # Hand the helper a private float copy: it may centre the array in place,
    # which must not touch the DataFrame and must not fail on integer dtypes.
    series = col_data.dropna().to_numpy(dtype=np.float64, copy=True)
    if len(series) < 50 or np.std(series) == 0:
        return []

    try:
        autocorr_values = fast_autocorrelation(series, nlags=50)
    except Exception as exc:
        # Best effort: one failing column must not abort the whole run, but
        # the failure is reported instead of being silently discarded.
        warnings.warn(
            f"Autocorrelation failed for column {getattr(col_data, 'name', None)!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

    # NOTE(review): these indices are lag positions (0..50), not row
    # positions. With a minimum spacing in the thousands the filter below can
    # never accept one, so this currently always yields [] -- confirm the
    # intended mapping from lags to split rows.
    split_indices = np.where(np.abs(autocorr_values) < ACF_THRESHOLD)[0].tolist()

    filtered_splits = []
    last_idx = 0
    for idx in split_indices:
        if idx - last_idx >= MIN_SUBSEQUENCE_SIZE:
            filtered_splits.append(idx)
            last_idx = idx

    return filtered_splits

# Step 1: Compute Split Indices Using Autocorrelation
# Use half of the available cores, but never fewer than one worker:
# `cpu_count() // 2` is 0 on a single-core machine and joblib rejects n_jobs=0.
n_workers = max(1, multiprocessing.cpu_count() // 2)
split_points_acf = Parallel(n_jobs=n_workers)(
    delayed(compute_autocorrelation)(sensor_data[col]) for col in sensor_data.columns
)

# Merge the per-column results, dropping duplicates and any index that is not
# strictly inside the dataset (0 < idx < number of rows).
n_rows = len(sensor_data)
split_indices_acf = sorted(
    {idx for sublist in split_points_acf for idx in sublist if 0 < idx < n_rows}
)
def detect_variance_changes(data):
    """Return row positions flagged by the rolling-variance criterion.

    A rolling variance with window ``VARIANCE_WINDOW`` is computed for every
    column and averaged across columns for each row. Rows whose average
    exceeds ``VARIANCE_THRESHOLD`` are candidates; candidates are then
    thinned so each accepted position lies at least ``MIN_SUBSEQUENCE_SIZE``
    rows after the previously accepted one (the first is measured from 0).

    Args:
        data: pandas DataFrame of numeric columns.

    Returns:
        List of accepted integer row positions in ascending order.
    """
    rolling_var = data.rolling(window=VARIANCE_WINDOW).var()
    exceeds = rolling_var.mean(axis=1) > VARIANCE_THRESHOLD
    candidates = np.nonzero(exceeds.to_numpy())[0].tolist()

    accepted = []
    previous = 0
    for position in candidates:
        # Skip candidates that fall too close to the last accepted position.
        if position - previous < MIN_SUBSEQUENCE_SIZE:
            continue
        accepted.append(position)
        previous = position

    return accepted

# Step 2: variance-based candidates over the whole frame.
split_indices_var = detect_variance_changes(sensor_data)

# Union of both detectors plus a fixed boundary every 100000 rows, as one
# sorted, duplicate-free list.
split_indices = sorted(
    {
        *split_indices_acf,
        *split_indices_var,
        *range(100000, len(sensor_data), 100000),
    }
)

# Step 3: cut the frame at each boundary that leaves a chunk of at least
# MIN_SUBSEQUENCE_SIZE rows; closer boundaries are skipped.
subsequences = []
start_idx = 0
for idx in split_indices:
    if idx - start_idx < MIN_SUBSEQUENCE_SIZE:
        continue
    subsequences.append(sensor_data.iloc[start_idx:idx])
    start_idx = idx

# Whatever remains after the last boundary becomes the final chunk, so no
# rows are dropped (this tail may be shorter than the minimum size).
if start_idx < len(sensor_data):
    subsequences.append(sensor_data.iloc[start_idx:])
# Sanity check: the chunks together must contain exactly the original rows.
total_rows = sum(map(len, subsequences))
print(f"\nTotal subsequences generated: {len(subsequences)}")
print(f"Original Dataset Rows: {len(sensor_data)}")
print(f"Total Rows After Splitting: {total_rows}")

row_delta = total_rows - len(sensor_data)
if row_delta == 0:
    print(" Data fully split with NO extra rows! ")
elif row_delta > 0:
    print(f" Extra rows detected! Difference: {row_delta}")
else:
    print(f" Data loss detected! Difference: {-row_delta}")
# Write every subsequence to its own CSV file inside the output folder.
# NOTE(review): the folder name ends in ".csv" although it is used as a
# directory -- confirm this is the intended location.
output_folder = "C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv"
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without a separate existence check.
os.makedirs(output_folder, exist_ok=True)
for i, subseq in enumerate(subsequences):
    save_path = os.path.join(output_folder, f"subsequence_{i+1}.csv")
    # index=False: the original row labels are not written to the file.
    subseq.to_csv(save_path, index=False)
    print(f" Saved: {save_path} (Rows: {len(subseq)})")

# Display first 3 subsequences
for i, subseq in enumerate(subsequences[:3]):
    print(f"\n🔹 Subsequence {i+1} (Rows: {len(subseq)})")
    print(subseq.head())


✅ Dataset loaded successfully! Total samples: 449919

✅ Total subsequences generated: 90
Original Dataset Rows: 449919
Total Rows After Splitting: 449919
✅ Data fully split with NO extra rows! 🎯
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_1.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_2.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_3.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_4.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_5.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_6.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents/Project/Subsequences/new_split.csv\subsequence_7.csv (Rows: 5000)
📂 Saved: C:/Users/gutti/OneDrive/Documents