In [1]:
import os
import pandas as pd
import numpy as np
import glob
import time
import warnings
from autonomous_feature_selection import run_autonomous_xgboost_pipeline

In [2]:
# Root folder that holds one 'Wind Farm <X>' sub-folder per farm.
# Set the SCADA_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset, the original local path below is used unchanged.
DATA_DIR = os.environ.get('SCADA_DATA_DIR', r'D:\Data\SCADA\Wind_Turbine')

In [3]:
def _prepare_scada_chunk(chunk, wanted_assets=None):
    """Filter, downcast and clean one raw SCADA CSV chunk, returning a new frame."""
    if wanted_assets is not None:
        chunk = chunk[chunk['asset_id'].isin(wanted_assets)]

    # Work on a fresh frame (astype/copy) so the column assignment below never
    # writes into a slice of the reader's chunk (avoids SettingWithCopyWarning).
    float_cols = chunk.select_dtypes(include=['float64']).columns
    if len(float_cols):
        chunk = chunk.astype({col: 'float32' for col in float_cols})
    else:
        chunk = chunk.copy()

    chunk["time_stamp"] = pd.to_datetime(chunk["time_stamp"])
    chunk = chunk.drop(['train_test', 'id'], axis=1, errors='ignore')
    return chunk.dropna(subset=["asset_id", "time_stamp"])


def get_farm_scada_chunked(farm='A', asset_ids=None, chunksize=50000):
    """Load the SCADA table of one wind farm, using a parquet cache when present.

    The raw data is read from every ``*.csv`` file (``;``-separated) under
    ``<DATA_DIR>/Wind Farm <farm>/datasets``. Each chunk has its float64
    columns downcast to float32, ``time_stamp`` parsed to datetime, the
    ``train_test`` and ``id`` columns dropped if present, and rows without
    ``asset_id`` or ``time_stamp`` removed. The combined frame is de-duplicated
    on (``asset_id``, ``time_stamp``) and sorted by those two columns.

    Caching: the cache file ``farm_<farm>_optimized.parquet`` always represents
    the *whole* farm. It is only written when ``asset_ids`` is None; a call
    with ``asset_ids`` filters the cached frame on load, or filters the CSV
    chunks without writing a cache, so a subset can never be stored as (or
    mistaken for) the full dataset.

    Parameters
    ----------
    farm : str
        Farm identifier used in the folder and cache file names.
    asset_ids : iterable or None
        If given, only rows whose ``asset_id`` is in this collection are
        returned.
    chunksize : int
        Number of CSV rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        Cleaned SCADA rows with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no cache exists and the datasets folder yields no CSV data.
    """
    farm_dir = os.path.join(DATA_DIR, 'Wind Farm '+farm)
    parquet_path = os.path.join(farm_dir, f'farm_{farm}_optimized.parquet')

    # Materialise once: a generator would be exhausted after the first chunk.
    wanted_assets = None if asset_ids is None else list(asset_ids)

    # Check if already processed
    if os.path.exists(parquet_path):
        print(f"Loading pre-processed Farm {farm}...")
        scada_data = pd.read_parquet(parquet_path)
        if wanted_assets is not None:
            keep = scada_data['asset_id'].isin(wanted_assets)
            scada_data = scada_data[keep].reset_index(drop=True)
        return scada_data

    farm_dataset_dir = os.path.join(farm_dir, 'datasets')
    # Sorted so that "keep first duplicate" does not depend on the
    # filesystem's arbitrary listing order.
    all_files = sorted(glob.glob(os.path.join(farm_dataset_dir, '*.csv')))
    if not all_files:
        raise ValueError(f"No CSV files found in {farm_dataset_dir}")

    dtype_dict = {
        'asset_id': 'int16',
        'status_type_id': 'int8',
    }

    all_data = []
    for csv_path in all_files:
        # Read in chunks to bound peak memory while parsing
        for chunk in pd.read_csv(csv_path, sep=";", dtype=dtype_dict, chunksize=chunksize):
            all_data.append(_prepare_scada_chunk(chunk, wanted_assets))

    if not all_data:
        raise ValueError(f"CSV files in {farm_dataset_dir} contain no data rows")

    scada_data = pd.concat(all_data, ignore_index=True)
    scada_data = scada_data.drop_duplicates(subset=['asset_id', 'time_stamp'], keep='first')
    scada_data = scada_data.sort_values(["asset_id", "time_stamp"]).reset_index(drop=True)

    # Only the unfiltered result may become the shared cache.
    if wanted_assets is None:
        scada_data.to_parquet(parquet_path, compression='snappy', index=False)
        print(f"Saved to {parquet_path}")

    return scada_data

In [4]:
def get_event_info(farm='A'):
    """Load the anomaly events of one wind farm from its ``event_info.csv``.

    Reads ``<DATA_DIR>/Wind Farm <farm>/event_info.csv`` (``;``-separated),
    renames the ``asset`` column to ``asset_id``, keeps only rows whose
    ``event_label`` equals ``'anomaly'``, parses ``event_start`` / ``event_end``
    to datetimes and converts ``asset_id`` to nullable ``Int16`` (values that
    are not numeric become missing).

    Parameters
    ----------
    farm : str
        Farm identifier used in the folder name.

    Returns
    -------
    pandas.DataFrame
        Anomaly events with non-missing ``asset_id``, ``event_start`` and
        ``event_end``, sorted by ``asset_id`` with a fresh 0..n-1 index.
    """
    farm_dir = os.path.join(DATA_DIR, 'Wind Farm '+farm)
    # os.path.join keeps the path portable; a literal '\\' separator only
    # resolves on Windows.
    event_info = pd.read_csv(os.path.join(farm_dir, 'event_info.csv'), sep=';')
    event_info = event_info.rename(columns={'asset':'asset_id'})
    # Drop events that aren't anomalies; copy() so the column assignments below
    # act on an independent frame rather than a filtered slice.
    event_info = event_info[event_info['event_label'] == 'anomaly'].copy()
    # Clean Up
    event_info["event_start"] = pd.to_datetime(event_info["event_start"])
    event_info["event_end"] = pd.to_datetime(event_info["event_end"])
    event_info["asset_id"] = pd.to_numeric(event_info["asset_id"], errors="coerce").astype("Int16")
    # Drop rows with missing critical info
    event_info = event_info.dropna(subset=["asset_id", "event_start", 'event_end'])
    # Sort for asset_id
    event_info = event_info.sort_values(["asset_id"]).reset_index(drop=True)

    return event_info

In [None]:
# Run configuration: the farm letter plus the tuning values that are forwarded
# unchanged to run_autonomous_xgboost_pipeline below.
# NOTE(review): the meaning of each threshold is defined by the imported
# pipeline module, not in this notebook — check it before changing values.
farm = 'A'
cohens_d_threshold = 0.8
correlation_threshold = .9
max_cohens = 50
max_features = 15
# Inputs: the farm's SCADA table and its anomaly-labelled events.
scada = get_farm_scada_chunked(farm=farm)
event_info = get_event_info(farm=farm)

# Only the pipeline call is timed; the data loading above is excluded.
start_time = time.perf_counter()
results = run_autonomous_xgboost_pipeline(
    farm=farm,
    scada_data=scada,
    event_info=event_info,
    cohens_d_threshold=cohens_d_threshold,
    correlation_threshold=correlation_threshold,
    max_cohens_d= max_cohens,
    max_features=max_features,
    use_gpu=False,
    gpu_id=0
)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Reporting. `results` is read through the keys 'results', 'fpr',
# 'feature_importance' and 'selected_features'.
# NOTE(review): this assumes results['results']['detected'] is a boolean/0-1
# column (sum = detected count, mean = recall) and that 'feature_importance'
# is a Series, so .items() yields (feature, importance) pairs — confirm
# against the pipeline module.
print(f"Recall: {results['results']['detected'].sum()}/{len(results['results'])}")
print(f"FPR: {results['fpr']:.1%}")
print(f"Top 5 Features:\n{results['feature_importance'].head()}")
print("="*50)
# Per-farm summary; recall is shown here as a percentage and FPR is repeated.
print(f"Farm {farm} Results:")
print(f"  Recall: {results['results']['detected'].mean():.1%}")
print(f"  FPR: {results['fpr']:.1%}")
print(f"  Selected Features: {len(results['selected_features'])}")
print(f"\nTop 3 Features:")
for i, (feat, imp) in enumerate(results['feature_importance'].head(3).items(), 1):
    print(f"  {i}. {feat}: {imp:.3f}")
print("="*80)
print(f"The code executed in {elapsed_time:.6f} seconds (wall-clock time).")
