In [2]:
#import necessary packages
from pathlib import Path
import os
import ast
import shutil
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sktime.transformations.panel.rocket import (
    MiniRocket,
    MiniRocketMultivariate,
    MiniRocketMultivariateVariable,
)

In [3]:
# Convert every SMAP telemetry channel from its .npy archive into a CSV file
# (raw_train/ and raw_test/) and keep the loaded frames in memory.

# Not used by this cell; kept in case other cells refer to them.
training_df = []
test_df = []

label_df = pd.read_csv("labeled_anomalies.csv")

train_data_path = "npy_train"
test_data_path = "npy_test"

os.makedirs("raw_train", exist_ok=True)
os.makedirs("raw_test", exist_ok=True)

# chan_id -> spacecraft, built once instead of masking label_df for every file.
# drop_duplicates keeps the first row per chan_id, i.e. the same row a
# "first matching index" lookup would pick.
spacecraft_by_channel = (
    label_df.drop_duplicates(subset="chan_id")
    .set_index("chan_id")["spacecraft"]
    .to_dict()
)


def _export_smap_channels(npy_dir, csv_dir, split):
    """Load each SMAP channel found under ``npy_dir`` and mirror it to CSV.

    Parameters
    ----------
    npy_dir : str
        Directory that is walked recursively for channel archives.
    csv_dir : str
        Existing directory that receives one ``<chan_id>.csv`` per channel.
    split : str
        Tag used in the progress message (``"train"`` or ``"test"``).

    Returns
    -------
    (list of DataFrame, list of str)
        The loaded frames and their channel ids, in matching order.
    """
    frames, names = [], []
    for root, dirs, files in os.walk(npy_dir):
        # os.walk yields entries in arbitrary order; sorting makes the export
        # order (and the printed indices) reproducible between runs.
        dirs.sort()
        for file in sorted(files):
            example_path = os.path.join(root, file)
            filename = Path(example_path).stem

            # Files without a label row, or belonging to another spacecraft,
            # are skipped.
            if spacecraft_by_channel.get(filename) != "SMAP":
                continue

            df = pd.DataFrame(np.load(example_path))
            print(f"({split}) df: ", filename, df.shape, "index: ", len(frames))
            df.to_csv(f"{csv_dir}/{filename}.csv", index=False)
            frames.append(df)
            names.append(filename)
    return frames, names


training_dfs, _ = _export_smap_channels(train_data_path, "raw_train", "train")
test_dfs, file_names = _export_smap_channels(test_data_path, "raw_test", "test")

In [4]:
def get_data():
    """Load the exported CSV channels, grouped by the first character of the file name.

    Reads ``labeled_anomalies.csv`` plus every ``.csv`` file found (recursively)
    under ``raw_train`` and ``raw_test``, all relative to the current working
    directory.

    Returns
    -------
    tuple
        ``(training_dfs, test_dfs, label_df, file_names)`` where

        * ``training_dfs`` / ``test_dfs`` map a group key (first character of
          the file stem) to a list of DataFrames,
        * ``label_df`` is the label table as read by pandas,
        * ``file_names`` is ``{"train": {...}, "test": {...}}`` mapping the same
          group keys to the file stems, index-aligned with the frame lists.
    """

    def _load_grouped(csv_dir):
        # Returns ({key: [DataFrame, ...]}, {key: [stem, ...]}) for one directory.
        frames, names = {}, {}
        for root, dirs, files in os.walk(csv_dir):
            # os.walk order is arbitrary; sort so list positions are reproducible.
            dirs.sort()
            for file in sorted(files):
                # Skip stray non-CSV files instead of parsing them as data.
                if not file.lower().endswith(".csv"):
                    continue
                example_path = os.path.join(root, file)
                filename = Path(example_path).stem
                channel = filename[0]
                frames.setdefault(channel, []).append(pd.read_csv(example_path))
                names.setdefault(channel, []).append(filename)
        return frames, names

    label_df = pd.read_csv("labeled_anomalies.csv")
    training_dfs, train_names = _load_grouped("raw_train")
    test_dfs, test_names = _load_grouped("raw_test")
    file_names = {"train": train_names, "test": test_names}

    return (training_dfs, test_dfs, label_df, file_names)

In [5]:
def create_windows(training_dfs, test_dfs, window_size, window_overlap, label_df, file_names):
    """Slice every frame into fixed-length windows and label the test windows.

    Parameters
    ----------
    training_dfs, test_dfs : dict
        Group key -> list of DataFrames (rows are time steps).
    window_size : int
        Number of rows per window.
    window_overlap : int
        Step between consecutive window starts (used as the ``range`` stride).
    label_df : DataFrame
        Must provide ``chan_id`` and ``anomaly_sequences`` columns; the latter
        is parsed with ``ast.literal_eval`` into ``[start, end]`` pairs.
    file_names : dict
        ``file_names["test"][key][i]`` is the ``chan_id`` of ``test_dfs[key][i]``.

    Returns
    -------
    tuple
        ``(training_data, test_data, label_data)``: dicts keyed like the inputs.
        Each window is a nested list laid out as ``[column][row]`` (the
        transpose of the frame slice). ``label_data`` holds ``1`` when the
        window intersects any anomaly range and ``0`` otherwise, aligned with
        ``test_data``. Test frames without a row in ``label_df`` contribute
        neither windows nor labels.
    """

    def _window_bounds(n_rows):
        # Yields the (start, stop) row range actually covered by each window.
        # A window that would run past the end is clamped to the last
        # ``window_size`` rows (or the whole frame if it is shorter).
        # NOTE(review): several trailing starts can clamp to the same range, so
        # the final window may be emitted more than once; kept to preserve the
        # existing window counts.
        for start in range(0, n_rows, window_overlap):
            if start + window_size > n_rows:
                yield max(n_rows - window_size, 0), n_rows
            else:
                yield start, start + window_size

    def _channel_major(df, start, stop):
        # Rows [start, stop) transposed to [column][row] nested lists.
        return df.iloc[start:stop].to_numpy().T.tolist()

    training_data = {}
    test_data = {}
    label_data = {}

    for channel, frames in training_dfs.items():
        training_data[channel] = [
            _channel_major(df, start, stop)
            for df in frames
            for start, stop in _window_bounds(len(df))
        ]

    for channel, frames in test_dfs.items():
        test_data[channel] = []
        label_data[channel] = []
        for i, df in enumerate(frames):
            # The label lookup depends only on the file, so do it once per frame.
            chan_id = file_names["test"][channel][i]
            row_indices = label_df[label_df["chan_id"] == chan_id].index.tolist()
            if not row_indices:
                continue
            anomaly_sequence = ast.literal_eval(
                label_df.loc[row_indices[0], 'anomaly_sequences']
            )

            for start, stop in _window_bounds(len(df)):
                # Label against the rows the window really contains. Using the
                # nominal loop offset here would mislabel clamped tail windows.
                is_anomalous = any(
                    anomalies[1] > start and anomalies[0] < stop
                    for anomalies in anomaly_sequence
                )
                label_data[channel].append(1 if is_anomalous else 0)
                test_data[channel].append(_channel_major(df, start, stop))

    return (training_data, test_data, label_data)

In [None]:
# Windowing configuration: rows per window and the step between window starts.
window_size = 60
window_overlap = 20
training_dfs, test_dfs, label_df, file_names = get_data()

(
    X_train_collection,
    X_test_collection,
    y_test_collection,
) = create_windows(
    training_dfs, test_dfs, window_size, window_overlap, label_df, file_names
)

# Output folders for an optional CSV export (the export itself is disabled).
for out_dir in ("training_data", "testing_data", "label_data"):
    os.makedirs(out_dir, exist_ok=True)

# Sanity check: report the array shape of every group, split by split.
for channel, train_windows in X_train_collection.items():
    X_train = np.array(train_windows)
    print("train", X_train.shape)

for channel, test_windows in X_test_collection.items():
    X_test = np.array(test_windows)
    print("test", X_test.shape)

for channel, window_labels in y_test_collection.items():
    y_test = np.array(window_labels)
    print("label", y_test.shape)

In [None]:
# MiniRocket feature extraction, fitted independently for every group that has
# training windows, test windows and labels.
X_transform_train_collection = {}
X_transform_test_collection = {}
y_true_label_collection = {}

for channel, train_windows in X_train_collection.items():
    minirocket = MiniRocketMultivariate(n_jobs=2, random_state=42)
    X_train = np.array(train_windows)

    # Groups lacking test windows or labels cannot be evaluated; skip them.
    has_test = X_test_collection.get(channel) is not None
    has_labels = y_test_collection.get(channel) is not None
    if not (has_test and has_labels):
        continue

    X_test = np.array(X_test_collection[channel])

    # Fit on the training windows only, then project both splits.
    minirocket.fit(X_train)
    X_transform_train = minirocket.transform(X_train)
    X_transform_test = minirocket.transform(X_test)

    X_transform_train_collection[channel] = X_transform_train
    X_transform_test_collection[channel] = X_transform_test

    print("channel: ", channel)
    print("(transformed train): ", X_transform_train.shape)
    print("(transformed test): ", X_transform_test.shape)

# Store the per-group labels as arrays next to the transformed features.
for channel, window_labels in y_test_collection.items():
    y_test = np.array(window_labels)
    y_true_label_collection[channel] = y_test
    print("channel: ", channel)
    print("label", y_test.shape)

In [None]:
# Min-max scale the MiniRocket features per group; the scaler is fitted on the
# training features only and then applied to both splits.
X_fit_train_collection = {}
X_fit_test_collection = {}

for channel, X_transform_train in X_transform_train_collection.items():
    X_transform_test = X_transform_test_collection[channel]

    scaler = MinMaxScaler().fit(X_transform_train)
    X_fit_train = scaler.transform(X_transform_train)
    X_fit_test = scaler.transform(X_transform_test)

    X_fit_train_collection[channel] = X_fit_train
    X_fit_test_collection[channel] = X_fit_test
    print("channel: ", channel)

In [None]:
# One-class SVM on the scaled MiniRocket features of a single group.
# Train and test data are both looked up by the same key so the model is never
# scored against whichever group a previous loop happened to visit last.
svm_channel = "A"
svm_X_test = X_fit_test_collection[svm_channel]
svm_y_true = y_true_label_collection[svm_channel]

clf = OneClassSVM(gamma='auto').fit(X_fit_train_collection[svm_channel])

# scikit-learn outlier detectors predict +1 for inliers and -1 for outliers,
# whereas the window labels use 1 = anomalous, 0 = normal. Map to that encoding
# before computing any metric.
y_pred = np.where(clf.predict(svm_X_test) == -1, 1, 0)

print(classification_report(
    svm_y_true, y_pred,
    labels=[0, 1], target_names=["normal", "anomaly"], zero_division=0,
))

# False discovery rate = FP / (FP + TP) = 1 - precision of the anomaly class.
# With no predicted anomalies, precision is reported as 0 (zero_division=0).
precision = precision_score(svm_y_true, y_pred, pos_label=1, zero_division=0)
print("FDR:", 1 - precision)

# Fix the label order so the matrix is always 2x2 (rows: true, cols: predicted).
c_m = confusion_matrix(svm_y_true, y_pred, labels=[0, 1])
print(c_m)
disp = ConfusionMatrixDisplay(confusion_matrix=c_m, display_labels=["normal", "anomaly"])
disp.plot(cmap='Blues')

In [None]:
# Isolation Forest on the (unscaled) MiniRocket features of a single group.
# Train and test data are both looked up by the same key so the model is never
# scored against whichever group a previous loop happened to visit last.
iforest_channel = "A"
iforest_X_test = X_transform_test_collection[iforest_channel]
iforest_y_true = y_true_label_collection[iforest_channel]

clf = IsolationForest(random_state=0).fit(X_transform_train_collection[iforest_channel])

# scikit-learn outlier detectors predict +1 for inliers and -1 for outliers,
# whereas the window labels use 1 = anomalous, 0 = normal. Map to that encoding
# before computing any metric.
y_pred = np.where(clf.predict(iforest_X_test) == -1, 1, 0)

print(classification_report(
    iforest_y_true, y_pred,
    labels=[0, 1], target_names=["normal", "anomaly"], zero_division=0,
))

# False discovery rate = FP / (FP + TP) = 1 - precision of the anomaly class.
# With no predicted anomalies, precision is reported as 0 (zero_division=0).
precision = precision_score(iforest_y_true, y_pred, pos_label=1, zero_division=0)
print("FDR:", 1 - precision)

# Fix the label order so the matrix is always 2x2 (rows: true, cols: predicted).
c_m = confusion_matrix(iforest_y_true, y_pred, labels=[0, 1])
print(c_m)
disp = ConfusionMatrixDisplay(confusion_matrix=c_m, display_labels=["normal", "anomaly"])
disp.plot(cmap='Blues')

In [None]:
#implement KNN


In [None]:
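# TODO: implement GridSearch.
# NOTE(review): the draft below is a *supervised* SVC search. It relies on `SVC`,
# `fit_X_train`, `fit_X_test` and `y_train`, none of which are imported or defined
# in the visible cells - confirm they exist before uncommenting.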

# svc = SVC(class_weight='balanced', random_state=42)
# param_grid = {
#         'kernel': ['poly'],
#         'degree': [2, 3, 4],
#         'C': [1, 10, 100, 1000],
#         'gamma': [0.0001, 0.001, 0.01, 0.1]
#         }
# grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1_weighted', cv=10, n_jobs=-1)
# grid_search.fit(fit_X_train, y_train)

# best_svc_model = grid_search.best_estimator_
# test_accuracy = best_svc_model.score(fit_X_test, y_test)
# y_pred = best_svc_model.predict(fit_X_test)


In [None]:
#implement grid search for unsupervised learning