In [1]:
#import necessary packages
from pathlib import Path
import os
import ast
import shutil
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sktime.transformations.panel.rocket import (
    MiniRocket,
    MiniRocketMultivariate,
    MiniRocketMultivariateVariable,
)

In [2]:
#collection of dataset information for each telemetry data 
training_dfs = []
test_dfs = []

training_df = []
test_df = []

file_names = []

label_df = pd.read_csv("labeled_anomalies.csv")

train_data_path = "npy_train" 
test_data_path = "npy_test" 

os.makedirs("raw_train", exist_ok=True)
os.makedirs("raw_test", exist_ok=True)

index = 0
for root, _, files in os.walk(train_data_path):
    for file in files:
        example_path = os.path.join(root, file)
        filename = Path(example_path).stem
        
        row_indices = label_df[label_df['chan_id'] == filename].index.tolist()
        if not row_indices:
            continue
        spacecraft = label_df.loc[row_indices[0], 'spacecraft']
        
        if (spacecraft == "SMAP"): 
            data = np.load(example_path)
            df = pd.DataFrame(data)
            print("(train) df: ", filename, df.shape, "index: ", index)
            df.to_csv(f"raw_train/{filename}.csv", index=False)
            training_dfs.append(df) 
            index += 1

index = 0
for root, _, files in os.walk(test_data_path):
    for file in files:
        example_path = os.path.join(root, file)
        filename = Path(example_path).stem
        
        row_indices = label_df[label_df['chan_id'] == filename].index.tolist()
        if not row_indices:
            continue
        spacecraft = label_df.loc[row_indices[0], 'spacecraft']
        
        if (spacecraft == "SMAP"): 
            data = np.load(example_path)
            df = pd.DataFrame(data)
            print("(test) df: ", filename, df.shape, "index: ", index)
            df.to_csv(f"raw_test/{filename}.csv", index=False)
            test_dfs.append(df) 
            file_names.append(filename)
            
            index += 1

(train) df:  D-12 (312, 25) index:  0
(train) df:  T-2 (2855, 25) index:  1
(train) df:  E-7 (2769, 25) index:  2
(train) df:  S-1 (2818, 25) index:  3
(train) df:  E-9 (2880, 25) index:  4
(train) df:  G-1 (2820, 25) index:  5
(train) df:  G-6 (2881, 25) index:  6
(train) df:  D-7 (2583, 25) index:  7
(train) df:  R-1 (2874, 25) index:  8
(train) df:  D-9 (2583, 25) index:  9
(train) df:  A-2 (2648, 25) index:  10
(train) df:  A-5 (705, 25) index:  11
(train) df:  P-7 (2853, 25) index:  12
(train) df:  F-1 (2869, 25) index:  13
(train) df:  E-11 (2880, 25) index:  14
(train) df:  G-7 (2446, 25) index:  15
(train) df:  E-8 (2880, 25) index:  16
(train) df:  E-6 (2880, 25) index:  17
(train) df:  D-13 (1490, 25) index:  18
(train) df:  E-1 (2880, 25) index:  19
(train) df:  T-3 (2876, 25) index:  20
(train) df:  E-10 (2880, 25) index:  21
(train) df:  A-4 (2690, 25) index:  22
(train) df:  P-1 (2872, 25) index:  23
(train) df:  A-3 (2736, 25) index:  24
(train) df:  D-8 (2602, 25) index

In [3]:
def get_data():
    training_dfs = {}
    test_dfs = {}
    file_names = {"train": {}, "test": {}}
    label_df = pd.read_csv("labeled_anomalies.csv")
    train_data_path = "raw_train" 
    test_data_path = "raw_test" 
    
    for root, _, files in os.walk(train_data_path):
        for file in files:
            example_path = os.path.join(root, file)
            filename = Path(example_path).stem
            df = pd.read_csv(example_path)
            channel = filename[0]            
            if (training_dfs.get(channel) == None):
                training_dfs[channel] = []
                file_names["train"][channel] = []
                training_dfs[channel].append(df)
                file_names["train"][channel].append(filename)
            else:
                training_dfs[channel].append(df)
                file_names["train"][channel].append(filename)
    
    for root, _, files in os.walk(test_data_path):
        for file in files:
            example_path = os.path.join(root, file)
            filename = Path(example_path).stem
            df = pd.read_csv(example_path)
            channel = filename[0]
            if (test_dfs.get(channel) == None):
                test_dfs[channel] = []
                file_names["test"][channel] = []
                test_dfs[channel].append(df)
                file_names["test"][channel].append(filename)
            else:
                test_dfs[channel].append(df)
                file_names["test"][channel].append(filename)

    return (training_dfs, test_dfs, label_df, file_names)

In [4]:
def create_windows(training_dfs, test_dfs, window_size, window_overlap, label_df, file_names):
    training_data = {}
    test_data = {}
    label_data = {}
    
    for channel in training_dfs.keys():
        training_data[channel] = []
        for df in training_dfs[channel]: 
            for i in range(0, len(df), window_overlap):
                window = []
                if i + window_size > len(df): 
                    window = df.iloc[-window_size:].to_numpy().tolist()
                else: 
                    window = df.iloc[i:i + window_size].to_numpy().tolist()
                np_window = np.array(window)
                transposed_window = np_window.T
                normal_window = transposed_window.tolist() 
                training_data[channel].append(normal_window)
                
    for channel in test_dfs.keys():
        test_data[channel] = []
        label_data[channel] = []
        for i in range(len(test_dfs[channel])):
            df = test_dfs[channel][i]
            for j in range(0, len(df), window_overlap):
                window = []
                if j + window_size > len(df): 
                    window = df.iloc[-window_size:].to_numpy().tolist()
                else: 
                    window = df.iloc[j:j + window_size].to_numpy().tolist()
                row_indices = label_df[label_df["chan_id"] == file_names["test"][channel][i]].index.tolist()
                
                if not row_indices:
                    continue

                anomaly_sequence = label_df.loc[row_indices[0], 'anomaly_sequences']
                anomaly_sequence = ast.literal_eval(anomaly_sequence)
                labeled = False
                for anomalies in anomaly_sequence:
                    if (not(anomalies[1] <= j or anomalies[0] >= j + window_size)) and labeled == False:
                        label_data[channel].append(1)
                        labeled = True
                if labeled == False:
                    label_data[channel].append(0)
                np_window = np.array(window)
                transposed_window = np_window.T
                normal_window = transposed_window.tolist() 
                test_data[channel].append(normal_window)
                
    return (training_data, test_data, label_data)

In [5]:
window_size = 60 
window_overlap = 20
training_dfs, test_dfs, label_df, file_names = get_data()

X_train_collection, X_test_collection, y_test_collection = create_windows(training_dfs, test_dfs, window_size, window_overlap, label_df, file_names)

os.makedirs("training_data", exist_ok=True)
os.makedirs("testing_data", exist_ok=True)
os.makedirs("label_data", exist_ok=True)

for channel in X_train_collection: 
    X_train = np.array(X_train_collection[channel])
    print("train" ,X_train.shape)
    # train_df = pd.DataFrame(X_train)
    # train_df.to_csv(f"training_data{channel}.csv", index=False)
    
for channel in X_test_collection: 
    X_test = np.array(X_test_collection[channel])
    # test_df = pd.DataFrame(X_test)
    # test_df.to_csv(f"testing_data/{channel}.csv", index=False)
    print("test" ,X_test.shape)
for channel in y_test_collection: 
    y_test =  np.array(y_test_collection[channel])
    # true_label_df = pd.DataFrame(y_test)
    # true_label_df.to_csv(f"label_data/{channel}.csv", index=False)
    print("label" ,y_test.shape)

train (842, 25, 60)
train (703, 25, 60)
train (432, 25, 60)
train (1445, 25, 60)
train (144, 25, 60)
train (793, 25, 60)
train (431, 25, 60)
train (1867, 25, 60)
train (141, 25, 60)
train (122, 25, 60)
test (5486, 25, 60)
test (1292, 25, 60)
test (2405, 25, 60)
test (4789, 25, 60)
test (3374, 25, 60)
test (1281, 25, 60)
test (2056, 25, 60)
test (403, 25, 60)
test (363, 25, 60)
test (367, 25, 60)
label (5486,)
label (1292,)
label (2405,)
label (4789,)
label (3374,)
label (1281,)
label (2056,)
label (403,)
label (363,)
label (367,)


In [7]:
X_transform_train_collection = {}
X_transform_test_collection = {}
y_true_label_collection = {}

for channel in X_train_collection: 
    minirocket = MiniRocketMultivariate(n_jobs = 2, random_state = 42) 
    X_train = np.array(X_train_collection[channel])
    if (X_test_collection.get(channel) is None or y_test_collection.get(channel) is None):  
        continue
    X_test = np.array(X_test_collection[channel])
    minirocket.fit(X_train)
    X_transform_train = minirocket.transform(X_train)
    X_transform_test = minirocket.transform(X_test)
    X_transform_train_collection[channel] = X_transform_train
    X_transform_test_collection[channel] = X_transform_test
    print("channel: ", channel)
    print("(transformed train): " ,X_transform_train.shape)
    print("(transformed test): " ,X_transform_test.shape)

for channel in y_test_collection: 
    y_test =  np.array(y_test_collection[channel])
    y_true_label_collection[channel] = y_test
    print("channel: ", channel)
    print("label" ,y_test.shape)

channel:  A
(transformed train):  (842, 9996)
(transformed test):  (3374, 9996)
channel:  P
(transformed train):  (703, 9996)
(transformed test):  (2056, 9996)
channel:  F
(transformed train):  (432, 9996)
(transformed test):  (1281, 9996)
channel:  D
(transformed train):  (1445, 9996)
(transformed test):  (4789, 9996)
channel:  R
(transformed train):  (144, 9996)
(transformed test):  (363, 9996)
channel:  G
(transformed train):  (793, 9996)
(transformed test):  (2405, 9996)
channel:  T
(transformed train):  (431, 9996)
(transformed test):  (1292, 9996)
channel:  E
(transformed train):  (1867, 9996)
(transformed test):  (5486, 9996)
channel:  S
(transformed train):  (141, 9996)
(transformed test):  (367, 9996)
channel:  B
(transformed train):  (122, 9996)
(transformed test):  (403, 9996)
channel:  E
label (5486,)
channel:  T
label (1292,)
channel:  G
label (2405,)
channel:  D
label (4789,)
channel:  A
label (3374,)
channel:  F
label (1281,)
channel:  P
label (2056,)
channel:  B
label (

In [None]:
X_transform_train_df = pd.DataFrame(X_transform_train)
X_transform_test_df = pd.DataFrame(X_transform_test)
X_transform_train_df.to_csv("rocket_train.csv", index=False)
X_transform_test_df.to_csv("rocket_test.csv", index=False)

In [None]:
scaler = MinMaxScaler()
X_fit_train = scaler.fit_transform(X_transform_train)
X_fit_test = scaler.transform(X_transform_test)

clf = OneClassSVM(gamma='auto').fit(X_fit_train)
y_pred = clf.predict(X_fit_test)

# print("params: ", grid_search.best_params_)
# print("score: ", best_svc_model.score(fit_X_test, y_test))
print(classification_report(y_test, y_pred))
precision = precision_score(y_test, y_pred, average='weighted')
print("FDR:", 1 - precision)
c_m = confusion_matrix(y_test, y_pred)
print(c_m)
disp = ConfusionMatrixDisplay(confusion_matrix=c_m)
disp.plot(cmap='Blues')

In [None]:
param_grid = {
        'n_estimators': [100, 200, 300, 400],
        'C': [1, 10, 100, 1000],
        'gamma': [0.0001, 0.001, 0.01, 0.1]
        }

clf = IsolationForest(random_state=0).fit(X_transform_train)
y_pred = clf.predict(X_transform_test)

# print("params: ", grid_search.best_params_)
# print("score: ", best_svc_model.score(fit_X_test, y_test))
print(classification_report(y_test, y_pred))
precision = precision_score(y_test, y_pred, average='weighted')
print("FDR:", 1 - precision)
c_m = confusion_matrix(y_test, y_pred)
print(c_m)
disp = ConfusionMatrixDisplay(confusion_matrix=c_m)
disp.plot(cmap='Blues')

In [16]:
#implement KNN


In [None]:
#implement GridSearch 

# svc = SVC(class_weight='balanced', random_state=42)
# param_grid = {
#         'kernel': ['poly'],
#         'degree': [2, 3, 4],
#         'C': [1, 10, 100, 1000],
#         'gamma': [0.0001, 0.001, 0.01, 0.1]
#         }
# grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1_weighted', cv=10, n_jobs=-1)
# grid_search.fit(fit_X_train, y_train)

# best_svc_model = grid_search.best_estimator_
# test_accuracy = best_svc_model.score(fit_X_test, y_test)
# y_pred = best_svc_model.predict(fit_X_test)



In [None]:
#implement grid search for unsupervised learning
#flattening feature engineering 
#rocket multithreading 
#window prediction brainstorm