In [1]:
#import necessary packages
from pathlib import Path
import os
import ast
import shutil
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sktime.transformations.panel.rocket import (
    MiniRocket,
    MiniRocketMultivariate,
    MiniRocketMultivariateVariable,
)

In [2]:
#collection of dataset information for each telemetry data 
training_dfs = []
test_dfs = []

training_df = []
test_df = []

file_names = []

label_df = pd.read_csv("labeled_anomalies.csv")

train_data_path = "train" 
test_data_path = "test" 

os.makedirs("training_set", exist_ok=True)
os.makedirs("test_set", exist_ok=True)

index = 0
for root, _, files in os.walk(train_data_path):
    for file in files:
        example_path = os.path.join(root, file)
        filename = Path(example_path).stem
        
        row_indices = label_df[label_df['chan_id'] == filename].index.tolist()
        if not row_indices:
            continue
        spacecraft = label_df.loc[row_indices[0], 'spacecraft']
        
        if (spacecraft == "SMAP"): 
            data = np.load(example_path)
            df = pd.DataFrame(data)
            print("(train) df: ", filename, df.shape, "index: ", index)
            df.to_csv(f"training_set/{filename}.csv", index=False)
            training_dfs.append(df) 
            index += 1

index = 0
for root, _, files in os.walk(test_data_path):
    for file in files:
        example_path = os.path.join(root, file)
        filename = Path(example_path).stem
        
        row_indices = label_df[label_df['chan_id'] == filename].index.tolist()
        if not row_indices:
            continue
        spacecraft = label_df.loc[row_indices[0], 'spacecraft']
        
        if (spacecraft == "SMAP"): 
            data = np.load(example_path)
            df = pd.DataFrame(data)
            print("(test) df: ", filename, df.shape, "index: ", index)
            df.to_csv(f"test_set/{filename}.csv", index=False)
            test_dfs.append(df) 
            file_names.append(filename)
            
            index += 1

(train) df:  D-4 (2833, 25) index:  0
(train) df:  E-12 (2880, 25) index:  1
(train) df:  D-3 (2880, 25) index:  2
(train) df:  A-6 (682, 25) index:  3
(train) df:  P-4 (2609, 25) index:  4
(train) df:  P-3 (2855, 25) index:  5
(train) df:  A-1 (2880, 25) index:  6
(train) df:  A-8 (762, 25) index:  7
(train) df:  F-2 (2861, 25) index:  8
(train) df:  E-4 (2880, 25) index:  9
(train) df:  E-3 (2880, 25) index:  10
(train) df:  T-1 (2875, 25) index:  11
(train) df:  D-11 (2611, 25) index:  12
(train) df:  G-2 (2478, 25) index:  13
(train) df:  A-9 (762, 25) index:  14
(train) df:  F-3 (2880, 25) index:  15
(train) df:  P-2 (2821, 25) index:  16
(train) df:  A-7 (2879, 25) index:  17
(train) df:  E-13 (2880, 25) index:  18
(train) df:  D-2 (2880, 25) index:  19
(train) df:  D-5 (2561, 25) index:  20
(train) df:  G-3 (2624, 25) index:  21
(train) df:  G-4 (2551, 25) index:  22
(train) df:  B-1 (2435, 25) index:  23
(train) df:  E-2 (2880, 25) index:  24
(train) df:  E-5 (2880, 25) index: 

In [3]:
def get_data():
    training_dfs = []
    test_dfs = []
    file_names = []
    label_df = pd.read_csv("labeled_anomalies.csv")
    train_data_path = "training_set" 
    test_data_path = "test_set" 
    
    index = 0
    for root, _, files in os.walk(train_data_path):
        for file in files:
            example_path = os.path.join(root, file)
            filename = Path(example_path).stem
            
            df = pd.read_csv(example_path)
            print("(train) df: ", filename, df.shape, "index: ", index)
            training_dfs.append(df) 
            index += 1
    
    index = 0
    for root, _, files in os.walk(test_data_path):
        for file in files:
            example_path = os.path.join(root, file)
            filename = Path(example_path).stem
            
            df = pd.read_csv(example_path)
            print("(test) df: ", filename, df.shape, "index: ", index)
            test_dfs.append(df) 
            file_names.append(filename) 
            index += 1
    return (training_dfs, test_dfs, label_df, file_names)

In [4]:
def create_windows(training_dfs, test_dfs, window_size, window_overlap, label_df, file_names):

    training_data = []
    test_data = []
    label_data = []
    
    for df in training_dfs:
        for i in range(0, len(df), window_overlap):
            examples = []
            if i + window_size > len(df) - 1: 
                difference = window_size - (len(df) - (i + 1))
                for j in range(window_size):
                    examples.append(df.iloc[i - difference + j].tolist())
            else: 
                for j in range(window_size):
                    examples.append(df.iloc[i + j].tolist())
            row = []
            for example in examples: 
                for e in example: 
                    row.append(e)
            training_data.append(row)

    for i in range(len(test_dfs)):
        df = test_dfs [i]
        for j in range(0, len(df), window_overlap):
            examples = []
            if j + window_size >= len(df): 
                difference = window_size - (len(df) - (j + 1))
                for k in range(window_size):
                    examples.append(df.iloc[j - difference + k].tolist())
            else: 
                for k in range(window_size):
                    examples.append(df.iloc[j + k].tolist())
            row_indices = label_df[label_df['chan_id'] == file_names[i]].index.tolist()
            if not row_indices:
                continue
            anomaly_sequence = label_df.loc[row_indices[0], 'anomaly_sequences']
            anomaly_sequence = ast.literal_eval(anomaly_sequence)
            labeled = False
            for anomalies in anomaly_sequence:
                if j >= anomalies[0] and j <= anomalies[1] and labeled == False:
                    label_data.append(1)
                    labeled = True
            if labeled == False:
                label_data.append(0)
            row = []
            for example in examples: 
                for e in example: 
                    row.append(e)
            test_data.append(row)

    train = np.array(training_data)
    test = np.array(test_data)
    label = np.array(label_data)
    return (train, test, label)

In [5]:
window_size = 60 
window_overlap = 20
training_dfs, test_dfs, label_df, file_names = get_data()
X_train, X_test, y_test = create_windows(training_dfs, test_dfs, window_size, window_overlap, label_df, file_names)

train_df = pd.DataFrame(X_train)
test_df = pd.DataFrame(X_test)
label_df = pd.DataFrame(y_test)

train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)
label_df.to_csv("label.csv", index=False)

#notes
#normalize both data 
#create the one-class SVM 

(train) df:  G-2 (2478, 25) index:  0
(train) df:  E-4 (2880, 25) index:  1
(train) df:  E-3 (2880, 25) index:  2
(train) df:  T-1 (2875, 25) index:  3
(train) df:  D-11 (2611, 25) index:  4
(train) df:  A-6 (682, 25) index:  5
(train) df:  P-4 (2609, 25) index:  6
(train) df:  E-12 (2880, 25) index:  7
(train) df:  P-3 (2855, 25) index:  8
(train) df:  A-1 (2880, 25) index:  9
(train) df:  A-8 (762, 25) index:  10
(train) df:  F-2 (2861, 25) index:  11
(train) df:  D-4 (2833, 25) index:  12
(train) df:  D-3 (2880, 25) index:  13
(train) df:  B-1 (2435, 25) index:  14
(train) df:  E-2 (2880, 25) index:  15
(train) df:  E-5 (2880, 25) index:  16
(train) df:  G-3 (2624, 25) index:  17
(train) df:  G-4 (2551, 25) index:  18
(train) df:  D-2 (2880, 25) index:  19
(train) df:  D-5 (2561, 25) index:  20
(train) df:  A-9 (762, 25) index:  21
(train) df:  F-3 (2880, 25) index:  22
(train) df:  E-13 (2880, 25) index:  23
(train) df:  P-2 (2821, 25) index:  24
(train) df:  A-7 (2879, 25) index: 

In [6]:
print("train_df:", train_df.shape)
print("test_df:", test_df.shape)
print("label_df:", label_df.shape)

train_df: (6920, 1500)
test_df: (21816, 1500)
label_df: (21816, 1)


In [None]:
minirocket = MiniRocket()  
minirocket.fit(X_train)
X_transform_train = minirocket.transform(X_train)
X_transform_test = minirocket.transform(X_test)
X_transform_train.shape



In [None]:
X_transform_train_df = pd.DataFrame(X_transform_train)
X_transform_test_df = pd.DataFrame(X_transform_test)
X_transform_train_df.to_csv("rocket_train.csv", index=False)
X_transform_test_df.to_csv("rocket_test.csv", index=False)

In [None]:
scaler = MinMaxScaler()
X_fit_train = scaler.fit_transform(X_transform_train)
X_fit_test = scaler.transform(X_transform_test)

clf = OneClassSVM(gamma='auto').fit(X_fit_train)
y_pred = clf.predict(X_fit_test)

# print("params: ", grid_search.best_params_)
# print("score: ", best_svc_model.score(fit_X_test, y_test))
print(classification_report(y_test, y_pred))
precision = precision_score(y_test, y_pred, average='weighted')
print("FDR:", 1 - precision)
c_m = confusion_matrix(y_test, y_pred)
print(c_m)
disp = ConfusionMatrixDisplay(confusion_matrix=c_m)
disp.plot(cmap='Blues')

In [None]:
param_grid = {
        'n_estimators': [100, 200, 300, 400],
        'C': [1, 10, 100, 1000],
        'gamma': [0.0001, 0.001, 0.01, 0.1]
        }

clf = IsolationForest(random_state=0).fit(X_transform_train)
y_pred = clf.predict(X_transform_test)

# print("params: ", grid_search.best_params_)
# print("score: ", best_svc_model.score(fit_X_test, y_test))
print(classification_report(y_test, y_pred))
precision = precision_score(y_test, y_pred, average='weighted')
print("FDR:", 1 - precision)
c_m = confusion_matrix(y_test, y_pred)
print(c_m)
disp = ConfusionMatrixDisplay(confusion_matrix=c_m)
disp.plot(cmap='Blues')

In [None]:
#implement KNN


In [None]:
#implement GridSearch 

# svc = SVC(class_weight='balanced', random_state=42)
# param_grid = {
#         'kernel': ['poly'],
#         'degree': [2, 3, 4],
#         'C': [1, 10, 100, 1000],
#         'gamma': [0.0001, 0.001, 0.01, 0.1]
#         }
# grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1_weighted', cv=10, n_jobs=-1)
# grid_search.fit(fit_X_train, y_train)

# best_svc_model = grid_search.best_estimator_
# test_accuracy = best_svc_model.score(fit_X_test, y_test)
# y_pred = best_svc_model.predict(fit_X_test)



In [None]:
#implement grid search for unsupervised learning
#flattening feature engineering 
#rocket multithreading 
#window prediction brainstorm