# Notebook for final evaluation on the test data set
-------------------
This notebook is meant to evaluate the final model of your group on the hold-out test set.

Fill in the notebook with the data preparation steps needed to generate the features for your model and apply your model to the test data.

## Load Data

################################
**Do not change the code in this section!**
################################

In this section, the metadata file is loaded into a dictionary and the true label of the cylinder bottoms is extracted. The metadata file will be located in "data/cylinder_bottom_test/meta_data.json".

In [1]:
import joblib
import pandas as pd
import json
from sklearn.metrics import f1_score

# load the json file with metadata
# (path is relative, so it is resolved against the notebook's working directory)
metadata_file = 'data/cylinder_bottom_test/meta_data.json'
# NOTE(review): the handle `f` is never closed in this cell
f = open(metadata_file)
meta_dict = json.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'data/cylinder_bottom_test/meta_data.json'

In [None]:
def assign_label(part_anomaly):
    """
    Map an anomaly value to a binary label.

    :param part_anomaly: numeric anomaly value of a part
    :return: 1 if part_anomaly is greater than or equal to 1, otherwise 0
    """
    if part_anomaly >= 1:
        label = 1
    else:
        label = 0
    return label

def get_anomalies(metadata):
    """
    Extract the binary ground-truth label of the milling step for every part.

    :param metadata: dictionary imported from meta data file
    :return: pandas Series (index reset to 0..n-1) holding one binary anomaly
             label per 'cnc_milling_machine' entry, in the order of the metadata
    """
    # transform dict into pd dataframe and filter only the milling events
    # (one row per 'process_data' entry; 'part_type' and 'part_id' are carried along)
    metadata_process = pd.json_normalize(metadata, 'process_data', ['part_type', 'part_id'])
    metadata_process_milling = metadata_process[metadata_process['name']=='cnc_milling_machine']

    # extract anomaly information
    anomalies = metadata_process_milling['anomaly']
    # drop the filtered index so the labels are numbered consecutively
    anomalies = anomalies.reset_index(drop=True)
    anomalies = anomalies.astype(int)
    # transformation of anomaly information to a binary label
    anomalies = anomalies.apply(assign_label)
    return anomalies

# extract the true labels from the metadata file
# (one binary label per milling entry: 1 = anomaly value >= 1, 0 = otherwise)
y_true = get_anomalies(meta_dict)

## Data Preparation

In this section, you should perform all necessary steps of data preparation. The output of this section shall be a matrix X containing all the features for the ML model.

The folders with the recorded data are stored in the path "/data/cylinder_bottom_test/cnc_milling_machine/process_data/" and are also described in the metadata file loaded in the section above.

In [None]:
import numpy as np
import scipy
import os
import h5py
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib

In [None]:
## Timeseries Features
class TimeFeatures:
    """Scalar time-domain features of a single signal.

    All features are static methods, so they can be called on the class
    (``TimeFeatures.rms(x)``) as well as on an instance. ``x`` is a numeric
    1-D ``numpy.ndarray`` or ``pandas.Series``; every method returns one
    scalar.

    Plain mean, max, min and standard deviation (#1, #2, #3 and #8 of the
    original feature list) have no wrapper here.
    """

    def __init__(self) -> None:
        pass

    # 4 root mean square
    @staticmethod
    def rms(x):
        """Return sqrt(mean(x**2))."""
        return np.sqrt(np.mean(x**2))

    # 5 mean of the absolute values
    @staticmethod
    def abs_mean(x):
        """Return mean(|x|)."""
        return np.mean(np.abs(x))

    # 6 kurtosis
    @staticmethod
    def kurtosis(x):
        """Return ``scipy.stats.kurtosis(x)`` with scipy's default arguments."""
        return scipy.stats.kurtosis(x)

    # 7 skewness
    @staticmethod
    def skew(x):
        """Return ``scipy.stats.skew(x)`` with scipy's default arguments."""
        return scipy.stats.skew(x)

    # 9 coefficient of variation
    @staticmethod
    def coef_var(x):
        """Return ``scipy.stats.variation(x)`` with scipy's default arguments."""
        return scipy.stats.variation(x)

    # 10 zero-to-peak value (Barsczc2019)
    @staticmethod
    def zptp(x):
        """Return half of the peak-to-peak range, ``np.ptp(x) / 2``."""
        return np.ptp(x) / 2

    # 11 crest factor
    @staticmethod
    def crest(x):
        """Return the zero-to-peak value divided by the RMS."""
        return TimeFeatures.zptp(x) / TimeFeatures.rms(x)

    # 12 impulse factor (Ahmed2020/Wang2019b)
    @staticmethod
    def impulse_factor(x):
        """Return max(x) divided by mean(|x|).

        Uses ``np.abs`` rather than the pandas-only ``.abs()`` method so the
        feature also works on plain numpy arrays, like its siblings.
        """
        return x.max() / np.mean(np.abs(x))

    # 13 margin factor (Ahmed2020)
    @staticmethod
    def margin_factor(x):
        """Return the zero-to-peak value divided by mean(sqrt(|x|))**2."""
        peak = np.ptp(x) / 2
        return peak / (np.mean(np.sqrt(np.abs(x))) ** 2)

    # 14 shape factor (Ahmed2020/Wang2019b)
    @staticmethod
    def shape_factor(x):
        """Return the RMS divided by mean(|x|)."""
        return TimeFeatures.rms(x) / TimeFeatures.abs_mean(x)

    # 15 clearance factor (Ahmed2020/Wang2019b)
    @staticmethod
    def clearance_factor(x):
        """Return max(x) divided by mean(sqrt(|x|))**2."""
        return x.max() / (np.mean(np.sqrt(np.abs(x))) ** 2)


In [None]:
def read_sensor(obj_features, data_paths):
    """Add the time-series features of one HDF5 recording to ``obj_features``.

    The file at ``data_paths`` is read from its ``"data"`` dataset, with the
    column names taken from the dataset's ``"column_names"`` attribute. For
    every column except ``"timestamp"`` a fixed set of scalar features is
    stored under the key ``<side>_<column>_<feature>``, where ``<side>`` is
    the part of the file name before its first underscore.

    :param obj_features: dictionary that receives the features (modified in place)
    :param data_paths: '/'-separated path of a single .h5 file
    :return: the same ``obj_features`` dictionary
    """
    with h5py.File(data_paths, 'r') as h5_handle:
        dataset = h5_handle["data"]
        # materialise the frame while the file is still open
        signals = pd.DataFrame(dataset, columns=dataset.attrs["column_names"])

    # prefix (file name up to the first '_') keeps recordings from different
    # files from overwriting each other's keys
    side = data_paths.split('/')[-1].split('_')[0]

    # (key suffix, feature function) pairs; the order fixes the key order
    extractors = (
        ("mean", lambda series: series.mean()),
        ("rms", TimeFeatures.rms),
        ("abs_mean", TimeFeatures.abs_mean),
        ("kurtosis", TimeFeatures.kurtosis),
        ("skew", TimeFeatures.skew),
        ("coef_var", TimeFeatures.coef_var),
        ("zptp", TimeFeatures.zptp),
        ("crest", TimeFeatures.crest),
        ("impulse_factor", TimeFeatures.impulse_factor),
        ("margin_factor", TimeFeatures.margin_factor),
        ("shape_factor", TimeFeatures.shape_factor),
        ("clearance_factor", TimeFeatures.clearance_factor),
    )

    for column in signals.columns:
        if column == "timestamp":
            continue
        for suffix, extract in extractors:
            obj_features[side + "_" + column + "_" + suffix] = extract(signals[column])

    return obj_features


In [None]:
def read_data(data, process_data_dir="/data/cylinder_bottom_test/cnc_milling_machine/process_data/"):
    """Build the feature table with one row per part of the metadata.

    For every part, all ``data_paths`` of all ``process_data`` entries are
    scanned. Paths whose file name is one of the four milling recordings are
    re-rooted to ``<process_data_dir>/<part_id>/<file_name>`` (``part_id`` is
    the parent folder in the listed path) and passed to ``read_sensor``.
    Files that do not exist are reported on stdout and skipped.

    :param data: iterable of part dictionaries loaded from the metadata file
    :param process_data_dir: folder that contains one sub-folder per part;
        the default is the location used by the original notebook
    :return: DataFrame with an ``anomaly`` column plus all extracted features,
        indexed 0..n-1 in the order of ``data``
    """
    wanted_files = {
        'frontside_internal_machine_signals.h5',
        'frontside_external_sensor_signals.h5',
        'backside_external_sensor_signals.h5',
        'backside_internal_machine_signals.h5',
    }

    # normalise the prefix so it can be joined with '/'; read_sensor splits the
    # path on '/', so the separator must not be platform dependent
    if process_data_dir and not process_data_dir.endswith('/'):
        prefix = process_data_dir + '/'
    else:
        prefix = process_data_dir

    # collect one dict per part and build the frame once at the end
    # (growing a DataFrame with concat in the loop is quadratic and leaves
    # every row with index 0)
    records = []
    for obj in data:
        # The label is read from the second process_data entry.
        # NOTE(review): get_anomalies selects the entry by
        # name == 'cnc_milling_machine' instead - confirm index 1 is always milling.
        anomaly = obj["process_data"][1]["anomaly"]
        obj_features = {"anomaly": int(anomaly)}

        for process_data in obj["process_data"]:
            for listed_path in process_data["data_paths"]:
                path_parts = listed_path.split('/')
                file_name = path_parts[-1]
                # ignore everything that is not a milling recording (e.g. saw files)
                if file_name not in wanted_files:
                    continue

                part_id = path_parts[-2]
                sensor_path = prefix + part_id + "/" + file_name

                if os.path.exists(sensor_path):
                    # read_sensor opens and closes the file itself
                    obj_features = read_sensor(obj_features, sensor_path)
                else:
                    print("file does not exist: ")
                    print(sensor_path)

        records.append(obj_features)

    features = pd.DataFrame(records)

    print('Done ..')
    return features


In [None]:
def preprocess_data(test_df):
    """Scale the test features with the stored scaler.

    Both 'imputer.joblib' and 'scalar.joblib' are loaded from the working
    directory, but only the scaler is applied; imputation is currently
    disabled.

    :param test_df: DataFrame holding the selected feature columns
    :return: result of ``scaler.transform(test_df)``
    """
    # the imputer is still loaded (a missing file fails here), but not applied
    imputer = joblib.load('imputer.joblib')
    fitted_scaler = joblib.load('scalar.joblib')

    # overwrite the scaler's stored feature names with the test columns
    # NOTE(review): this bypasses any name/order comparison against the
    # columns used at fit time - confirm the column order matches training
    fitted_scaler.feature_names_in_ = list(test_df.columns)

    return fitted_scaler.transform(test_df)
    
    

In [None]:
# Extract features: one row per part in the metadata, one column per
# <side>_<signal>_<feature> combination (plus the 'anomaly' column)
features_test = read_data(meta_dict)

In [None]:
# Load the names of the selected features (column "column_names" of the CSV)
selected_features = pd.read_csv("selected_features_names.csv")["column_names"].tolist()

# Fail early with a readable message if feature extraction did not produce
# every selected column (e.g. because sensor files were not found)
missing_features = [name for name in selected_features if name not in features_test.columns]
if missing_features:
    raise KeyError(f"Selected features missing from the extracted features: {missing_features}")

# Keep only the selected features, in the order listed in the CSV
test_df = features_test[selected_features]

In [None]:

# final output: feature matrix X
# (the selected feature columns after scaling with the stored scaler)
X = preprocess_data(test_df)

## Testing the Model

Import your machine learning model. In this notebook, you should not perform any training! This notebook is only used for the evaluation of the model on unseen test data.

To export and import your already generated model, you can use the dump() and load() functions from the *joblib* library.

In [None]:
# load model
model = joblib.load("random_forest_classifier.joblib") # trained model exported with joblib; no training happens in this notebook

# predict the anomaly label for the test data
y_pred = model.predict(X)

## Result

################################
**Do not change the code in this section!**
################################

The evaluation is done using the f1-score.

In [None]:
# f1-score of the predictions against the ground truth, scaled by 100 to report a percentage
score = 100 * f1_score(y_true, y_pred)

print('The f1-score of the model on the test set is: {:.2f}%'.format(score))