In [241]:
import numpy as np
import pandas as pd
import scipy.io
import os
from glob import glob
import matplotlib.pyplot as plt
from scipy.signal import spectrogram
import tqdm

In [242]:
def rename_keys(dic):
    '''
    Give the signal entries of every file the same key names, in place, so
    that the files can be loaded into a DataFrame with consistent columns.

    Parameter:
        dic:
            Dictionary whose values are themselves dictionaries (one per
            file). Any inner key that contains 'DE_time', 'BA_time',
            'FE_time' or 'RPM' as a substring is replaced by exactly that
            name; all other keys are left alone.
    Return:
        None. The inner dictionaries are modified in place.
    '''
    canonical_names = ('DE_time', 'BA_time', 'FE_time', 'RPM')
    for file_data in dic.values():
        # Walk over a snapshot of the keys: the dictionary is mutated below.
        for original_key in list(file_data.keys()):
            # The first canonical name found in the key wins, in the order
            # given by `canonical_names`.
            target = next(
                (name for name in canonical_names if name in original_key),
                None,
            )
            if target is not None:
                file_data[target] = file_data.pop(original_key)

In [243]:
def remove_dic_items(dic):
    '''
    Delete the '__header__', '__version__' and '__globals__' entries from
    every per-file dictionary returned by matfile_to_dic, in place.

    Parameter:
        dic:
            Dictionary whose values are per-file dictionaries. Each of them
            must contain all three entries; a missing one raises KeyError.
    Return:
        None. The inner dictionaries are modified in place.
    '''
    redundant_keys = ('__header__', '__version__', '__globals__')
    for file_data in dic.values():
        for key in redundant_keys:
            del file_data[key]


In [244]:
def matfile_to_dic(folder_path):
    '''
    Read all the matlab files of the CWRU Bearing Dataset and return a 
    dictionary. The key of each item is the filename and the value is the data 
    of one matlab file, which also has key value pairs.
    
    Parameter:
        folder_path: 
            Path (string or Path object) of the folder which contains the
            matlab files. Only files matching '*.mat' directly inside this
            folder are read.
    Return:
        output_dic: 
            Dictionary which contains data of all files in the folder_path,
            keyed by the bare file name (no directory part) and filled in
            alphabetical order of the file paths.
    '''
    output_dic = {}
    # glob gives no ordering guarantee; sorting keeps the result (and every
    # DataFrame built from it) reproducible across machines.
    for filepath in sorted(glob(os.path.join(folder_path, '*.mat'))):
        # Strip the folder path with the platform-aware helper. Splitting on
        # a literal backslash only works on Windows and leaves the whole
        # path as the key everywhere else.
        key_name = os.path.basename(filepath)
        output_dic[key_name] = scipy.io.loadmat(filepath)
    return output_dic

In [245]:
def label(filename):
    '''
    Function to create label for each signal based on the filename. Apply this
    to the "filename" column of the DataFrame.

    Only the last path component is inspected, so letters that happen to
    appear in a parent directory never influence the label. Both '/' and
    '\\' are treated as path separators.

    Usage:
        df['label'] = df['filename'].apply(label)
    Parameter:
        filename:
            File name or path of one recording.
    Return:
        1 if the file name contains 'B', else 2 if it contains 'IR', else 3
        if it contains 'OR', else 0 if it contains 'Normal'. None when none
        of these markers is present.
    '''
    # Reduce to the bare file name without depending on the host platform's
    # separator convention.
    base_name = str(filename).replace('\\', '/').rsplit('/', 1)[-1]
    if 'B' in base_name:
        return 1
    elif 'IR' in base_name:
        return 2
    elif 'OR' in base_name:
        return 3
    elif 'Normal' in base_name:
        return 0
    # Unrecognised file name: no label can be derived.
    return None

In [246]:
def matfile_to_df(folder_path):
    '''
    Load every matlab file of a folder into one DataFrame with one row per
    file.

    The files are read with matfile_to_dic, cleaned with remove_dic_items,
    and their keys are unified with rename_keys. A 'filename' column and a
    'label' column (computed by label from the filename) are added, and the
    'BA_time', 'FE_time', 'RPM' and 'ans' columns are dropped when present.

    Parameter:
        folder_path: 
            Path (Path object) of the folder which contains the matlab files.
    Return:
        DataFrame with preprocessed data
    '''
    per_file = matfile_to_dic(folder_path)
    remove_dic_items(per_file)
    rename_keys(per_file)

    frame = (
        pd.DataFrame.from_dict(per_file)
        .T  # from_dict puts the files in columns; make them rows instead
        .reset_index()
        .rename(columns={'index': 'filename'})
    )
    frame['label'] = frame['filename'].apply(label)

    unused_columns = ['BA_time', 'FE_time', 'RPM', 'ans']
    return frame.drop(columns=unused_columns, errors='ignore')

In [247]:
# Folders holding the raw CWRU .mat recordings.
datapath_de12 = "/data/home/jkataok1/DA_DFD/data/raw/CWRU/12k_DE"
datapath_fe12 = "/data/home/jkataok1/DA_DFD/data/raw/CWRU/12k_FE"
normalpath = "/data/home/jkataok1/DA_DFD/data/raw/CWRU/Normal"
de12 = matfile_to_df(datapath_de12)
fe12 = matfile_to_df(datapath_fe12)
normal = matfile_to_df(normalpath)
# Remove the extra "i" column from the 12k_FE frame when it is present.
# errors="ignore" mirrors the tolerant drop inside matfile_to_df, so the cell
# does not crash on data that lacks the column, and the non-inplace form keeps
# the cell safe to re-run.
fe12 = fe12.drop(columns=["i"], errors="ignore")
df = pd.concat([de12, fe12, normal], axis=0)
temp = df["filename"].tolist()
# The condition id is the integer between the last '_' and the extension,
# e.g. '<name>_2.mat' -> 2.
condition = [int(name.split('/')[-1].split('_')[-1].split(".")[0]) for name in temp]
df["condition"] = condition

In [248]:
val_list = df["DE_time"].to_list()
label_list = df["label"].to_list()
cond_list = df["condition"].to_list()
segment_length = 2048
frequency = 1
nperseg = 128
x_out = []
y_out = []
cond_out = []
# Iterate over each signal and split it into segments of length
# `segment_length`; every segment becomes one spectrogram sample.
# The loop variables are deliberately NOT called `label` / `cond`: `label` is
# the labelling function used by matfile_to_df (rebinding it to an int breaks
# any later re-run of the loading cell) and `cond` is the result array below.
for signal, signal_label, signal_cond in tqdm.tqdm(
        zip(val_list, label_list, cond_list), total=len(val_list)):
    n_segments = len(signal) // segment_length
    # Drop the trailing samples that do not fill a whole segment and lay the
    # rest out as (n_segments, segment_length). A plain reshape also copes
    # with n_segments == 0 (signal shorter than one segment): the signal is
    # then skipped instead of raising.
    segments = np.asarray(signal[:n_segments * segment_length]).reshape(
        n_segments, segment_length)
    for segment in segments:
        f, t, sxx = spectrogram(segment, fs=frequency, nperseg=nperseg)
        # Add a leading axis of size 1 in front of the spectrogram.
        x_out.append(np.expand_dims(sxx, 0))
        y_out.append(signal_label)
        cond_out.append(signal_cond)
x = np.stack(x_out, axis=0)
y = np.stack(y_out, axis=0)
y = np.expand_dims(y, 1)
cond = np.stack(cond_out, axis=0)


109it [00:01, 58.93it/s]


In [255]:
def normalize(arr):
    '''
    Compress the dynamic range of the input with log(1 + arr), element-wise.

    np.log1p is used instead of np.log(arr + 1) because the latter rounds
    1 + arr to exactly 1 for very small entries and returns 0 for them,
    while log1p stays accurate.

    Parameter:
        arr:
            Array-like of values greater than -1.
    Return:
        Array of the same shape as arr containing log(1 + arr).
    '''
    return np.log1p(arr)
def _save_condition_split(condition_id):
    '''
    Select the samples of one condition, normalize the inputs and save the
    pair to '<condition_id>_spectrogram.npz' in the processed CWRU folder.

    Reads the notebook-level arrays x, y and cond.

    Parameter:
        condition_id:
            Value of cond that selects the samples to save.
    Return:
        Tuple (x_split, y_split) holding the arrays that were written.
    '''
    out_path = f"/data/home/jkataok1/DA_DFD/data/processed/CWRU/{condition_id}_spectrogram.npz"
    # np.savez does not create missing folders; make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    mask = cond == condition_id
    x_split, y_split = normalize(x[mask]), y[mask]
    np.savez(out_path, x=x_split, y=y_split)
    return x_split, y_split


x0, y0 = _save_condition_split(0)
x1, y1 = _save_condition_split(1)
x2, y2 = _save_condition_split(2)
x3, y3 = _save_condition_split(3)

In [257]:
# Sanity check: reload one saved split and look at the largest input value.
# The context manager closes the .npz archive once the value has been read.
with np.load("/data/home/jkataok1/DA_DFD/data/processed/CWRU/1_spectrogram.npz") as saved_split:
    x1_saved_max = saved_split['x'].max()
x1_saved_max

6.901360037335617