In [55]:
from scipy.io import loadmat
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split

# Pytorch
import torch
from torch import nn
from torch.nn import functional as F
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
from torch.nn.modules.loss import CrossEntropyLoss

import nn_model
from train_helper import get_dataloader, fit, validate

In [29]:
def remove_dic_items(dic):
    '''
    Remove redundant data in the dictionary returned by matfile_to_dic inplace.
    '''
    # For each file in the dictionary, delete the redundant key-value pairs
    for _, values in dic.items():
        del values['__header__']
        del values['__version__']    
        del values['__globals__']

def rename_keys(dic):
    '''
    Rename some keys so that they can be loaded into a 
    DataFrame with consistent column names
    '''
    # For each file in the dictionary
    for _,v1 in dic.items():
        # For each key-value pair, rename the following keys 
        for k2,_ in list(v1.items()):
            if 'DE_time' in k2:
                v1['DE_time'] = v1.pop(k2)
            elif 'BA_time' in k2:
                v1['BA_time'] = v1.pop(k2)
            elif 'FE_time' in k2:
                v1['FE_time'] = v1.pop(k2)
            elif 'RPM' in k2:
                v1['RPM'] = v1.pop(k2)

def label(filename):
    '''
    Function to create label for each signal based on the filename. Apply this
    to the "filename" column of the DataFrame.
    Usage:
        df['label'] = df['filename'].apply(label)
    '''
    if 'B' in filename:
        return 'B'
    elif 'IR' in filename:
        return 'IR'
    elif 'OR' in filename:
        return 'OR'
    elif 'Normal' in filename:
        return 'N'

def divide_signal(df, segment_length):
    '''
    This function divide the signal into segments, each with a specific number 
    of points as defined by segment_length. Each segment will be added as an 
    example (a row) in the returned DataFrame. Thus it increases the number of 
    training examples. The remaining points which are less than segment_length 
    are discarded.
    
    Parameter:
        df: 
            DataFrame returned by matfile_to_df()
        segment_length: 
            Number of points per segment.
    Return:
        DataFrame with segmented signals and their corresponding filename and 
        label
    '''
    dic = {}
    idx = 0
    for i in range(df.shape[0]):
        n_sample_points = len(df.iloc[i,1])
        n_segments = n_sample_points // segment_length
        for segment in range(n_segments):
            dic[idx] = {
                'signal': df.iloc[i,1][segment_length * segment:segment_length * (segment+1)], 
                'label': df.iloc[i,2],
                'filename' : df.iloc[i,0]
            }
            idx += 1
    df_tmp = pd.DataFrame.from_dict(dic,orient='index')
    df_output = pd.concat(
        [df_tmp[['label', 'filename']], 
         pd.DataFrame(np.hstack(df_tmp["signal"].values).T)
        ], 
        axis=1 )
    return df_output

def matfile_to_dic(folder_path):
    '''
    Read all the matlab files of the CWRU Bearing Dataset and return a 
    dictionary. The key of each item is the filename and the value is the data 
    of one matlab file, which also has key value pairs.
    
    Parameter:
        folder_path: 
            Path (Path object) of the folder which contains the matlab files.
    Return:
        output_dic: 
            Dictionary which contains data of all files in the folder_path.
    '''
    output_dic = {}
    for _, filepath in enumerate(folder_path.glob('*.mat')):
        # strip the folder path and get the filename only.
        key_name = str(filepath).split('\\')[-1]
        output_dic[key_name] = loadmat(filepath)
    return output_dic

def matfile_to_df(folder_path):
    '''
    Read all the matlab files in the folder, preprocess, and return a DataFrame
    
    Parameter:
        folder_path: 
            Path (Path object) of the folder which contains the matlab files.
    Return:
        DataFrame with preprocessed data
    '''
    dic = matfile_to_dic(folder_path)
    remove_dic_items(dic)
    rename_keys(dic)
    df = pd.DataFrame.from_dict(dic).T
    df = df.reset_index().rename(mapper={'index':'filename'},axis=1)
    df['label'] = df['filename'].apply(label)
    return df.drop(['BA_time','FE_time', 'RPM', 'ans'], axis=1, errors='ignore')

def get_df_all(data_path, segment_length=512, normalize=False):
    '''
    Load, preprocess and return a DataFrame which contains all signals data and
    labels and is ready to be used for model training.
    
    Parameter:
        normal_path: 
            Path of the folder which contains matlab files of normal bearings
        DE_path: 
            Path of the folder which contains matlab files of DE faulty bearings
        segment_length: 
            Number of points per segment. See divide_signal() function
        normalize: 
            Boolean to perform normalization to the signal data
    Return:
        df_all: 
            DataFrame which is ready to be used for model training.
    '''
    df = matfile_to_df(data_path) 

    # if normalize:
    #     normalize_signal(df)
    df_processed = divide_signal(df, segment_length)

    map_label = {'N':0, 'B':1, 'IR':2, 'OR':3}
    df_processed['label'] = df_processed['label'].map(map_label)
    return df_processed

In [41]:
DATA_PATH = Path("./12k_DE")

df_all = get_df_all(DATA_PATH, segment_length=500, normalize=True)
features = df_all.columns[2:]
target = 'label'

In [42]:
print(df_all['label'].shape)
df_all.sample(100)

(17987,)


Unnamed: 0,label,filename,0,1,2,3,4,5,6,7,...,490,491,492,493,494,495,496,497,498,499
15427,1,3008_B028_3.mat,-0.028483,-0.929360,0.043131,0.848793,-0.029297,-0.858559,-0.238443,1.090492,...,-0.049642,0.004069,-0.638833,-1.083168,0.559081,1.649573,0.100097,-1.966955,0.465494,1.784663
4335,3,145_OR007@3_1.mat,0.622127,-0.372789,-0.945372,-0.009340,1.145168,0.596949,-0.819891,-0.982326,...,-0.043451,-0.459285,-0.148222,0.369540,0.300099,-0.217257,-0.332992,0.077563,0.276140,-0.045482
16393,0,98_Normal_1.mat,0.061124,0.031292,-0.028789,-0.093042,-0.110149,-0.079482,-0.017941,0.033796,...,-0.021487,0.039220,0.072598,0.073224,0.049025,0.031084,0.049650,0.061124,0.042349,-0.002712
14135,2,3003_IR028_2.mat,1.273190,-1.451412,-1.494544,1.114092,1.837154,-0.373941,-1.694738,-0.262044,...,-1.006671,-1.676428,-0.619709,0.928953,0.747068,-0.312092,-0.255126,0.417073,0.513915,0.071208
13987,2,3002_IR028_1.mat,0.413410,2.036535,2.049148,0.151367,-0.214029,0.319010,-0.354410,-0.753172,...,-0.484211,0.551756,1.102292,0.887856,0.572101,0.519204,0.402017,-0.345458,-0.713296,0.273030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7667,1,188_B014_3.mat,0.091126,0.041421,-0.073421,0.106557,0.274028,0.086091,0.041096,0.262495,...,-0.029076,0.027776,0.070822,0.009909,-0.037685,-0.065136,-0.075207,-0.007147,0.156912,0.230008
2432,1,120_B007_2.mat,-0.314474,0.121989,0.391469,-0.017056,-0.199308,0.225622,0.295145,-0.212952,...,-0.322759,-0.025015,0.253561,0.041259,-0.179004,-0.004223,0.060426,0.060426,0.102172,-0.012508
5570,3,159_OR007@12_2.mat,-0.027289,0.019330,0.053928,0.033787,-0.117603,0.157562,0.422494,-0.211491,...,0.111106,0.034599,-0.061076,0.051817,0.106233,-0.034111,-0.079106,0.078456,0.070984,0.016081
1795,2,108_IR007_3.mat,-0.047269,-0.011533,0.003736,0.013807,0.054416,0.147978,0.254536,0.125562,...,-0.532462,-0.500300,0.181278,0.006822,0.135958,0.799993,0.220424,-0.237480,0.355733,0.163247


In [48]:
## Split the data into train and validation set
random_seed = 42
bs = 66
lr = 0.001
wd = 1e-5
betas=(0.99, 0.999)
X_train, X_valid, y_train, y_valid = train_test_split(df_all[features], 
                                                      df_all[target], 
                                                      test_size=0.20, random_state=random_seed, shuffle=True
                                                     )

In [49]:
# Functions for training
def get_dataloader(train_ds, valid_ds, bs):
    '''
        Get dataloaders of the training and validation set.

        Parameter:
            train_ds: Dataset
                Training set
            valid_ds: Dataset
                Validation set
            bs: Int
                Batch size
        
        Return:
            (train_dl, valid_dl): Tuple of DataLoader
                Dataloaders of training and validation set.
    '''
    return (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        DataLoader(valid_ds, batch_size=bs * 2),
    )

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
## Create DataLoader of train and validation set
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_valid = torch.tensor(y_valid.values, dtype=torch.long)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)
train_dl, valid_dl = get_dataloader(train_ds, valid_ds, bs)

In [52]:
## Instantiate model, optimizer and loss function
model = nn_model.CNN_1D_2L(len(features))
model.to(device)
opt = optim.Adam(model.parameters(), lr=lr, betas=betas, weight_decay=wd)
loss_func = CrossEntropyLoss()

In [62]:
%%time
## Train
epochs = 1
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
0 	 0.00110 	 0.05515 	 0.00000 	0.98110 	
CPU times: total: 3min 11s
Wall time: 1min 6s


In [None]:
m=nn.BatchNorm1d(10)

input=torch.randn(20,100)
print(input.shape)

output = m(input)
print(output.shape)