In [1]:
import glob
import os

import pandas as pd
import scipy.io

import numpy as np

In [14]:
# Read the two parquet shards and stitch them back into a single frame.
# The shards keep their own row labels, so the combined index is whatever
# the two files carried (no re-indexing is done here).
cwruData0, cwruData1 = map(pd.read_parquet, ('data/cwru0.parquet', 'data/cwru1.parquet'))

cwruData = pd.concat([cwruData0, cwruData1])

Unnamed: 0,condition,faultyBearingPosition,sampleRate,motorLoad,faultDiameter,relativeFaultPosition,fileName,rpm,fanEndData,baseData,driveEndData
0,Normal Baseline,,12000,0,,,97.mat,1796.0,"[0.14566727272727273, 0.09779636363636364, 0.0...",,"[0.05319692307692307, 0.08866153846153846, 0.0..."
1,Normal Baseline,,12000,1,,,98.mat,,"[0.023216363636363632, 0.08115454545454545, 0....",,"[0.046104, -0.03713353846153846, -0.0894959999..."
2,Normal Baseline,,12000,2,,,99.mat,,"[0.038625454545454546, 0.0967690909090909, 0.1...",,"[0.06425353846153846, 0.06300184615384616, -0...."
3,Normal Baseline,,12000,3,,,100.mat,1725.0,"[0.19292181818181817, 0.16436363636363635, 0.0...",,"[0.014603076923076923, 0.05444861538461539, 0...."
4,Inner Race Fault,drive end,12000,0,0.007,,105.mat,1797.0,"[-0.40207454545454546, -0.004725454545454545, ...","[0.06466148367952523, -0.023096261127596444, -...","[-0.08300435129740519, -0.19573433133732535, 0..."
...,...,...,...,...,...,...,...,...,...,...,...
156,Outer Race Fault,fan end,12000,3,0.021,orthogonal,318.mat,1728.0,"[-0.11007844155844157, -0.11284259740259742, 0...","[-0.2252891394658754, 0.06554670623145402, 0.1...","[0.09551185628742516, -0.00462940119760479, -0..."
157,Outer Race Fault,fan end,12000,0,0.007,opposite,302.mat,1797.0,"[0.028528831168831172, 0.08485498501498502, -0...","[0.06458100890207716, 0.10614623145400594, -0....","[-0.1521643113772455, -0.18667329341317365, -0..."
158,Outer Race Fault,fan end,12000,1,0.007,opposite,305.mat,1775.0,"[0.135281038961039, -0.06715272727272728, -0.0...","[0.06055727002967359, 0.10550243323442138, -0....","[0.08625305389221556, 0.0497051497005988, 0.03..."
159,Outer Race Fault,fan end,12000,2,0.007,opposite,306.mat,1755.0,"[0.16519896103896106, -0.1920275324675325, 0.0...","[0.08433756676557864, -0.1955537091988131, 0.0...","[0.11435433133732535, -0.05295385229540918, -0..."


In [11]:
# Shard 0: the first 80 rows (by position) of the combined frame.
cwruData.iloc[:80].to_parquet('data/cwru0.parquet')

In [12]:
# Shard 1: every row from position 80 onwards.
cwruData.iloc[80:].to_parquet('data/cwru1.parquet')

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator

class dataSelector(TransformerMixin, BaseEstimator):
    """Keep the rows of a DataFrame that match value filters, then pick columns.

    Parameters
    ----------
    columns : 'all' or column label(s), default 'all'
        Passed to ``DataFrame.loc[:, columns]``. The string ``'all'`` keeps
        every column. Lists, arrays and ``pd.Index`` objects are accepted.
    **column_values
        ``column_name=allowed_values`` pairs. A row survives only if, for
        every pair, its value in ``column_name`` is one of ``allowed_values``.
        A single non-list-like value (e.g. ``0.007`` or ``'drive end'``) is
        treated as a one-element list.

    NOTE(review): the filters live in ``self.column_values`` and are not named
    ``__init__`` parameters, so ``get_params()`` / ``sklearn.base.clone``
    presumably do not carry them over -- confirm before using this step in
    anything that clones estimators (grid search, cached pipelines).
    """

    def __init__(self, columns = 'all', **column_values):
        self.columns = columns
        self.column_values = column_values


    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self


    def transform(self, X):
        """Return the filtered rows of ``X`` restricted to ``self.columns``.

        ``X`` is copied first, so the caller's frame is never modified.
        """
        selected = X.copy()

        # Filters are applied one after another, i.e. combined with AND.
        for column, values in self.column_values.items():
            if not pd.api.types.is_list_like(values):
                # Series.isin rejects scalars (including plain strings).
                values = [values]
            selected = selected[selected[column].isin(values)]

        # Only compare against 'all' when columns really is a string:
        # comparing an ndarray / Index with == is element-wise and cannot be
        # used as an `if` condition.
        if isinstance(self.columns, str) and self.columns == 'all':
            columns = slice(None)
        else:
            columns = self.columns

        return selected.loc[:, columns]
    
    
class numpyConverter(TransformerMixin, BaseEstimator):
    """Stateless step that hands the data on as a NumPy array."""

    def fit(self, X, y=None):
        """Nothing to learn; the transformer itself is returned."""
        return self


    def transform(self, X):
        """Copy ``X`` and return the copy's ``to_numpy()`` result.

        ``X`` therefore has to provide ``copy()`` and ``to_numpy()``
        (e.g. a pandas DataFrame or Series).
        """
        return X.copy().to_numpy()
    

class numpyFlattener(TransformerMixin, BaseEstimator):
    """Stateless step that joins the items of ``X`` with ``np.concatenate``."""

    def fit(self, X, y=None):
        """Nothing to learn; the transformer itself is returned."""
        return self


    def transform(self, X):
        """Concatenate the elements of a copy of ``X`` along their first axis.

        Iterating ``X`` must yield array-likes that ``np.concatenate`` can
        join, which removes one level of nesting.
        """
        pieces = X.copy()
        return np.concatenate(pieces)
    
class numpyListFilter(TransformerMixin, BaseEstimator):
    """Stateless step that drops every element that is not a NumPy array.

    Anything that is not an ``np.ndarray`` instance (``None``, NaN, lists, ...)
    is discarded; the surviving arrays keep their order.
    """

    def fit(self, X, y=None):
        """Nothing to learn; the transformer itself is returned."""
        return self


    def transform(self, X):
        """Return the ``np.ndarray`` elements of ``X`` packed into one array.

        If all kept arrays share a shape they are stacked into a regular
        array. Otherwise a 1-D object array holding the individual arrays is
        returned.
        """
        kept = [item for item in X if isinstance(item, np.ndarray)]
        return self._pack(kept)


    @staticmethod
    def _pack(arrays):
        """Stack equally shaped arrays; fall back to a 1-D object array."""
        if len({item.shape for item in arrays}) <= 1:
            return np.array(arrays)

        # np.array() on arrays of different shapes raises ValueError on recent
        # NumPy, so the object container is built and filled explicitly.
        packed = np.empty(len(arrays), dtype=object)
        for position, item in enumerate(arrays):
            packed[position] = item
        return packed
    
    
class timeSeriesChunker(TransformerMixin, BaseEstimator):
    """Cut every sequence in ``X`` into consecutive, non-overlapping chunks.

    Parameters
    ----------
    chunk_size : int
        Number of samples per chunk; must be positive.
    keep_rest : bool, default False
        If True, a shorter trailing chunk is kept; otherwise it is dropped.
    """

    def __init__(self, chunk_size, keep_rest = False):
        self.chunk_size = chunk_size
        self.keep_rest = keep_rest

    def fit(self, X, y=None):
        """Nothing to learn; the transformer itself is returned."""
        return self


    def transform(self, X):
        """Return one array of chunks per sequence in ``X``.

        Sequences that yield the same number of equally sized chunks are
        stacked into a regular array. If the per-sequence results differ in
        shape, a 1-D object array with one entry per sequence is returned.

        Raises
        ------
        ValueError
            If ``chunk_size`` is not positive.
        """
        if self.chunk_size <= 0:
            raise ValueError(
                "chunk_size must be a positive integer, got %r" % (self.chunk_size,)
            )

        per_sequence = [
            self._pack(list(self._chunk(sequence, self.chunk_size, self.keep_rest)))
            for sequence in X
        ]
        return self._pack(per_sequence)

    def _chunk(self, array, length, keep_rest):
        """Yield successive slices of ``array`` that are ``length`` items long.

        A shorter final slice is only yielded when ``keep_rest`` is true.
        """
        for start in range(0, len(array), length):
            piece = array[start:start + length]
            if keep_rest or len(piece) == length:
                yield piece

    @staticmethod
    def _pack(items):
        """Stack equally shaped items; fall back to a 1-D object array."""
        if len({np.shape(item) for item in items}) <= 1:
            return np.array(items)

        # np.array() on differently shaped items raises ValueError on recent
        # NumPy, so the object container is built and filled explicitly.
        packed = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            packed[position] = item
        return packed
                    
class FeatureExtractor(TransformerMixin, BaseEstimator):
    """Compute 16 time-domain statistics for every column of a DataFrame.

    Each column of the input is treated as one signal. The output has one row
    per input column and the features in this fixed order:

    maximum_value, standard_value, mean_value, minimum_value, mean_amplitude,
    root_mean_square_value, skewness_value, kurtosis_value,
    square_root_amplitude, peak_to_peak_value, waveform_indicator,
    pulse_indicator, kurtosis_index, peak_index, skewness_indicator,
    margin_indicator.
    """

    def fit(self, X, y=None):
        """Nothing to learn; the transformer itself is returned."""
        return self


    def transform(self, X):
        """Return an ``(n_columns, 16)`` array of features for DataFrame ``X``.

        ``skewness_value`` and ``kurtosis_value`` are the raw third and fourth
        moments (``mean(x**3)``, ``mean(x**4)``). The ``*_index`` /
        ``*_indicator`` columns are ratios of the basic statistics, so a
        signal whose denominator is zero (e.g. an all-zero column) produces
        NaN or inf in those columns.
        """
        signals = X.copy()

        # Basic per-column statistics.
        maximum_value = signals.apply(np.max)
        minimum_value = signals.apply(np.min)
        mean_value = signals.apply(np.mean)
        # Population standard deviation: spread around the mean.
        standard_value = signals.apply(lambda s: np.sqrt(np.mean((s - np.mean(s)) ** 2)))
        # Mean of the rectified signal.
        mean_amplitude = signals.apply(lambda s: np.mean(np.abs(s)))
        root_mean_square_value = signals.apply(lambda s: np.sqrt(np.mean(s ** 2)))
        skewness_value = signals.apply(lambda s: np.mean(s ** 3))
        kurtosis_value = signals.apply(lambda s: np.mean(s ** 4))
        square_root_amplitude = signals.apply(lambda s: np.mean(np.sqrt(np.abs(s))) ** 2)

        # Keyword order below defines the column order of the result.
        aggregated = pd.DataFrame().assign(
            maximum_value=maximum_value,
            standard_value=standard_value,
            mean_value=mean_value,
            minimum_value=minimum_value,
            mean_amplitude=mean_amplitude,
            root_mean_square_value=root_mean_square_value,
            skewness_value=skewness_value,
            kurtosis_value=kurtosis_value,
            square_root_amplitude=square_root_amplitude,
            # Derived ratios.
            peak_to_peak_value=maximum_value - minimum_value,
            waveform_indicator=root_mean_square_value / mean_amplitude,
            pulse_indicator=maximum_value / mean_amplitude,
            # Fourth moment normalised by RMS**4 (dimensionless).
            kurtosis_index=kurtosis_value / (root_mean_square_value ** 4),
            # Crest factor: peak relative to RMS.
            peak_index=maximum_value / root_mean_square_value,
            # Third moment normalised by RMS**3 (dimensionless).
            skewness_indicator=skewness_value / (root_mean_square_value ** 3),
            margin_indicator=maximum_value / square_root_amplitude,
        )

        return aggregated.to_numpy()

In [4]:
from sklearn.preprocessing import FunctionTransformer
# Stateless step that wraps whatever it receives in a pandas DataFrame.
transformer = FunctionTransformer(func=pd.DataFrame)

In [5]:
from sklearn.pipeline import Pipeline

# Samples per chunk, and the grid the 16 per-chunk features are folded into.
CHUNK_SIZE = 2000
FEATURE_GRID_SHAPE = (4, 4)

# Preprocessing chain, in order:
#   1. keep rows with faultDiameter == 0.007 and only the two signal columns
#   2. DataFrame -> 2-D NumPy array of per-cell signals
#   3. flatten that into one sequence of signals
#   4. drop entries that are not NumPy arrays
#   5. cut every signal into CHUNK_SIZE-sample chunks (short remainder dropped)
#   6. merge the per-signal chunk arrays into one (n_chunks, CHUNK_SIZE) array
#   7. transpose into a DataFrame with one column per chunk
#   8. compute the per-chunk features
#   9. fold each feature vector into a FEATURE_GRID_SHAPE array
prep = Pipeline(steps=[
    ('dataSelector', dataSelector(columns = ['baseData', 'driveEndData'], faultDiameter = [0.007])),
    ('numpyConverter', numpyConverter()),
    ('numpyFlattener', numpyFlattener()),
    ('numpyListFilter', numpyListFilter()),
    ('timeSeriesChunker', timeSeriesChunker(CHUNK_SIZE)),
    ('numpyFlattener2', numpyFlattener()),
    ('transformer', FunctionTransformer(lambda array: pd.DataFrame(array).T)),
    ('featureExtractor', FeatureExtractor()),
    ('reshaper', FunctionTransformer(lambda array: [a.reshape(FEATURE_GRID_SHAPE) for a in array])),
])

# Every step is stateless, but the pipeline is still fitted before use:
# recent scikit-learn warns when transform() is called on a Pipeline that was
# never fitted and announces that this will become an error.
X = prep.fit_transform(cwruData)

