### Excercise: use autoencoder (LSTM) for anomaly detection in accelerometer based vibration dataset.

####  The autoencoder tries to reconstruct the input at the output. Hence, for healthy data, it tries to reconstruct healthy data. But it will have a hard time trying to reconstruct faulty data in its neural network bottleneck (LSTM). That's how the anomaly detector works.

In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import os

#### Download healthy data

In [2]:
# original site seems to be permantly offline, using cached data

#!wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/97.mat
#!wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/98.mat
#!wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/99.mat
#!wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/100.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_healthy/97.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_healthy/98.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_healthy/99.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_healthy/100.mat


--2021-11-08 10:48:15--  https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_healthy/97.mat
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/IBM/claimed/raw/master/coursera_ai/week3/data/cwr_healthy/97.mat [following]
--2021-11-08 10:48:15--  https://github.com/IBM/claimed/raw/master/coursera_ai/week3/data/cwr_healthy/97.mat
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/claimed/master/coursera_ai/week3/data/cwr_healthy/97.mat [following]
--2021-11-08 10:48:15--  https://raw.githubusercontent.com/IBM/claimed/master/coursera_ai/week3/data/cwr_healthy/97.mat
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercont

In [3]:
!mkdir cwr_healthy

In [4]:
!mv *.mat cwr_healthy/

In [5]:
!ls -lahr cwr_healthy/

total 34M
-rw-rw-r-- 1 hugo hugo  15M Nov  8 10:48 99.mat
-rw-rw-r-- 1 hugo hugo 7,4M Nov  8 10:48 98.mat
-rw-rw-r-- 1 hugo hugo 3,8M Nov  8 10:48 97.mat
-rw-rw-r-- 1 hugo hugo 7,5M Nov  8 10:48 100.mat
drwxrwxr-x 4 hugo hugo 4,0K Nov  8 10:48 ..
drwxrwxr-x 2 hugo hugo 4,0K Nov  8 10:48 .


In [6]:
mlf = sio.loadmat('./cwr_healthy/100.mat')
mlf

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN, Created on: Fri Jan 28 11:25:48 2000',
 '__version__': '1.0',
 '__globals__': [],
 'X100_DE_time': array([[ 0.01460308],
        [ 0.05444862],
        [ 0.10764554],
        ...,
        [-0.02357354],
        [ 0.00521538],
        [ 0.04777292]]),
 'X100_FE_time': array([[ 0.19292182],
        [ 0.16436364],
        [ 0.09081091],
        ...,
        [ 0.10930182],
        [ 0.05218545],
        [-0.00452   ]]),
 'X100RPM': array([[1725]], dtype=uint16)}

#### Store data in dataframes

In [7]:
def read_folder(folder):
    data = 'dummy'
    skip = False
    for file in os.listdir(folder):
        file_id = file[:-4]
        mat_file_dict = sio.loadmat(folder+file)
        del data
        for key, value in mat_file_dict.items():
            if 'DE_time' in key or 'FE_time' in key:
                a = np.array(mat_file_dict[key])
                try:
                    data
                except NameError:
                    data = a
                else:
                    if (data.shape[0] != a.shape[0]):
                        print('skipping ' + file_id)
                        skip = True
                        continue
                    data = np.hstack((data,a))
        if skip:
            skip = False
            continue
        id = np.repeat(file_id, data.shape[0])
        id.shape = (id.shape[0], 1)
        data = np.hstack((id, data))
        if data.shape[1] == 2:
            zeros = np.repeat(float(0), data.shape[0])
            zeros.shape = (data.shape[0], 1)
            data = np.hstack((data, zeros))
        try:
            result
        except NameError:
            result = data
        else:
            result = np.vstack((result, data))
    return result

In [8]:
result_healthy = read_folder('./cwr_healthy/')

skipping 99
skipping 99


In [9]:
print(result_healthy.shape)
result_healthy

(1213484, 3)


array([['100', '0.014603076923076923', '0.19292181818181817'],
       ['100', '0.05444861538461539', '0.16436363636363635'],
       ['100', '0.10764553846153846', '0.09081090909090908'],
       ...,
       ['97', '-0.034630153846153845', '0.14053090909090907'],
       ['97', '0.01668923076923077', '0.09553636363636364'],
       ['97', '0.04693846153846153', '0.09019454545454544']], dtype='<U32')

#### Store healthy data in dataframe

In [10]:
pdf = pd.DataFrame(result_healthy)

In [11]:
pdf.to_csv('result_healthy_pandas.csv', header=False, index=True)

In [12]:
!head result_healthy_pandas.csv

0,100,0.014603076923076923,0.19292181818181817
1,100,0.05444861538461539,0.16436363636363635
2,100,0.10764553846153846,0.09081090909090908
3,100,0.13372246153846154,0.08649636363636364
4,100,0.11265230769230769,0.09923454545454545
5,100,0.08240307692307691,0.09307090909090908
6,100,0.08699261538461538,0.08197636363636364
7,100,0.11056615384615384,0.03307818181818182
8,100,0.12767261538461538,0.01171090909090909
9,100,0.11348676923076922,0.02218909090909091


#### Download faulty data

In [None]:
# uncommented because way too much data
#!for url in `curl -s csegroups.case.edu/bearingdatacenter/pages/12k-drive-end-bearing-fault-data |grep mat |grep http |awk -F'href="' '{print $2}' |awk -F'">' '{print $1}'`; do wget $url; done
#!for url in `curl -s csegroups.case.edu/bearingdatacenter/pages/48k-drive-end-bearing-fault-data |grep mat |grep http |awk -F'href="' '{print $2}' |awk -F'">' '{print $1}'`; do wget $url; done
#!for url in `curl -s csegroups.case.edu/bearingdatacenter/pages/12k-fan-end-bearing-fault-data |grep mat |grep http |awk -F'href="' '{print $2}' |awk -F'">' '{print $1}'`; do wget $url; done
#!mkdir cwr_faulty
#!mv *.mat cwr_faulty/

In [13]:
# original site seems to be permantly offline, using cached data
# !wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/105.mat
# !wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/106.mat
# !wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/107.mat
# !wget http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/108.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_faulty/105.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_faulty/106.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_faulty/107.mat
!wget https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_faulty/108.mat

--2021-11-08 10:48:53--  https://github.com/IBM/skillsnetwork/raw/master/coursera_ai/week3/data/cwr_faulty/105.mat
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/IBM/claimed/raw/master/coursera_ai/week3/data/cwr_faulty/105.mat [following]
--2021-11-08 10:48:53--  https://github.com/IBM/claimed/raw/master/coursera_ai/week3/data/cwr_faulty/105.mat
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/claimed/master/coursera_ai/week3/data/cwr_faulty/105.mat [following]
--2021-11-08 10:48:54--  https://raw.githubusercontent.com/IBM/claimed/master/coursera_ai/week3/data/cwr_faulty/105.mat
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercont

In [14]:
!mkdir cwr_faulty
!mv *.mat cwr_faulty/

In [15]:
!ls cwr_faulty/

105.mat  106.mat  107.mat  108.mat


#### Store faulty data in dataframe

In [16]:
result_faulty = read_folder('./cwr_faulty/')

In [17]:
pdf = pd.DataFrame(result_faulty)

In [18]:
pdf.to_csv('result_faulty_pandas.csv', header=False, index=True)

In [19]:
!head result_faulty_pandas.csv

0,106,-0.2776016367265469,0.040885454545454544
1,106,-0.04434479041916167,0.06985454545454545
2,106,0.11760303393213573,0.3377672727272727
3,106,-0.14505457085828344,0.2516818181818182
4,106,-0.111430499001996,0.1088909090909091
5,106,0.1309227145708583,0.07889454545454545
6,106,0.03281189620758483,0.02116181818181818
7,106,-0.1970338123752495,0.12861454545454545
8,106,-0.07488259481037925,0.20196181818181816
9,106,0.009583672654690619,-0.04684363636363636


#### Store healthy and faulty dataframes in IBM cloud

In [23]:
# In order to obtain the correct values for "credentias", "bucket_name" and "endpoint" 
# please follow the tutorial at https://github.com/IBM/skillsnetwork/wiki/Cloud-Object-Storage-Setup

credentials = {
  "apikey": "RHPLuwaEctTZ68Ne8FwoHZaXKujcdzApQEgCauExsIbC",
  "cos_hmac_keys": {
    "access_key_id": "a89357bb879e4a59aa015abffb144827",
    "secret_access_key": "0d78e16cbb6fe96d6f3b55dca4c7f9b6d03ece26626b1cc3"
  },
  "endpoints": "https://control.cloud-object-storage.cloud.ibm.com/v2/endpoints",
  "iam_apikey_description": "Auto-generated for key a89357bb-879e-4a59-aa01-5abffb144827",
  "iam_apikey_name": "Service credentials-1",
  "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
  "iam_serviceid_crn": "crn:v1:bluemix:public:iam-identity::a/3dba62a148ab4574867f8eb140c3a44e::serviceid:ServiceId-109769b1-d4d5-4997-93a1-faefc036bfa9",
  "resource_instance_id": "crn:v1:bluemix:public:cloud-object-storage:global:a/3dba62a148ab4574867f8eb140c3a44e:643e3143-6265-453a-877a-15ae3947ef9a::"
}

bucket_name = "cloud-object-storage-appliedaideeplearning"
endpoint = "https://s3.eu-de.cloud-object-storage.appdomain.cloud"

In [24]:
import base64
from ibm_botocore.client import Config
import ibm_boto3
import time

# Create client 
client = ibm_boto3.client(
    's3',
    aws_access_key_id = credentials["cos_hmac_keys"]['access_key_id'],
    aws_secret_access_key = credentials["cos_hmac_keys"]["secret_access_key"],
    endpoint_url = endpoint
)

client.upload_file('result_healthy_pandas.csv', bucket_name, 'result_healthy_pandas.csv')
client.upload_file('result_faulty_pandas.csv', bucket_name, 'result_faulty_pandas.csv')