In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import os
import wget
import requests
import re
from bs4 import BeautifulSoup
import shutil

In [2]:
def download_cwru_data(dir='.'):
    """
    Download the healthy-bearing files of the CWRU Bearing dataset.

    The files are stored in a ``data_healthy`` sub-directory of ``dir``.
    The sub-directory is created when missing, and files that are already
    present are not downloaded again.

    Arguments
    ---------
    dir: str
        Directory under which the ``data_healthy`` folder should be stored

    """

    healthy_assets_url = [
        "http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/97.mat",
        "http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/98.mat",
        "http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/99.mat",
        "http://csegroups.case.edu/sites/default/files/bearingdatacenter/files/Datafiles/100.mat"
    ]

    target_dir = os.path.join(dir, 'data_healthy')
    # Without an existing directory the download target would be treated as
    # a plain file name instead of a folder to place the files in.
    os.makedirs(target_dir, exist_ok=True)

    for url in healthy_assets_url:
        if not os.path.isfile(os.path.join(target_dir, os.path.basename(url))):
            wget.download(url=url, out=os.path.realpath(target_dir))

In [3]:
# Fetch the healthy-bearing .mat files (files already on disk are skipped).
download_cwru_data()

In [4]:
def read_folder(folder):
    """
    Read every .mat file in a folder and stack its accelerometer channels.

    Each file is loaded as a dict such as

        {'X097_DE_time': array([[ 0.05319692], [ 0.08866154], ...]),
         'X097_FE_time': array([[0.14566727], [0.09779636], ...]),
         'X097RPM': array([[1796]], dtype=uint16)}

    Only entries whose key contains 'DE_time' or 'FE_time' are kept; they are
    placed side by side and prefixed with a column holding the file id (the
    file name without its extension).

    Files are processed in sorted name order so the row order is
    reproducible. Entries that do not end in '.mat' are ignored. A file is
    skipped (with a 'skipping <id>' message) when it has no matching channel
    or when its matching channels differ in length.

    Arguments:
    ----------
    folder: str
        Path of directory where data is stored (trailing slash optional)

    Returns:
    --------
    numpy.ndarray
        2-D array with one row per sample: file id, then the channel values.
        Because the string id column is stacked with the numeric data, NumPy
        stores every value as a string. An array of shape (0, 3) is returned
        when no file yields data.

    """
    per_file = []

    for file in sorted(os.listdir(folder)):
        # Anything that is not a MATLAB file (e.g. an exported CSV that was
        # moved into the folder) cannot be parsed by loadmat.
        if not file.lower().endswith('.mat'):
            continue

        file_id = file[:-4]
        matlab_file_dict = sio.loadmat(os.path.join(folder, file))

        channels = [
            np.array(value)
            for key, value in matlab_file_dict.items()
            if 'DE_time' in key or 'FE_time' in key
        ]

        # Channels of unequal length cannot be placed side by side, and a
        # file without any channel contributes nothing; skip both cases.
        n_rows = channels[0].shape[0] if channels else None
        if not channels or any(c.shape[0] != n_rows for c in channels):
            print('skipping ' + file_id)
            continue

        data = np.hstack(channels)

        id_column = np.repeat(file_id, data.shape[0]).reshape(-1, 1)
        data = np.hstack((id_column, data))

        # A file with a single channel is padded with a zero column so that
        # it lines up with two-channel files when everything is stacked.
        if data.shape[1] == 2:
            zeroes = np.zeros((data.shape[0], 1))
            data = np.hstack((data, zeroes))

        per_file.append(data)

    if not per_file:
        return np.empty((0, 3), dtype=str)

    # Stack once at the end instead of re-copying the result for every file.
    return np.vstack(per_file)

In [5]:
# Stack all healthy-bearing recordings into one array (file id + channel columns).
result_healthy = read_folder('./data_healthy/')

skipping 99
skipping 99


In [6]:
# Wrap the stacked array in a DataFrame; columns keep their default integer labels.
pdf = pd.DataFrame(result_healthy)

In [9]:
# Display the healthy-bearing frame as the cell output.
pdf

Unnamed: 0,0,1,2
0,100,0.014603076923076923,0.19292181818181817
1,100,0.05444861538461539,0.16436363636363635
2,100,0.10764553846153846,0.09081090909090908
3,100,0.13372246153846154,0.08649636363636364
4,100,0.11265230769230769,0.09923454545454545
...,...,...,...
1213479,98,-0.04318338461538461,-0.05362363636363636
1213480,98,-0.06738276923076922,-0.055883636363636364
1213481,98,-0.09909230769230769,-0.007601818181818181
1213482,98,-0.10827138461538462,0.0402690909090909


In [13]:
# Write the healthy data to the working directory: no header row, row index kept as the first column.
pdf.to_csv('result_healthy_pandas.csv', header=False, index=True)

In [None]:
"""
    Next, download the faulty-bearing data listed on 3 different pages:
    - 12k-drive-end-bearing-fault-data
    - 48k-drive-end-bearing-fault-data
    - 12k-fan-end-bearing-fault-data

"""

In [8]:
# Fetch the 12k drive-end fault page and keep every anchor whose href matches the data-file pattern.
req1 = requests.get("https://csegroups.case.edu/bearingdatacenter/pages/12k-drive-end-bearing-fault-data", timeout=30)
req1.raise_for_status()  # stop here instead of silently parsing an HTTP error page
soup1 = BeautifulSoup(req1.text, "lxml")

pages1 = soup1.find_all('a', href=re.compile('.*Datafiles?.*'))

In [9]:
# Fetch the 48k drive-end fault page and keep every anchor whose href matches the data-file pattern.
req2 = requests.get("https://csegroups.case.edu/bearingdatacenter/pages/48k-drive-end-bearing-fault-data", timeout=30)
req2.raise_for_status()  # stop here instead of silently parsing an HTTP error page
soup2 = BeautifulSoup(req2.text, "lxml")

pages2 = soup2.find_all('a', href=re.compile('.*Datafiles?.*'))

In [10]:
# Fetch the 12k fan-end fault page and keep every anchor whose href matches the data-file pattern.
req3 = requests.get("https://csegroups.case.edu/bearingdatacenter/pages/12k-fan-end-bearing-fault-data", timeout=30)
req3.raise_for_status()  # stop here instead of silently parsing an HTTP error page
soup3 = BeautifulSoup(req3.text, "lxml")

pages3 = soup3.find_all('a', href=re.compile('.*Datafiles?.*'))

In [11]:
def scrape_file(url):
    """
    Collect the href targets from a list of anchor tags.

    Despite its name, this function performs no network access; it only
    reads the ``href`` attribute of each tag it is given.

    Arguments
    ---------
    url: iterable of tags
        Anchor elements, such as one of the <pages> lists; each item must
        support ``has_attr('href')`` and ``item['href']``

    Returns
    -------
    list
        The ``href`` value of every tag that has one, in input order

    """
    return [link['href'] for link in url if link.has_attr('href')]

In [13]:
# Pull the href strings out of each page's anchor list, one result list per page.
faulty_url1, faulty_url2, faulty_url3 = map(scrape_file, (pages1, pages2, pages3))

In [14]:
# Single flat list of every faulty-data link, in page order.
faulty_url = [*faulty_url1, *faulty_url2, *faulty_url3]

In [17]:
def download_faulty(dir='.', urls=None):
    """
    Download the faulty-bearing dataset.

    The files are stored in a ``data_faulty`` sub-directory of ``dir``.
    The sub-directory is created when missing, and files that are already
    present are not downloaded again.

    Arguments
    ---------
    dir: str
        Directory under which the ``data_faulty`` folder should be stored
    urls: list of str, optional
        Links to download; defaults to the notebook-level ``faulty_url`` list

    """
    if urls is None:
        urls = faulty_url

    target_dir = os.path.join(dir, 'data_faulty')
    # Without an existing directory the download target would be treated as
    # a plain file name instead of a folder to place the files in.
    os.makedirs(target_dir, exist_ok=True)

    for url in urls:
        if not os.path.isfile(os.path.join(target_dir, os.path.basename(url))):
            wget.download(url=url, out=os.path.realpath(target_dir))

In [18]:
# Fetch every faulty-bearing file collected in faulty_url (files already on disk are skipped).
download_faulty()

100% [..........................................................................] 7779920 / 7779920

In [None]:
# Stack all faulty-bearing recordings into one array (file id + channel columns).
result_faulty = read_folder('./data_faulty/')

In [None]:
# Wrap the stacked array in a DataFrame; columns keep their default integer labels.
fpdf = pd.DataFrame(result_faulty)

In [None]:
# Write the faulty data to the working directory: no header row, row index kept as the first column.
# (DataFrame has no `tocsv` attribute; the method is `to_csv`.)
fpdf.to_csv('result_faulty_pandas.csv', header=False, index=True)

In [None]:
"""
    Additionally cleaning up artifacts for memory limited machines
    and cleaning up directories for files

"""

# Drop the large in-memory arrays and frames; the data has been written to CSV.
del result_healthy
del result_faulty
del pdf
del fpdf

# The CSVs were written relative to the working directory, so they are moved
# relative to it as well instead of through a machine-specific absolute path.
shutil.move('result_healthy_pandas.csv', os.path.join('data_healthy', 'result_healthy_pandas.csv'))

# NOTE(review): the faulty CSV is moved into data_healthy/, not data_faulty/.
# The destination is kept as-is; confirm this is intended and not a copy-paste slip.
shutil.move('result_faulty_pandas.csv', os.path.join('data_healthy', 'result_faulty_pandas.csv'))