# MAPM: Microsoft Azure Predictive Maintenance

In [3]:
import os
import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

## Overview of Raw Data

Error logs (hourly):
- Errors encountered by the machines while they are in operation.
- These errors do not shut the machines down, so they are not considered failures.


In [6]:
# Error log: one row per logged error (datetime, machineID, errorID).
# The 'Unnamed: 0' column in the output is an index column stored in the CSV.
pd.read_csv('../datasets/mapm/PdM_errors.csv.gz')

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4
...,...,...,...
3914,2015-11-21 08:00:00,100,error2
3915,2015-12-04 02:00:00,100,error1
3916,2015-12-08 06:00:00,100,error2
3917,2015-12-08 06:00:00,100,error3


In [8]:
# Failure log: one row per failure (datetime, machineID, failed component).
pd.read_csv('../datasets/mapm/PdM_failures.csv.gz')

Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4
...,...,...,...
756,2015-11-29 06:00:00,99,comp3
757,2015-12-14 06:00:00,99,comp4
758,2015-02-12 06:00:00,100,comp1
759,2015-09-10 06:00:00,100,comp1


In [9]:
# Machine metadata: one row per machine (machineID, model, age).
pd.read_csv('../datasets/mapm/PdM_machines.csv.gz')

Unnamed: 0,machineID,model,age
0,1,model3,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model3,2
...,...,...,...
95,96,model2,10
96,97,model2,14
97,98,model2,20
98,99,model1,14


In [None]:
# PdM_maint table (presumably the maintenance log; confirm against the dataset).
# Path fixed: every other cell reads from '../datasets/mapm/', not '../datasets/ma/'.
pd.read_csv('../datasets/mapm/PdM_maint.csv.gz')

Telemetry Time Series Data:  
- Hourly average of voltage, rotation, pressure, vibration
- Collected from 100 machines for the year 2015

In [None]:
# Telemetry table. Directory name lower-cased to match the cells above, so the
# path also resolves on case-sensitive filesystems.
pd.read_csv('../datasets/mapm/PdM_telemetry.csv.gz')

In [None]:
def get_sequence_and_events(machine_id=1, error_id=None, comp_id=None,
                            start_date=None, end_date=None,
                            data_dir='../datasets/mapm/'):
    """Return telemetry for one machine with its error and failure events.

    The telemetry, error and failure tables are each restricted to
    ``machine_id`` and then left-joined onto the telemetry rows.

    Parameters
    ----------
    machine_id : int
        Machine whose rows are selected from all three tables.
    error_id, comp_id
        Accepted for backward compatibility; currently not used.
    start_date, end_date : str or datetime-like, optional
        Inclusive bounds applied to the datetime index of the result.
    data_dir : str
        Directory that contains the ``PdM_*.csv.gz`` files.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``datetime``; keeps a single ``machineID`` column plus the
        telemetry columns, ``errorID`` and ``failure`` (NaN where no event).
    """
    # Join on both keys. Joining on 'datetime' alone suffixes the machineID
    # columns, and the surviving 'machineID' then comes from the failures
    # table, where it is NaN for almost every row.
    join_keys = ['datetime', 'machineID']

    def _read_for_machine(filename):
        # Load one table and keep only the rows of the requested machine.
        table = pd.read_csv(os.path.join(data_dir, filename))
        return table[table.machineID == machine_id].reset_index(drop=True)

    sensors = _read_for_machine('PdM_telemetry.csv.gz')
    errors = _read_for_machine('PdM_errors.csv.gz')
    failures = _read_for_machine('PdM_failures.csv.gz')

    data = sensors.merge(errors, how='left', on=join_keys).merge(
        failures, how='left', on=join_keys)

    data.datetime = pd.to_datetime(data.datetime)
    # Label slicing needs a monotonic index; mergesort is stable, so rows that
    # share a timestamp keep their relative order.
    data = data.set_index('datetime').sort_index(kind='mergesort')

    if start_date is not None:
        data = data.loc[start_date:]
    if end_date is not None:
        data = data.loc[:end_date]

    return data

In [None]:
def plot_sequence_and_events(data, machine_id=1):
    """Plot the four sensor channels and the event indicators of one machine.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``machineID``, ``volt``, ``rotate``, ``pressure``,
        ``vibration``, ``errorID`` and ``failure``. Time may be either the
        index or a ``datetime`` column.
    machine_id : int
        Machine whose rows are plotted.

    Returns
    -------
    (matplotlib.figure.Figure, array of matplotlib.axes.Axes)
        Six stacked axes: four sensors, then error and failure indicators.
    """
    data = data[data.machineID == machine_id]
    # When time is a plain column (load_data output), use it as the x axis
    # instead of the integer row index.
    if 'datetime' in data.columns:
        data = data.set_index('datetime')

    fig, ax = plt.subplots(4 + 2, figsize=(8, 8))

    for axis, sensor in zip(ax, ('volt', 'rotate', 'pressure', 'vibration')):
        data.plot(y=sensor, legend=True, ax=axis)

    # Indicator panels are drawn only if at least one event is present.
    # dtype=int: recent pandas returns bool dummies by default, which the
    # line plot rejects as non-numeric.
    if data.errorID.isnull().sum() < data.errorID.shape[0]:
        pd.get_dummies(data.errorID, dtype=int).plot(ax=ax[4])
    if data.failure.isnull().sum() < data.failure.shape[0]:
        pd.get_dummies(data.failure, dtype=int).plot(ax=ax[5])

    ax[0].set_title('Machine #{}'.format(machine_id))

    # Only the bottom axis keeps its x label and tick labels.
    for i in range(5):
        ax[i].set_xlabel(None)
        ax[i].set_xticklabels([])

    fig.tight_layout()

    return fig, ax

In [None]:
def gen_summary(outdir='../out/'):
    """Write one summary page per machine to ``<outdir>/mapm_summary.pdf``.

    Loads the merged dataset with ``load_data`` and renders each machine with
    ``plot_sequence_and_events``. ``outdir`` is created if it does not exist.
    """
    os.makedirs(outdir, exist_ok=True)
    df = load_data('../datasets/mapm/')
    # Iterate over the machines actually present instead of assuming IDs 1..100.
    machine_ids = sorted(df.machineID.unique())
    with PdfPages(os.path.join(outdir, 'mapm_summary.pdf')) as pp:
        for machine_id in tqdm.tqdm(machine_ids):
            fig, _ = plot_sequence_and_events(df, machine_id=machine_id)
            fig.savefig(pp, format='pdf')
            # Close this figure explicitly so figures do not pile up in memory.
            plt.close(fig)


In [None]:
# NOTE(review): gen_summary() calls load_data(), which is defined in a later
# cell; on a fresh kernel that cell must be run first or this raises NameError.
gen_summary()

In [None]:
# Plot for a single machine.
# plot_sequence_and_events() requires the data frame as its first argument;
# build it for the same machine with get_sequence_and_events().
machine_data = get_sequence_and_events(machine_id=1)
plot_sequence_and_events(machine_data, machine_id=1);

## Dataset Making

In [None]:
def load_data(fp):
    """Load telemetry and left-join the error and failure logs onto it.

    ``fp`` is the directory holding the ``PdM_*.csv.gz`` files. The joins use
    ``datetime`` and ``machineID`` as keys, and ``datetime`` is parsed to a
    datetime dtype before the frame is returned.
    """
    join_keys = ['datetime', 'machineID']

    telemetry = pd.read_csv(fp + '/PdM_telemetry.csv.gz')
    error_log = pd.read_csv(fp + '/PdM_errors.csv.gz')
    failure_log = pd.read_csv(fp + '/PdM_failures.csv.gz')

    # Left joins keep every telemetry row; rows without an event get NaN.
    merged = telemetry.merge(error_log, how='left', on=join_keys)
    merged = merged.merge(failure_log, how='left', on=join_keys)

    merged['datetime'] = pd.to_datetime(merged['datetime'])
    return merged

In [None]:
def cleaning(df):
    """Encode the event columns and downcast the sensor columns.

    - ``errorID`` and ``failure`` are replaced by integer codes assigned in
      sorted label order; missing values are encoded as -1. Both columns are
      then stored as ``category``.
    - ``volt``, ``rotate``, ``pressure`` and ``vibration`` become ``float32``.
    - ``datetime`` is parsed to a datetime dtype.
    - Rows are returned sorted by ``machineID`` then ``datetime``.

    The input frame is not modified; a new frame is returned.
    """
    # factorize(sort=True) assigns codes in sorted label order directly, so
    # the whole frame no longer has to be sorted once per encoded column.
    encoded = df.assign(
        errorID=df.errorID.factorize(sort=True)[0],
        failure=df.failure.factorize(sort=True)[0],
        datetime=pd.to_datetime(df.datetime),
    )

    encoded = encoded.astype({
        'errorID': 'category',
        'failure': 'category',
        'volt': 'float32',
        'rotate': 'float32',
        'pressure': 'float32',
        'vibration': 'float32',
    })

    return encoded.sort_values(['machineID', 'datetime'])

In [None]:
# Load the merged tables and run them through cleaning() in one step.
df = cleaning(load_data('../datasets/mapm/'))

In [None]:
# Column dtypes of df as produced by the preceding load + cleaning cell.
df.dtypes

In [None]:
# Rebuild df from the raw files and integer-encode the event columns only
# (no dtype casts). Sorting by a column before factorizing makes the codes
# follow sorted label order; NaN becomes -1.
df = load_data('../datasets/mapm/')
for event_col in ('errorID', 'failure'):
    df = df.sort_values(event_col)
    df[event_col] = df[event_col].factorize()[0]
df = df.sort_values(['machineID', 'datetime'])

In [None]:
# Display df as rebuilt by the preceding cell (integer-coded errorID / failure,
# sorted by machineID and datetime).
df

In [None]:
# One frame per machine. groupby yields groups in sorted key order, so list
# position 1 holds the second machineID, not machineID == 1.
df_m = [machine_df for _machine_id, machine_df in df.groupby('machineID')]
df_m[1]

## Preprocessing