In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp utils

# Introduction


Predictive Maintenance (PdM) is a natural application of Survival Analysis: it consists of predicting when an equipment failure will occur, so that the maintenance team can be alerted in time to prevent it.

### `Objectives`
> - To estimate the Remaining Useful Life (RUL) of a machine/component
> - To predict the probability of failure in the next N days
> - To create a dashboard for tracking and alerts

# Setup

### `Requirements`
> To generate the `requirements.txt` file automatically, run `pipreqs .` in a terminal.
- matplotlib==3.7.1
- numpy==1.24.2
- pandas==2.0.1
- setuptools==65.6.3 

# Dataset

The dataset we are going to use contains the following features: `date`, `device`, `failure`, and nine numeric metrics `metric1`–`metric9`.

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
import numpy as np

import PredictiveMaintenance2.Datasets as Datasets
from PredictiveMaintenance2 import Visualize

### load dataset

In [None]:
# Name of the raw CSV file; it is handed to the project loader unchanged.
file = "predictive_maintenance_dataset.csv"
# NOTE(review): 'r' is presumably a read-mode flag — confirm against Datasets.load_dataset.
machine_data = Datasets.load_dataset(file, "r")

File exists
.csv file extension is supported


### Explore dataset
- shape (number of observations and columns)
- null values
- duplicate rows

In [None]:
# Summarise the raw frame, dropping NaN rows and rows duplicated on (device, date).
# NOTE(review): the return value is not captured — confirm that explore_dataset
# applies the 'drop' actions to machine_data in place.
exploration_options = {
    "NAN_action": "drop",
    "duplicate_action": "drop",
    "duplicate_subset": ["device", "date"],
}
Datasets.explore_dataset(dataset_df=machine_data, **exploration_options)

In Dataset 
Observations : 124493 
Columns :12

-----NAN values-----
date       0
device     0
failure    0
metric1    0
metric2    0
metric3    0
metric4    0
metric5    0
metric6    0
metric7    0
metric8    0
metric9    0
dtype: int64

-----Duplicate records-----
0



In [None]:
# Show the dtype of every column of the loaded frame.
machine_data.dtypes

date       object
device     object
failure     int64
metric1     int64
metric2     int64
metric3     int64
metric4     int64
metric5     int64
metric6     int64
metric7     int64
metric8     int64
metric9     int64
dtype: object

In [None]:
# TODO: add a helper that explains every feature of the dataset.
# Sketch of the intended call (placeholders only, not implemented yet).
# Kept as a comment so the cell does not echo a raw string as its output.
#
#   Features.explain_features(machine_Unique_Identifer = 'string',
#                             machine_features = [column_names],
#                             observation_date = 'string',
#                             age/survival_time = 'string',
#                             failure = 'string',
#                             sensors = [column_names])

"\nDatasets.explain_features(machine_Unique_Identifer = 'string',\n                        machine_features = [column_names],\n                        observation_date = 'string',\n                        age/survival_time = 'string',\n                        failure = 'string'\n                        sensors = [column_names]\n                            )\n"

In [None]:
# machine_unique_identifier
# nunique() counts the distinct device IDs directly, instead of aggregating
# every column per group just to read the number of groups.
n_unique_devices = machine_data['device'].nunique()
print(f"There are {n_unique_devices} unique machines")

In [None]:
# Date of observation
# Number of distinct observation dates present in the data.
n_observation_days = machine_data['date'].nunique()
print(f"Observations are recorded for {n_observation_days} days")

In [None]:
# failure rate
# Count the records per failure flag. Reindexing on [0, 1] guarantees both
# classes are present (with a count of 0) even if one never occurs, so the
# label lookups below cannot raise KeyError.
failure_rate = (
    machine_data.groupby('failure')['device']
    .count()
    .reindex([0, 1], fill_value=0)
)
n_not_failed, n_failed = failure_rate.loc[0], failure_rate.loc[1]
print(f"Number of records where, \nFailure = FALSE are {n_not_failed} \nFailure = TRUE are {n_failed}")
print(f"Percentage of failures : {n_failed*100/(machine_data.shape)[0] : .3f}%")

In [None]:
# ------data transformation------
# If RUL / survival time / age of the equipment is not given, derive it from
# the date column: RUL = days between an observation and the last observation
# recorded for the same device.
# NOTE(review): this equals time-to-failure only when a device's last record
# is its failure — confirm how devices that never fail should be treated.

# the date column comes in as strings; parse it to datetime
machine_data['date'] = pd.to_datetime(machine_data['date'], format='mixed')

# last observation date per device (one row per device)
last_observation_dates = machine_data.groupby('device')['date'].max().reset_index()

# both frames carry a 'date' column, so the merge suffixes them:
# date_x = date of the observation, date_y = last observation date of that device
machine_data_rul = pd.merge(machine_data, last_observation_dates, on='device')

# time difference between the last observation date and each observation date
time_diff = machine_data_rul['date_y'] - machine_data_rul['date_x']

# add RUL (whole days), drop the helper column, order chronologically and
# renumber the rows without materialising the old index as a column
machine_data_rul = (
    machine_data_rul
    .assign(RUL=time_diff.dt.days.astype(int))
    .drop(columns='date_y')
    .sort_values(by='date_x')
    .reset_index(drop=True)
)

# date_y is the per-device maximum, so RUL can never be negative
assert (machine_data_rul['RUL'] >= 0).all()

machine_data_rul.head()

In [None]:
# Replace the device identifiers with integer codes; only the codes
# (element 0 of the factorize result) are kept.
machine_data_rul['device'] = machine_data_rul['device'].factorize()[0]
machine_data_rul.head()

In [None]:
# Target columns: the RUL duration and the failure event flag.
rul_time, failure_event = machine_data_rul['RUL'], machine_data_rul['failure']

# Every other column except the observation date is a feature;
# the resulting list is sorted by column name.
non_feature_columns = {'RUL', 'failure', 'date_x'}
features = sorted(set(machine_data_rul.columns) - non_feature_columns)

# Visualization

In [None]:
# Plot the event column ('failure').
Visualize.plot(
    dataset=machine_data_rul,
    feature='failure',
    feature_type='event',
)

In [None]:
# Plot the time column ('RUL').
Visualize.plot(
    dataset=machine_data_rul,
    feature='RUL',
    feature_type='time',
)

In [None]:
# Feature groups used by the plotting cells.
cats = ['device']
nums = [f'metric{i}' for i in range(1, 10)]  # 'metric1' ... 'metric9'

# One plot per categorical feature.
for feature in cats:
    Visualize.plot(
        dataset=machine_data_rul,
        feature=feature,
        feature_type='Categorical',
    )

In [None]:
# One plot per numerical feature listed in `nums`.
for feature in nums:
    Visualize.plot(
        dataset=machine_data_rul,
        feature=feature,
        feature_type='Numerical',
    )

In [None]:
# there are too many outliers and data is not normally distributed

# PreProcessing

In [None]:
#| hide
# Run nbdev's export step for this project (nbdev convention: kept as the
# last cell so the generated module reflects the notebook's `#| export` cells).
import nbdev; nbdev.nbdev_export()