In [None]:
%load_ext autoreload
%autoreload 2

# Feature Engineering

> This module, part of the PredictiveMaintenance2 package, defines functions for feature exploration, feature creation, feature selection, feature encoding and feature extraction.
> Feature transformation functions are imported from the preprocessing module.

In [None]:
#|default_exp FeatureEng

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#|export
import pandas as pd

In [None]:
#| export
def explain_features(dataset_df : pd.DataFrame, # Pandas DataFrame object of dataset
                    machine_Unique_Identifer :str = None, # name of the column holding the unique ID that identifies a machine
                    machine_features: list = None, # a list of machine features such as vendor_name, machine_type, manufacturer ....
                    observation_date : str = None, # name of the column holding the date when the observation was recorded
                    survival_time : int = None, # age of equipment till observation_date
                    failure : int = None, # any truthy value enables the failure summary; a string is used as the failure column name, otherwise the column 'failure' is read
                    sensor_values:list = None # measured values of multiple sensors
                    ):
    """Print a short exploratory summary of a predictive-maintenance dataset.

    Every section is optional and runs only when its argument is truthy:

    * `machine_Unique_Identifer` - prints the number of distinct machines.
    * `observation_date` - prints the number of distinct dates and the first five of them.
    * `failure` - prints how many records have the failure column equal to 0 and to 1,
      the percentage of failures, and a warning when that percentage is below 50.
    * `machine_features`, `survival_time`, `sensor_values` - accepted, but their
      sections are placeholders that do nothing yet.

    Nothing is returned; all results are printed. Any exception is printed rather
    than raised and the function returns None.
    """
    try:
        # machine_unique_identifier
        if machine_Unique_Identifer:
            # one group per distinct machine ID (missing IDs are not counted)
            records_per_machine = dataset_df.groupby(machine_Unique_Identifer).size()
            print(f"There are {records_per_machine.shape[0]} unique machines\n")

        # observation_date
        if observation_date:
            records_per_date = dataset_df.groupby(observation_date).size()
            print(f"Observations are recorded for {records_per_date.shape[0]} days\n")
            print(f" First 5 unique dates are \n{records_per_date.index[:5]}\n")

        # machine features = meta_features
        if machine_features:
            # TODO: counter for each machine feature [category, count]
            # TODO: visualize as categorical variable [Visualize module]
            pass

        # survival time
        if survival_time:
            # TODO: plot histogram of survival time with failure event [visualize module]
            pass

        # failure
        if failure:
            # Keep the historical default column name, but let callers point at
            # another column by passing its name.
            failure_column = failure if isinstance(failure, str) else 'failure'
            failure_flags = dataset_df[failure_column]
            total_records = dataset_df.shape[0]

            # Count by comparison instead of indexing a grouped result, so a
            # dataset without any failed (or any healthy) record reports 0
            # instead of raising KeyError.
            healthy_records = int((failure_flags == 0).sum())
            failed_records = int((failure_flags == 1).sum())
            print(f"Number of records where, \nFailure = FALSE are {healthy_records} \nFailure = TRUE are {failed_records}")

            # guard against an empty dataset (division by zero)
            failure_rate = failed_records*100/total_records if total_records else 0.0
            print(f"\nPercentage of failures : {failure_rate:.3f}%")

            if failure_rate < 50:
                print(f"\n--Warning---: \nNumber of record of event type failure are too low \nDataset is unbalanced.\nUse expand_target_window function")

        # sensor_values
        # truthiness test instead of len(): the default None must not raise
        if sensor_values:
            # TODO: provide feature statistics for each sensor - pd.describe()
            pass

    except Exception as e:
        print(e)
        return None
    else:
        return None

In [None]:
#| hide
# function to create running summaries for sensor_values by feature_window = * days
"""but in this case they are not much useful because most sensor values do not change"""

'but in this case they are not much useful because most sensor values do not change'

In [None]:
#| export

def expand_target_window(dataset_df: pd.DataFrame, # Pandas DataFrame object of dataset
                         target_window: int, # size of the target window; not used by the current implementation
                         observation_date : str, # name of the column holding the observation date
                         machine_Unique_Identifer :str, # name of the column holding the unique machine ID
                         rul:str = None, # RUL column of the dataset; when None the RUL is computed with calculate_rul
                         survival_time : str = None, # currently unused
                         failure_date : str = None # currently unused
                        ):
    """Prepare the data needed to expand the target (failure) window.

    NOTE(review): only the RUL preparation step exists so far. When `rul` is None
    the RUL is computed through `calculate_rul`; the window expansion itself is
    not implemented yet and the function always returns None.

    Any exception is printed rather than raised and the function returns None.
    """
    try:
        # if rul column is not given in dataset - calculate it
        if rul is None:
            # calculate_rul requires the machine identifier column as its third
            # argument; leaving it out made this call always fail with a TypeError.
            rul = calculate_rul(dataset_df, observation_date, machine_Unique_Identifer)

    except Exception as e:
        print(e)
        return None

    else:
        # TODO: use `rul` and `target_window` to expand the target window
        return None

In [None]:
#| export

def calculate_rul(dataset_df :pd.DataFrame, # Pandas DataFrame object of dataset; it is not modified
                  observation_date : str, # name of the column holding the observation date
                  machine_Unique_Identifer :str # name of the column holding the unique machine ID
                 ):
    """Return a copy of `dataset_df` with an added integer `RUL` column.

    For every row, `RUL` is the number of whole days between that row's
    observation date and the latest observation date recorded for the same
    machine. The latest observation is treated as the failure date -
    NOTE(review): confirm this holds for machines that never failed.

    The result has the observation date column parsed to datetime, is sorted by
    observation date (stable, so rows sharing a date keep their relative order)
    and has a fresh 0..n-1 index. Rows whose machine ID is missing do not
    appear in the result because they match no group in the merge.

    Any exception is printed rather than raised; None is returned in that case.
    """
    try:
        # Work on a copy so that parsing the dates does not silently convert
        # the caller's column.
        observations = dataset_df.copy()

        # format observation_date field if it comes in as string
        observations[observation_date] = pd.to_datetime(observations[observation_date], format = 'mixed')

        # last date of observation for each machine
        last_observation_dates = (
            observations.groupby(machine_Unique_Identifer)[observation_date]
            .max()
            .rename('last_observation_date')
            .reset_index()
        )

        # attach each machine's last observation date to all of its rows
        dataset_df_rul = pd.merge(observations, last_observation_dates, on=machine_Unique_Identifer)

        # time difference between the last observation date and each observation date
        time_diff = dataset_df_rul['last_observation_date'] - dataset_df_rul[observation_date]
        dataset_df_rul['RUL'] = (time_diff.dt.days).astype(int)

        # Drop the helper column, order chronologically and renumber the rows.
        # reset_index(drop=True) avoids creating (and then dropping) an 'index'
        # column, which failed when the data already had a column of that name.
        dataset_df_rul = (
            dataset_df_rul
            .drop(columns='last_observation_date')
            .sort_values(by=observation_date, kind='stable')
            .reset_index(drop=True)
        )

    except Exception as e:
        print(e)
        return None

    else:
        return dataset_df_rul

In [None]:
#| export

def to_numerical(dataset_df :pd.DataFrame, # Pandas DataFrame object of dataset; text columns are encoded in place
                 exclude_columns = ('date',) # name(s) of text columns that must be left unencoded
                ):
    """Replace every text column of `dataset_df` with integer codes.

    Columns of object or pandas string dtype are encoded with `pd.factorize`:
    each distinct value gets a code in order of first appearance and missing
    values become -1. Columns named in `exclude_columns` (by default only
    'date') and all non-text columns are left unchanged.

    The frame is modified in place and the same object is returned. Any
    exception is printed rather than raised; None is returned in that case.
    """
    try:
        # normalise exclude_columns: accept None, a single name or any iterable of names
        if exclude_columns is None:
            skipped = set()
        elif isinstance(exclude_columns, str):
            skipped = {exclude_columns}
        else:
            skipped = set(exclude_columns)

        for column in dataset_df.columns:
            if column in skipped:
                continue

            values = dataset_df[column]
            # object dtype is the classic text dtype; StringDtype covers the
            # dedicated pandas string dtype, which `== object` does not match
            is_text = values.dtype == object or isinstance(values.dtype, pd.StringDtype)
            if is_text:
                dataset_df[column] = pd.factorize(values)[0]

    except Exception as e:
        print(e)
        return None
    else:
        return dataset_df

In [None]:
# Demo: read the raw dataset, encode its text columns with to_numerical, then preview the first rows
data = to_numerical(pd.read_csv('predictive_maintenance_dataset.csv'))
data.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,01-01-2015,0,0,215630672,55,0,52,6,407438,0,0,7
1,01-01-2015,1,0,61370680,0,3,0,6,403174,0,0,0
2,01-01-2015,2,0,173295968,0,0,0,12,237394,0,0,0
3,01-01-2015,3,0,79694024,0,0,0,6,410186,0,0,0
4,01-01-2015,4,0,135970480,0,0,0,15,313173,0,0,3


In [None]:
#| hide
# Run nbdev's export step; this notebook's target module is named by the `#|default_exp FeatureEng` directive near the top.
import nbdev; nbdev.nbdev_export()