In [None]:
#| hide
# set up auto-reload so that changes made in other modules are picked up automatically
%load_ext autoreload
%autoreload 2

# Datasets 
> A module in the PredictiveMaintenance2 package that helps with loading datasets

In [None]:
#| default_exp Datasets

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
import os

In [None]:
#| hide
class File_NotFound_Exception(Exception):
    """Error used by `load_dataset` when the given file path does not exist."""

    def __init__(self, message):
        # Hand the message straight to Exception so str(exc) returns it unchanged.
        Exception.__init__(self, message)

class Unsupported_FileFormat_Exception(Exception):
    """Error used by `load_dataset` when a file's extension is not csv, xlsx or json."""

    def __init__(self, message):
        super().__init__(message)


# Backward-compatible alias: the class was originally defined under this
# misspelled name, while `load_dataset` raises the correctly spelled one.
Unsupplorted_FileFormat_Exception = Unsupported_FileFormat_Exception

In [None]:
#| export

def load_dataset(filepath : str, # full path/relative path of the dataset file
                 mode: str = 'r' # read, write or append; currently unused, kept for backward compatibility
                ) :  # DataFrame of the dataset, or None if it could not be loaded
    """Load a .csv, .xlsx or .json dataset file into a pandas DataFrame.

    The function is best-effort: instead of raising, it prints the reason for
    any failure (missing file, unsupported extension, parse error) and
    returns None.
    """
    try:
        # check if file exists
        if not os.path.exists(filepath):
            print(f"Cannot open file {filepath}")
            return None
        print("File exists")

        # check if file extension is supported
        # splitext only looks at the final suffix, so paths with extra dots
        # ('./data/my.file.csv') or with no dot at all are handled correctly.
        extension = os.path.splitext(filepath)[1].lstrip(".").lower()

        # each supported extension is parsed by its matching pandas reader
        readers = {
            'csv': pd.read_csv,
            'xlsx': pd.read_excel,
            'json': pd.read_json,
        }
        if extension not in readers:
            print(f"Cannot open file with extension {extension}")
            return None
        print(f".{extension} file extension is supported")

        return readers[extension](filepath)

    except Exception as e:
        # e.g. malformed file contents or a missing optional Excel engine
        print(e)
        return None


In [None]:
# Example: load the predictive-maintenance CSV (path is relative to the working
# directory) and preview its first rows.
# Note: load_dataset returns None when the file does not exist, in which case
# df.head() below would raise AttributeError.
df = load_dataset('predictive_maintenance_dataset.csv','r')
df.head()

File exists
.csv file extension is supported


Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,01-01-2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,01-01-2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,01-01-2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,01-01-2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,01-01-2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [None]:
#| export

def explore_dataset(dataset_df : pd.DataFrame,# DataFrame handler of the dataset
                    NAN_action : str , # action to take on null values; only 'drop' is acted on
                    duplicate_action : str, # action to take on duplicate rows; only 'drop' is acted on
                    NAN_subset : list = None, # drop rows with null values in columns of NAN_subset (None = any column)
                    duplicate_subset : list = None, # columns used to identify duplicate rows (None = all columns)
                    inplace : bool = True # whether or not to make changes in the original dataset
                    ): # None when inplace=True or on error; the cleaned copy when inplace=False
    """Print a summary of nulls and duplicates in `dataset_df` and optionally drop them.

    Prints the shape, the per-column null counts and the number of duplicate
    rows. Rows with nulls are dropped when `NAN_action == 'drop'`, and
    duplicate rows are dropped when `duplicate_action == 'drop'`.

    With `inplace=True` the passed DataFrame is modified and None is returned.
    With `inplace=False` the passed DataFrame is left untouched and the cleaned
    copy is returned. Any exception is printed and None is returned.
    """
    try:
        # Work on a copy when inplace=False so the cleaned result can be
        # returned instead of being silently discarded.
        working_df = dataset_df if inplace else dataset_df.copy()

        print(f"In Dataset \nObservations : {(working_df.shape)[0]} \nColumns :{(working_df.shape)[1]}\n")

        ### null values ###
        null_values = working_df.isnull().sum(axis = 0)
        print(f"-----NAN values-----\n{null_values}\n")

        # if null values found in dataset perform NAN_action specified
        if null_values.sum() and NAN_action == 'drop':
            working_df.dropna(subset=NAN_subset, inplace=True)
            print(f"{NAN_action} NAN successful \n")

        ### duplicates ###

        # we need record of each machine only once per day
        duplicate_values = working_df.duplicated(subset=duplicate_subset).sum()
        print(f"-----Duplicate records-----\n{duplicate_values}\n")

        # if duplicate rows found in dataset perform duplicate_action specified
        if duplicate_values and duplicate_action == 'drop':
            working_df.drop_duplicates(subset=duplicate_subset, inplace=True)
            print(f"{duplicate_action} Duplicates successful \n")

        return None if inplace else working_df

    except Exception as e:
        print(e)
        return None
    

In [None]:
#| hide
# Run nbdev's export step; presumably this writes the `#| export` cells to the
# module named by `#| default_exp` — confirm against the nbdev documentation.
import nbdev; nbdev.nbdev_export()