# Data Understanding
---

# 1. Import Libraries, Functions and Read Data 
---

### 1.1 Import Libraries  

In [1]:
import numpy as np 
import pandas as pd

### 1.2 Functions

In [2]:
def read_csv(file_path:str)-> pd.DataFrame:
    """
    Read data from a CSV file and return it as a pandas DataFrame.

    Parameters:
    - file_path (str): The path to the CSV file.

    Returns:
    - pd.DataFrame(DataFrame: A pandas DataFrame representing the CSV data.
    """
    df = pd.read_csv(file_path)
    df[['Date', 'Time']] = df['Date'].str.split(' ', expand=True)
    return df



def filter_data_by_node_id(df: pd.DataFrame, node_id: int)-> pd.DataFrame:
    """
    Filter a DataFrame based on a specified nodeID.

    Parameters:
    - df (pd.DataFrame): The DataFrame to be filtered.
    - node_id (int): The nodeID to filter the DataFrame (must be between 0 and 11).
                                  Defaults to None if not provided.

    Returns:
    - pd.DataFrame: A DataFrame containing observations for the specified nodeID.
    """
    filtered_df = df[df['NodeID'] == node_id]
    return filtered_df



def create_subdatasets_by_node_id(df: pd.DataFrame)-> dict:
    """
    Create a dictionary of subdatasets based on a specified column.

    Parameters:
    - df (pd.DataFrame): The DataFrame to be grouped.
    - groupby_column (str): The column to use for grouping (default is 'NodeID').

    Returns:
    - dict: A dictionary where keys are unique values in the specified column, and values are subdatasets (DataFrames).
    """
    grouped_data = df.groupby('NodeID')
    subdatasets = {key: group for key, group in grouped_data}
    return subdatasets




def create_subdatasets_by_time(df: pd.DataFrame) -> dict:
    """
    Create a dictionary of subdatasets based on a specified time column.

    Parameters:
    - df (pd.DataFrame): The DataFrame to be grouped.
    - time_column (str): The column to use for grouping based on time (default is 'DateTime').

    Returns:
    - dict: A dictionary where keys are unique values in the specified time column, and values are subdatasets (DataFrames).
    """
    grouped_data = df.groupby('Time')
    subdatasets = {key: group for key, group in grouped_data}
    return subdatasets

### 1.3 Read Data 

import os
os.getcwd()

In [3]:
df = read_csv(r"..\data\raw\machine-failure-data.csv")
df.head()

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
0,2020-01-10,0,0.690741,0.919376,0.324617,0.427775,0.039333,0.800865,0.83325,0.004845,0,00:00:00
1,2020-01-10,0,0.688889,0.920676,0.324346,0.413583,0.039333,0.800865,0.8335,0.005233,0,01:00:00
2,2020-01-10,0,0.687037,0.920676,0.324301,0.399392,0.039333,0.800865,0.8335,0.004651,0,02:00:00
3,2020-01-10,0,0.688889,0.919258,0.323632,0.399392,0.039758,0.800865,0.832795,0.004651,0,03:00:00
4,2020-01-10,0,0.688889,0.927769,0.327327,0.399392,0.044,0.802338,0.835799,0.004651,0,04:00:00


# 2. Data Understanding  and Creating Subdatasets
---

## 2.1 Data Understanding 

#### 2.1.1 Observation in each Node

In [4]:
grouped_data = df.groupby(['NodeID', 'Label']).size().unstack().reset_index()
grouped_data.columns = ['NodeID', 'No_Of_0_Obs', 'No_Of_1_Obs']
grouped_data['Total_No_of_Obs'] = grouped_data['No_Of_0_Obs'] + grouped_data['No_Of_1_Obs']
grouped_data

Unnamed: 0,NodeID,No_Of_0_Obs,No_Of_1_Obs,Total_No_of_Obs
0,0,2160,169,2329
1,1,1824,169,1993
2,2,1488,169,1657
3,3,1440,169,1609
4,4,2160,169,2329
5,5,2160,169,2329
6,6,2160,169,2329
7,7,2160,169,2329
8,8,2160,169,2329
9,9,2160,169,2329


#### 2.1.2 data info

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26220 entries, 0 to 26219
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              26220 non-null  object 
 1   NodeID            26220 non-null  int64  
 2   MotorCurrent      26220 non-null  float64
 3   Frequency         26220 non-null  float64
 4   PIP               26220 non-null  float64
 5   TubingPressure    26220 non-null  float64
 6   CasingPressure    26220 non-null  float64
 7   PIT               26220 non-null  float64
 8   MotorTemperature  26220 non-null  float64
 9   XVib              26220 non-null  float64
 10  Label             26220 non-null  int64  
 11  Time              26220 non-null  object 
dtypes: float64(8), int64(2), object(2)
memory usage: 2.4+ MB


#### 2.1.3 Data Summary

In [6]:
df.describe()

Unnamed: 0,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label
count,26220.0,26220.0,26220.0,26220.0,26220.0,26220.0,26220.0,26220.0,26220.0,26220.0
mean,5.716018,0.640938,0.844129,0.452275,0.412803,0.072056,0.784967,0.811299,0.139364,0.077346
std,3.451193,0.191091,0.206378,0.168122,0.129487,0.134976,0.097345,0.10403,0.172807,0.267144
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.566667,0.836151,0.308721,0.327758,0.034686,0.761956,0.784677,0.004651,0.0
50%,6.0,0.681481,0.90143,0.465881,0.43901,0.042139,0.79438,0.82023,0.014535,0.0
75%,9.0,0.744444,0.936281,0.54139,0.49434,0.061174,0.817076,0.858538,0.253682,0.0
max,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 2.2 Create Subdataset

#### 2.2.1 Filter Data by NodeID

In [7]:
node_id = 0
df_filtered = filter_data_by_node_id(df, node_id=node_id) 
df_filtered.head()

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
0,2020-01-10,0,0.690741,0.919376,0.324617,0.427775,0.039333,0.800865,0.83325,0.004845,0,00:00:00
1,2020-01-10,0,0.688889,0.920676,0.324346,0.413583,0.039333,0.800865,0.8335,0.005233,0,01:00:00
2,2020-01-10,0,0.687037,0.920676,0.324301,0.399392,0.039333,0.800865,0.8335,0.004651,0,02:00:00
3,2020-01-10,0,0.688889,0.919258,0.323632,0.399392,0.039758,0.800865,0.832795,0.004651,0,03:00:00
4,2020-01-10,0,0.688889,0.927769,0.327327,0.399392,0.044,0.802338,0.835799,0.004651,0,04:00:00


#### 2.2.2 Creating Subdatasets 

- ### `By: NodeID`

In [8]:
subdfs = create_subdatasets_by_node_id(df)

In [26]:
subdf_0 = subdfs[0]
subdf_0

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
0,2020-01-10,0,0.690741,0.919376,0.324617,0.427775,0.039333,0.800865,0.833250,0.004845,0,00:00:00
1,2020-01-10,0,0.688889,0.920676,0.324346,0.413583,0.039333,0.800865,0.833500,0.005233,0,01:00:00
2,2020-01-10,0,0.687037,0.920676,0.324301,0.399392,0.039333,0.800865,0.833500,0.004651,0,02:00:00
3,2020-01-10,0,0.688889,0.919258,0.323632,0.399392,0.039758,0.800865,0.832795,0.004651,0,03:00:00
4,2020-01-10,0,0.688889,0.927769,0.327327,0.399392,0.044000,0.802338,0.835799,0.004651,0,04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
2324,2020-04-15,0,0.688148,0.907412,0.245224,0.216929,0.036667,0.830046,0.775163,0.004651,1,20:00:00
2325,2020-04-15,0,0.687407,0.913914,0.245224,0.221321,0.036667,0.830046,0.775163,0.004651,1,21:00:00
2326,2020-04-15,0,0.690926,0.924057,0.245224,0.216929,0.036667,0.830046,0.775163,0.004651,1,22:00:00
2327,2020-04-15,0,0.690556,0.931079,0.245224,0.216929,0.036667,0.830046,0.775163,0.004651,1,23:00:00


In [10]:
subdf_1 = 
subdf_1.head()

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
2329,2019-08-19,1,0.674074,0.936281,0.337398,0.463338,0.043639,0.784653,0.876314,0.286822,0,00:00:00
2330,2019-08-19,1,0.683333,0.936281,0.338888,0.468491,0.043722,0.784653,0.876565,0.291279,0,01:00:00
2331,2019-08-19,1,0.674074,0.936281,0.336901,0.470603,0.044278,0.784653,0.874812,0.28062,0,02:00:00
2332,2019-08-19,1,0.67037,0.936281,0.33663,0.465704,0.044222,0.784653,0.874311,0.28876,0,03:00:00
2333,2019-08-19,1,0.666667,0.936281,0.337443,0.462494,0.04325,0.785193,0.876314,0.288372,0,04:00:00


- ### `By: Time` 

In [11]:
subdf_by_time = create_subdatasets_by_time(subdf_0)

In [12]:
subdf_by_time['00:00:00'].head()

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
0,2020-01-10,0,0.690741,0.919376,0.324617,0.427775,0.039333,0.800865,0.83325,0.004845,0,00:00:00
24,2020-01-11,0,0.691111,0.920676,0.322395,0.454131,0.048667,0.800865,0.834352,0.004651,0,00:00:00
48,2020-01-12,0,0.688889,0.920676,0.317121,0.482514,0.044,0.800865,0.832248,0.003876,0,00:00:00
72,2020-01-13,0,0.688889,0.920676,0.312288,0.458861,0.044,0.800865,0.832499,0.004845,0,00:00:00
96,2020-01-14,0,0.688889,0.920676,0.310075,0.454131,0.039333,0.800865,0.832248,0.004457,0,00:00:00


In [13]:
subdf_by_time['01:00:00'].head()

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
1,2020-01-10,0,0.688889,0.920676,0.324346,0.413583,0.039333,0.800865,0.8335,0.005233,0,01:00:00
25,2020-01-11,0,0.690909,0.920676,0.32043,0.454131,0.048667,0.800865,0.832522,0.006342,0,01:00:00
49,2020-01-12,0,0.688889,0.920676,0.316669,0.482514,0.044,0.800865,0.832248,0.004845,0,01:00:00
73,2020-01-13,0,0.688889,0.920676,0.312379,0.454131,0.044,0.800865,0.832749,0.004845,0,01:00:00
97,2020-01-14,0,0.690741,0.920676,0.31003,0.454131,0.039333,0.800865,0.834001,0.004651,0,01:00:00


In [14]:
subdf_by_time['03:00:00'].head()

Unnamed: 0,Date,NodeID,MotorCurrent,Frequency,PIP,TubingPressure,CasingPressure,PIT,MotorTemperature,XVib,Label,Time
3,2020-01-10,0,0.688889,0.919258,0.323632,0.399392,0.039758,0.800865,0.832795,0.004651,0,03:00:00
27,2020-01-11,0,0.688889,0.920676,0.319198,0.454131,0.048667,0.800865,0.832248,0.005943,0,03:00:00
51,2020-01-12,0,0.690909,0.920676,0.316143,0.482514,0.044,0.800865,0.833614,0.004228,0,03:00:00
75,2020-01-13,0,0.691358,0.920676,0.312815,0.454131,0.044,0.800865,0.832248,0.003876,0,03:00:00
99,2020-01-14,0,0.688889,0.920676,0.310256,0.454131,0.039333,0.800865,0.832248,0.004651,0,03:00:00


In [16]:
subdf_by_time['00:00:00']['Label'].value_counts()

Label
0    90
1     8
Name: count, dtype: int64

In [19]:
subdf_by_time_1 = create_subdatasets_by_time(df)
subdf_by_time_1

{'00:00:00':              Date  NodeID  MotorCurrent  Frequency       PIP  TubingPressure  \
 0      2020-01-10       0      0.690741   0.919376  0.324617        0.427775   
 24     2020-01-11       0      0.691111   0.920676  0.322395        0.454131   
 48     2020-01-12       0      0.688889   0.920676  0.317121        0.482514   
 72     2020-01-13       0      0.688889   0.920676  0.312288        0.458861   
 96     2020-01-14       0      0.688889   0.920676  0.310075        0.454131   
 ...           ...     ...           ...        ...       ...             ...   
 26123  2020-05-02      11      0.548148   0.977893  0.631035        0.409275   
 26147  2020-05-03      11      0.587037   0.967490  0.644493        0.421524   
 26171  2020-05-04      11      0.566667   0.973992  0.648241        0.413161   
 26195  2020-05-05      11      0.527778   0.973992  0.640293        0.409951   
 26219  2020-05-06      11      0.557407   0.973992  0.620061        0.417216   
 
        Casing

In [25]:
grouped_data = subdfs[1].groupby(['Time', 'Label']).size().unstack().reset_index()
grouped_data.columns = ['Time', 'No_Of_0_Obs', 'No_Of_1_Obs']
grouped_data['Total_No_of_Obs'] = grouped_data['No_Of_0_Obs'] + grouped_data['No_Of_1_Obs']
grouped_data

Unnamed: 0,Time,No_Of_0_Obs,No_Of_1_Obs,Total_No_of_Obs
0,00:00:00,76,8,84
1,01:00:00,76,7,83
2,02:00:00,76,7,83
3,03:00:00,76,7,83
4,04:00:00,76,7,83
5,05:00:00,76,7,83
6,06:00:00,76,7,83
7,07:00:00,76,7,83
8,08:00:00,76,7,83
9,09:00:00,76,7,83
