In [62]:
import pandas as pd

# SECTION 3: Statistical Summary

## Load the dataset using pandas and get a summary of the number of machines for each of the four types.

In [63]:
# Settings that select which raw parquet files are read.
f_type = 'm'  # File-type prefix; one of: lidar, m, cart, speedo
ID = [str(n) for n in range(1, 8)]  # ID suffix of each file: '1' through '7'

In [64]:
# Read one parquet file per entry in ID (path: ../data/raw/202201W1-<f_type><id>.parquet)
# and keep the resulting DataFrames together in a list, in ID order.
f = [
    pd.read_parquet('../data/raw/202201W1-' + f_type + file_id + '.parquet', engine='pyarrow')
    for file_id in ID
]
# Also expose each DataFrame under its own name; this unpacking requires exactly seven files.
m1, m2, m3, m4, m5, m6, m7 = f

# SECTION 4: Defining the Schemas

## Get the schemas for each of the four types of machines. Note: Here, "schema" means the structure of the data as it would be defined in a database, i.e., the field names and their datatypes. Observe the datatypes of each field.

In [65]:
# Verify that every loaded file shares one schema, i.e. the same column names
# with the same dtypes in the same order, using the first file as the reference.
# Comparing (column, dtype) pairs also catches renamed or reordered columns,
# which a comparison of the dtype values alone would let through.
reference_schema = list(f[0].dtypes.items()) if f else []
for file in f:
    if list(file.dtypes.items()) != reference_schema:
        print('Files do not have a common schema')
        break
else:
    # No mismatch was found: print the shared schema once (taken from the last file).
    if f:
        print(f[-1].dtypes)

machineID            object
voltage             float64
torque              float64
pressure            float64
vibration           float64
error1              float64
error2              float64
error3              float64
error4              float64
error5              float64
comp1               float64
comp2               float64
comp3               float64
comp4               float64
failure              object
failed                int64
timestamp    datetime64[ns]
dtype: object


# SECTION 5: Exploring the dataset. 

## 5a: Get a statistical summary of the dataset.

In [66]:
# Print a banner naming each source file, followed by its describe() summary.
for idx, frame in enumerate(f):
    banner = str(idx + 1) + ': File ../data/raw/202201W1-' + f_type + ID[idx] + '.parquet ' + '-' * 50
    print(banner)
    stats = frame.describe()
    print(stats, '\n')

1: File ../data/raw/202201W1-m1.parquet --------------------------------------------------
             voltage         torque       pressure      vibration  \
count  604800.000000  604800.000000  604800.000000  604800.000000   
mean      170.606277     446.824396     100.614390      40.393806   
std         4.379654      18.411593       4.181544       2.072381   
min       161.190200     308.052233      94.641472      36.672514   
25%       168.017349     441.405713      98.532824      39.405749   
50%       170.123632     449.441369     100.064489      39.995175   
75%       172.233025     456.298170     101.519663      40.853328   
max       195.314315     479.002376     129.252186      54.826763   

              error1         error2         error3         error4  \
count  604800.000000  604800.000000  604800.000000  604800.000000   
mean        0.025284       0.035926       0.019464       0.017389   
std         0.156988       0.191215       0.138150       0.130717   
min        

## 5b: Get the number of missing values, null values, or NaN values.

In [67]:
# Report, per file, the total number of NaN cells summed over all columns.
print('The number of NaN values in each dataframe is:')
for idx, frame in enumerate(f):
    nan_total = frame.isna().sum().sum()  # per-column counts, then grand total
    label = "File ../data/raw/202201W1-" + f_type + ID[idx]
    print(label + ".parquet : {}".format(nan_total))

The number of NaN values in each dataframe is:
File ../data/raw/202201W1-m1.parquet : 0
File ../data/raw/202201W1-m2.parquet : 0
File ../data/raw/202201W1-m3.parquet : 0
File ../data/raw/202201W1-m4.parquet : 0
File ../data/raw/202201W1-m5.parquet : 0
File ../data/raw/202201W1-m6.parquet : 0
File ../data/raw/202201W1-m7.parquet : 0


In [68]:
# Report, per file, the total number of null cells summed over all columns.
# Note: in pandas, isnull() is an alias of isna(), so this count is the same
# quantity as a NaN count obtained with isna().
print('The number of null values in each dataframe is:')
for idx, frame in enumerate(f):
    null_total = frame.isnull().sum().sum()  # per-column counts, then grand total
    label = "File ../data/raw/202201W1-" + f_type + ID[idx]
    print(label + ".parquet : {}".format(null_total))

The number of null values in each dataframe is:
File ../data/raw/202201W1-m1.parquet : 0
File ../data/raw/202201W1-m2.parquet : 0
File ../data/raw/202201W1-m3.parquet : 0
File ../data/raw/202201W1-m4.parquet : 0
File ../data/raw/202201W1-m5.parquet : 0
File ../data/raw/202201W1-m6.parquet : 0
File ../data/raw/202201W1-m7.parquet : 0


## 5c: Establish the ranges of each variable, i.e., get the min and max for each field.

In [69]:
# Print a banner naming each source file, followed by the min and max of its columns.
# Only float64 columns are selected, so columns of any other dtype are not included.
for idx, frame in enumerate(f):
    banner = str(idx + 1) + ': File ../data/raw/202201W1-' + f_type + ID[idx] + '.parquet ' + '-' * 50
    print(banner)
    float_columns = frame.select_dtypes(include=['float64'])
    ranges = float_columns.agg(['min', 'max'])
    print(ranges, '\n')

1: File ../data/raw/202201W1-m1.parquet --------------------------------------------------
        voltage      torque    pressure  vibration  error1  error2  error3  \
min  161.190200  308.052233   94.641472  36.672514     0.0     0.0     0.0   
max  195.314315  479.002376  129.252186  54.826763     1.0     2.0     1.0   

     error4  error5  comp1  comp2  comp3  comp4  
min     0.0     0.0    0.0    0.0    0.0    0.0  
max     1.0     1.0  120.0  150.0  165.0  135.0   

2: File ../data/raw/202201W1-m2.parquet --------------------------------------------------
        voltage      torque    pressure  vibration  error1  error2  error3  \
min  158.580010  356.257513   93.868329  36.214705     0.0     0.0     0.0   
max  204.745244  479.680033  144.635764  52.143377     1.0     2.0     1.0   

     error4  error5  comp1  comp2  comp3  comp4  
min     0.0     0.0    0.0    0.0    0.0    0.0  
max     1.0     1.0  135.0  105.0  105.0  135.0   

3: File ../data/raw/202201W1-m3.parquet ----

## 5d: Get the rate at which data arrives. Use the timestamp field in the dataset.

In [70]:
# For each file, estimate the arrival rate as the reciprocal of the mean gap
# (in seconds) between consecutive values of the 'timestamp' column.
for idx, frame in enumerate(f):
    banner = str(idx + 1) + ': File ../data/raw/202201W1-' + f_type + ID[idx] + '.parquet ' + '-' * 50
    print(banner)
    mean_gap_seconds = frame['timestamp'].diff().mean().total_seconds()
    rate = 1 / mean_gap_seconds
    print('Average rate of incoming data = ' + str(rate) + ' entries per second', '\n')

1: File ../data/raw/202201W1-m1.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

2: File ../data/raw/202201W1-m2.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

3: File ../data/raw/202201W1-m3.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

4: File ../data/raw/202201W1-m4.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

5: File ../data/raw/202201W1-m5.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

6: File ../data/raw/202201W1-m6.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

7: File ../data/raw/202201W1-m7.parquet --------------------------------------------------
Average rate of incom