In [1]:
import pandas as pd

# SECTION 3: Statistical Summary

## Load the dataset using pandas and get a summary of the number of machines for each of the four types.

In [2]:
# Type of file to analyse; one of: lidar, m, cart, speedo.
f_type = 'lidar'
# ID suffix of each file belonging to that type.
ID = ['In', 'Out', 'S1', 'S2']

In [3]:
# Read one parquet file per entry of ID (same order) and keep them all in `f`.
f = [
    pd.read_parquet('../data/raw/202201W1-{}{}.parquet'.format(f_type, suffix), engine='pyarrow')
    for suffix in ID
]
# Also expose each frame under its own name.
# NOTE(review): `In` and `Out` are the names IPython uses for its input/output
# history, so binding them here shadows those objects for the rest of the session.
# The names are kept because other cells may rely on them.
In, Out, S1, S2 = f

# SECTION 4: Defining the Schemas

## Get the schemas for each of the four types of machines. Note: a schema is the definition of a table's structure in a database, i.e., its field names and their data types. Observe the data type of each field.

In [4]:
# Check that every loaded file has the same schema as the first one.
# A schema here is the ordered list of (column name, dtype) pairs, so two files
# whose columns have matching dtypes but different names are NOT treated as
# sharing a schema (comparing the dtypes alone would let that slip through).
reference_schema = list(f[0].dtypes.items()) if f else []
for file in f:
    if list(file.dtypes.items()) != reference_schema:
        print('Files do not have a common schema')
        break
else:
    # No mismatch was found: show the shared schema once, taken from the last file.
    if f:
        print(f[-1].dtypes)

machineID            object
tempS1              float64
tempS2              float64
distX               float64
distY               float64
distZ               float64
forceX              float64
forceY              float64
forceZ              float64
timestamp    datetime64[ns]
dtype: object


# SECTION 5: Exploring the dataset

## 5a: Get a statistical summary of the dataset.

In [5]:
# For each loaded file: print a numbered banner with its path, then the
# describe() summary table followed by a blank line.
for i, file in enumerate(f):
    banner = '{}: File ../data/raw/202201W1-{}{}.parquet {}'.format(i + 1, f_type, ID[i], '-' * 50)
    print(banner)
    summary = file.describe()
    print(summary, '\n')

1: File ../data/raw/202201W1-lidarIn.parquet --------------------------------------------------
             tempS1        tempS2         distX         distY         distZ  \
count  1.209600e+07  1.209600e+07  1.209600e+07  1.209600e+07  1.209600e+07   
mean   1.222702e+01  1.189868e+01 -1.111646e-01 -1.110364e-01 -1.084967e-01   
std    8.030981e+00  7.395107e+00  9.348594e-02  7.947732e-02  1.070118e-01   
min    0.000000e+00  0.000000e+00 -9.570000e-01 -7.710000e-01 -1.169000e+00   
25%    5.900000e+00  6.100000e+00 -1.730000e-01 -1.640000e-01 -1.780000e-01   
50%    1.120000e+01  1.120000e+01 -1.100000e-01 -1.100000e-01 -1.070000e-01   
75%    1.730000e+01  1.660000e+01 -4.900000e-02 -5.900000e-02 -3.900000e-02   
max    1.042000e+02  1.025000e+02  4.880000e-01  5.350000e-01  8.910000e-01   

             forceX        forceY        forceZ  
count  1.209600e+07  1.209600e+07  1.209600e+07  
mean   1.274295e+02  1.139140e+02  1.155029e+02  
std    8.895342e+01  6.663811e+01  6.93819

## 5b: Get the number of missing values, null values, or NaN values.

In [6]:
# Report, per file, the total number of NaN cells across all columns.
print('The number of NaN values in each dataframe is:')
for i, file in enumerate(f):
    # First sum() counts per column, second sum() adds the columns together.
    nan_total = file.isna().sum().sum()
    print('File ../data/raw/202201W1-{}{}.parquet : {}'.format(f_type, ID[i], nan_total))

The number of NaN values in each dataframe is:
File ../data/raw/202201W1-lidarIn.parquet : 0
File ../data/raw/202201W1-lidarOut.parquet : 0
File ../data/raw/202201W1-lidarS1.parquet : 0
File ../data/raw/202201W1-lidarS2.parquet : 0


In [7]:
# Report, per file, the total number of null cells across all columns.
# NOTE(review): pandas documents isnull() as an alias of isna(), so these
# totals are expected to match the NaN counts.
print('The number of null values in each dataframe is:')
for i, file in enumerate(f):
    # First sum() counts per column, second sum() adds the columns together.
    null_total = file.isnull().sum().sum()
    print('File ../data/raw/202201W1-{}{}.parquet : {}'.format(f_type, ID[i], null_total))

The number of null values in each dataframe is:
File ../data/raw/202201W1-lidarIn.parquet : 0
File ../data/raw/202201W1-lidarOut.parquet : 0
File ../data/raw/202201W1-lidarS1.parquet : 0
File ../data/raw/202201W1-lidarS2.parquet : 0


## 5c: Establish the ranges of each variable, i.e., get the min and max for each field.

In [8]:
# For each loaded file: print a numbered banner with its path, then the min
# and max of every float64 column (columns of other dtypes are left out).
for i, file in enumerate(f):
    banner = '{}: File ../data/raw/202201W1-{}{}.parquet {}'.format(i + 1, f_type, ID[i], '-' * 50)
    print(banner)
    float_fields = file.select_dtypes(include=['float64'])
    print(float_fields.agg(['min', 'max']), '\n')

1: File ../data/raw/202201W1-lidarIn.parquet --------------------------------------------------
     tempS1  tempS2  distX  distY  distZ  forceX  forceY  forceZ
min     0.0     0.0 -0.957 -0.771 -1.169     0.0     0.0     0.0
max   104.2   102.5  0.488  0.535  0.891  1008.0   828.0   869.0 

2: File ../data/raw/202201W1-lidarOut.parquet --------------------------------------------------
     tempS1  tempS2  distX  distY  distZ  forceX  forceY  forceZ
min     0.0     0.0 -0.991 -0.830 -5.000     0.0     0.0     0.0
max   100.8   101.6  0.728  0.552  4.998  5000.0  3958.0  3801.0 

3: File ../data/raw/202201W1-lidarS1.parquet --------------------------------------------------
     tempS1  tempS2  distX  distY  distZ  forceX  forceY  forceZ
min     0.0     0.0 -0.908 -0.781 -1.426     0.0     0.0     0.0
max   103.0   108.6  0.527  0.427  1.208  1130.0   737.0   876.0 

4: File ../data/raw/202201W1-lidarS2.parquet --------------------------------------------------
     tempS1  tempS2  dis

## 5d: Get the rate at which data arrives. Use the timestamp field in the dataset.

In [9]:
# For each loaded file: print a numbered banner with its path, then the average
# arrival rate, computed as the reciprocal of the mean gap (in seconds) between
# consecutive rows' timestamps.
for i, file in enumerate(f):
    banner = '{}: File ../data/raw/202201W1-{}{}.parquet {}'.format(i + 1, f_type, ID[i], '-' * 50)
    print(banner)
    # diff() works in row order; assumes rows are stored chronologically — TODO confirm.
    mean_gap_seconds = file['timestamp'].diff().mean().total_seconds()
    print('Average rate of incoming data = {} entries per second'.format(1 / mean_gap_seconds), '\n')

1: File ../data/raw/202201W1-lidarIn.parquet --------------------------------------------------
Average rate of incoming data = 20.0 entries per second 

2: File ../data/raw/202201W1-lidarOut.parquet --------------------------------------------------
Average rate of incoming data = 20.0 entries per second 

3: File ../data/raw/202201W1-lidarS1.parquet --------------------------------------------------
Average rate of incoming data = 20.0 entries per second 

4: File ../data/raw/202201W1-lidarS2.parquet --------------------------------------------------
Average rate of incoming data = 20.0 entries per second 

