In [1]:
import pandas as pd

# SECTION 3: Statistical Summary

## Load the dataset using pandas and get a summary of the number of machines for each of the four types.

In [2]:
# Machine type whose raw files are analysed in this notebook.
# Available types: lidar, m, cart, speedo.
f_type = 'speedo'

# Suffix identifying each raw file of the chosen type.
ID = [
    'A',
    'B',
    'C',
]

In [3]:
# Read one parquet file per entry in ID and keep all dataframes in a list.
f = [
    pd.read_parquet('../data/raw/202201W1-' + f_type + file_id + '.parquet', engine='pyarrow')
    for file_id in ID
]
# Also expose each dataframe under its own name; this requires exactly three files.
A, B, C = f

# SECTION 4: Defining the Schemas

## Get the schemas for each of the four types of machines. Note: a schema is the definition of a dataset's structure, i.e. its field names and their datatypes, as it would be declared in a database. Observe the datatypes of each field.

In [4]:
# Check that every loaded file has the same schema as the first one, then print it.
# A schema is taken to be the ordered list of (column name, dtype) pairs, so files
# whose dtypes line up but whose column names differ are reported as a mismatch
# (comparing only the dtype values would let such files pass).
schemas = [list(frame.dtypes.items()) for frame in f]
if any(schema != schemas[0] for schema in schemas):
    print('Files do not have a common schema')
elif f:
    # All files agree, so printing the schema of a single file is enough.
    print(f[-1].dtypes)

machineID            object
speedPct            float64
velX                float64
velY                float64
velZ                float64
timestamp    datetime64[ns]
dtype: object


# SECTION 5: Exploring the dataset. 

## 5a: Get a statistical summary of the dataset.

In [5]:
# Print a numbered banner and the describe() summary for each loaded file.
for position, (file_id, frame) in enumerate(zip(ID, f), start=1):
    banner = str(position) + ': File ../data/raw/202201W1-' + f_type + file_id + '.parquet ' + '-'*50
    print(banner)
    summary = frame.describe()
    print(summary, '\n')

1: File ../data/raw/202201W1-speedoA.parquet --------------------------------------------------
           speedPct          velX          velY          velZ
count  1.209600e+06  1.209600e+06  1.209600e+06  1.209600e+06
mean   5.944089e-01  1.000457e+00 -4.002768e-03 -1.211582e-01
std    2.437940e-01  1.060594e+00  1.062076e+00  6.784054e-01
min    2.000000e-01 -8.000000e+00 -8.000000e+00 -5.867000e+00
25%    4.000000e-01  9.100000e-01 -1.090000e-01 -2.580000e-01
50%    6.000000e-01  9.920000e-01  2.000000e-02 -1.250000e-01
75%    8.000000e-01  1.070000e+00  1.680000e-01 -1.600000e-02
max    1.000000e+00  7.996000e+00  7.996000e+00  4.992000e+00 

2: File ../data/raw/202201W1-speedoB.parquet --------------------------------------------------
           speedPct          velX          velY          velZ
count  1.209600e+06  1.209600e+06  1.209600e+06  1.209600e+06
mean   5.944089e-01  9.966559e-01  1.449825e-02 -1.120151e-01
std    2.437940e-01  7.765714e-01  6.813404e-01  5.548551e-01


## 5b: Get the number of missing values, null values, or NaN values.

In [11]:
# Report the total number of NaN cells in each loaded dataframe.
print('The number of NaN values in each dataframe is:')
for file_id, frame in zip(ID, f):
    # The first sum() counts NaNs per column; the second adds those counts up.
    nan_total = frame.isna().sum().sum()
    print("File ../data/raw/202201W1-" + f_type + file_id + ".parquet : {}".format(nan_total))

The number of NaN values in each dataframe is:
File ../data/raw/202201W1-speedoA.parquet : 0
File ../data/raw/202201W1-speedoB.parquet : 0
File ../data/raw/202201W1-speedoC.parquet : 0


In [12]:
# Report the total number of null cells in each loaded dataframe.
# Note: DataFrame.isnull is an alias of DataFrame.isna, so these totals
# are the same quantity as a NaN count made with isna().
print('The number of null values in each dataframe is:')
for file_id, frame in zip(ID, f):
    null_total = frame.isnull().sum().sum()
    print("File ../data/raw/202201W1-" + f_type + file_id + ".parquet : {}".format(null_total))

The number of null values in each dataframe is:
File ../data/raw/202201W1-speedoA.parquet : 0
File ../data/raw/202201W1-speedoB.parquet : 0
File ../data/raw/202201W1-speedoC.parquet : 0


## 5c: Establish the ranges of each variable, i.e., get the min and max for each field.

In [8]:
# Print the minimum and maximum of every float64 column, file by file.
for position, (file_id, frame) in enumerate(zip(ID, f), start=1):
    banner = str(position) + ': File ../data/raw/202201W1-' + f_type + file_id + '.parquet ' + '-'*50
    print(banner)
    # Only float64 columns are kept; other dtypes are left out of the range table.
    float_columns = frame.select_dtypes(include=['float64'])
    print(float_columns.agg(['min','max']), '\n')

1: File ../data/raw/202201W1-speedoA.parquet --------------------------------------------------
     speedPct   velX   velY   velZ
min       0.2 -8.000 -8.000 -5.867
max       1.0  7.996  7.996  4.992 

2: File ../data/raw/202201W1-speedoB.parquet --------------------------------------------------
     speedPct   velX   velY   velZ
min       0.2 -8.000 -8.000 -4.305
max       1.0  7.996  7.391  6.086 

3: File ../data/raw/202201W1-speedoC.parquet --------------------------------------------------
     speedPct   velX   velY   velZ
min       0.2  0.648 -0.391 -0.582
max       1.0  1.355  0.301  0.398 



## 5d: Get the rate at which data arrives. Use the timestamp field in the dataset.

In [9]:
# Estimate how many entries arrive per second in each file from the timestamp column.
for position, (file_id, frame) in enumerate(zip(ID, f), start=1):
    banner = str(position) + ': File ../data/raw/202201W1-' + f_type + file_id + '.parquet ' + '-'*50
    print(banner)
    # diff() subtracts each timestamp from the one in the next row, in row order,
    # so the mean gap is only meaningful if rows are already in time order.
    # NOTE(review): ordering of the raw files is not checked here; confirm.
    mean_gap_seconds = frame['timestamp'].diff().mean().total_seconds()
    # The rate is the reciprocal of the average gap between consecutive rows.
    print('Average rate of incoming data = ' + str(1/mean_gap_seconds) + ' entries per second', '\n')

1: File ../data/raw/202201W1-speedoA.parquet --------------------------------------------------
Average rate of incoming data = 2.0 entries per second 

2: File ../data/raw/202201W1-speedoB.parquet --------------------------------------------------
Average rate of incoming data = 2.0 entries per second 

3: File ../data/raw/202201W1-speedoC.parquet --------------------------------------------------
Average rate of incoming data = 2.0 entries per second 

