In [2]:
import pandas as pd

# SECTION 3: Statistical Summary

## Load the dataset using pandas and get a summary of the number of machines for each of the four types.

In [3]:
# File family to load; valid choices are lidar, m, cart, speedo.
f_type = 'cart'

# One ID per file; each is appended to the file name right after f_type.
ID = [
    '1Floor',
    '1Top',
    '2',
    '4x',
]

In [4]:
# Read one parquet file per entry in ID (path = prefix + f_type + ID + '.parquet')
# and bind each resulting DataFrame to its own name.
floor, top, two, fourX = [
    pd.read_parquet('../data/raw/202201W1-' + f_type + file_id + '.parquet', engine='pyarrow')
    for file_id in ID
]

# Keep the frames together, in the same order as ID, so later cells can loop over them.
f = [floor, top, two, fourX]

# SECTION 4: Defining the Schemas

## Get the schemas for each of the four types of machines. Note: "schema" here means the structure a table would have in a database, i.e., its fields and the datatype of each one. Observe the datatype of each field.

In [5]:
# A schema is compared as the ordered list of (column name, dtype) pairs.
# Comparing only the dtype values would treat files with different column
# names but the same dtype sequence as having a common schema.
schemas = [list(file.dtypes.items()) for file in f]

if any(schema != schemas[0] for schema in schemas):
    print('Files do not have a common schema')
elif f:
    # Every file matches the first one, so printing any single schema
    # (here: the last file's) describes them all.
    print(f[-1].dtypes)

machineID              object
torque                float64
oilPressure           float64
vibration             float64
posX                  float64
posY                  float64
posZ                  float64
perpForce             float64
normForce             float64
timestamp      datetime64[ns]
dtype: object


# SECTION 5: Exploring the dataset. 

## 5a: Get a statistical summary of the dataset.

In [6]:
# For every loaded frame: print a numbered banner naming its source file,
# then the describe() table for that frame.
for position, frame in enumerate(f):
    banner = ''.join([
        str(position + 1),
        ': File ../data/raw/202201W1-',
        f_type,
        ID[position],
        '.parquet ',
        '-' * 50,
    ])
    print(banner)
    summary = frame.describe()
    print(summary, '\n')

1: File ../data/raw/202201W1-cart1Floor.parquet --------------------------------------------------
              torque    oilPressure      vibration           posX  \
count  604800.000000  604800.000000  589802.000000  604800.000000   
mean       54.377976      74.986845      29.514522     129.364821   
std         9.186406       0.276544      31.071173       8.939564   
min        40.047000      74.499000       6.000000     115.146000   
25%        44.666000      74.737000       8.000000     119.989750   
50%        57.965000      74.992000       8.980000     132.951000   
75%        63.012000      75.243000      63.350000     137.826000   
max        65.990000      75.400000     100.000000     140.489000   

                posY           posZ      perpForce      normForce  
count  604800.000000  604800.000000  604800.000000  604800.000000  
mean      -20.608870       0.725581    3041.354130    5623.103438  
std         9.434896       0.124886     991.705278      41.462488  
min    

## 5b: Get the number of missing values, null values, or NaN values.

In [12]:
# Report, per file, the total count of NaN cells across all columns.
print('The number of NaN values in each dataframe is:')
for position, frame in enumerate(f):
    # First sum() counts NaNs per column, second sum() adds the columns up.
    nan_total = frame.isna().sum().sum()
    label = "File ../data/raw/202201W1-" + f_type + ID[position]
    print(label + ".parquet : {}".format(nan_total))

The number of NaN values in each dataframe is:
File ../data/raw/202201W1-cart1Floor.parquet : 14998
File ../data/raw/202201W1-cart1Top.parquet : 13272
File ../data/raw/202201W1-cart2.parquet : 18820
File ../data/raw/202201W1-cart4x.parquet : 5135


In [13]:
# Report, per file, the total count of null cells across all columns.
# Note: DataFrame.isnull() is an alias of DataFrame.isna(), so these totals
# are the same quantity as the NaN totals computed with isna().
print('The number of null values in each dataframe is:')
for position, frame in enumerate(f):
    null_total = frame.isnull().sum().sum()
    label = "File ../data/raw/202201W1-" + f_type + ID[position]
    print(label + ".parquet : {}".format(null_total))

The number of null values in each dataframe is:
File ../data/raw/202201W1-cart1Floor.parquet : 14998
File ../data/raw/202201W1-cart1Top.parquet : 13272
File ../data/raw/202201W1-cart2.parquet : 18820
File ../data/raw/202201W1-cart4x.parquet : 5135


## 5c: Establish the ranges of each variable, i.e., get the min and max for each field.

In [9]:
# For every loaded frame: print a numbered banner naming its source file,
# then the min and max of each float64 column (other dtypes are skipped).
for position, frame in enumerate(f):
    banner = ''.join([
        str(position + 1),
        ': File ../data/raw/202201W1-',
        f_type,
        ID[position],
        '.parquet ',
        '-' * 50,
    ])
    print(banner)
    float_columns = frame.select_dtypes(include=['float64'])
    print(float_columns.agg(['min','max']), '\n')

1: File ../data/raw/202201W1-cart1Floor.parquet --------------------------------------------------
     torque  oilPressure  vibration     posX    posY      posZ    perpForce  \
min  40.047       74.499        6.0  115.146 -35.052  0.533256  1603.762209   
max  65.990       75.400      100.0  140.489  -8.509  0.885784  4354.680100   

       normForce  
min  5550.101001  
max  5685.160000   

2: File ../data/raw/202201W1-cart1Top.parquet --------------------------------------------------
     torque  oilPressure  vibration     posX    posY      posZ    perpForce  \
min  63.153       72.399       7.00  136.966 -11.252  0.849973  3988.301409   
max  93.744       75.000      99.99  167.743  19.745  1.266828  8787.937536   

       normForce  
min  5241.615201  
max  5625.000000   

3: File ../data/raw/202201W1-cart2.parquet --------------------------------------------------
     torque  oilPressure  vibration     posX    posY      posZ    perpForce  \
min  29.000       73.799        2.0  

## 5d: Get the rate at which data arrives. Use the timestamp field in the dataset.

In [10]:
# For every loaded frame: print a numbered banner naming its source file,
# then the average arrival rate derived from the 'timestamp' column.
for position, frame in enumerate(f):
    banner = ''.join([
        str(position + 1),
        ': File ../data/raw/202201W1-',
        f_type,
        ID[position],
        '.parquet ',
        '-' * 50,
    ])
    print(banner)
    # Mean gap between consecutive rows, in seconds; its reciprocal is rows per second.
    mean_gap_seconds = frame['timestamp'].diff().mean().total_seconds()
    print('Average rate of incoming data = ' + str(1/mean_gap_seconds) + ' entries per second', '\n')

1: File ../data/raw/202201W1-cart1Floor.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

2: File ../data/raw/202201W1-cart1Top.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

3: File ../data/raw/202201W1-cart2.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

4: File ../data/raw/202201W1-cart4x.parquet --------------------------------------------------
Average rate of incoming data = 1.0 entries per second 

