## **Check** the remarks on the `eda_model_cart.ipynb` notebook. They also apply here!

In [None]:
import pandas as pd

# SECTION 3: Statistical Summary

## Load the dataset using pandas and get a summary of the number of machines for each of the four types.

In [None]:
# Machine type whose raw files are analysed; one of: lidar, m, cart, speedo.
f_type = 'speedo'
# One identifier per raw file of the selected machine type.
ID = list('ABC')

In [None]:
# Read one parquet file per identifier in ID for the selected machine type,
# keeping the frames in the same order as ID.
f = [
    pd.read_parquet('../data/raw/202201W1-' + f_type + file_id + '.parquet', engine='pyarrow')
    for file_id in ID
]
# Individual handles for the three frames (unpacking requires exactly three).
A, B, C = f

# SECTION 4: Defining the Schemas

## Get the schemas for each of the four types of machines. Note: "schema" here means the structure of the data as it would be defined in a database, i.e., the field names and the datatype of each field. Observe the datatypes of each field.

In [None]:
# Print the shared schema of all loaded files, or a warning if they differ.
#
# Two files share a schema only when they have the same column names with the
# same dtypes, in the same order. Comparing (name, dtype) pairs catches renamed
# or reordered columns that a dtype-only comparison would let through.
if f:  # nothing to compare (or print) when no files were loaded
    reference_schema = list(f[0].dtypes.items())
    if all(list(file.dtypes.items()) == reference_schema for file in f[1:]):
        # All schemas are identical, so any file's dtypes describe them all.
        print(f[-1].dtypes)
    else:
        print('Files do not have a common schema')

# SECTION 5: Exploring the dataset. 

## 5a: Get a statistical summary of the dataset.

In [None]:
# For every loaded file, print a banner naming its source path followed by the
# table produced by DataFrame.describe().
for idx, frame in enumerate(f):
    banner = f"{idx + 1}: File ../data/raw/202201W1-{f_type}{ID[idx]}.parquet {'-' * 50}"
    print(banner)
    summary = frame.describe()
    print(summary, '\n')

## 5b: Get the number of missing values, null values, or NaN values.

In [None]:
print('The number of NaN values in each dataframe is:')
for idx, frame in enumerate(f):
    # First sum() counts per column, second sum() adds the columns together.
    nan_total = frame.isna().sum().sum()
    print(f"File ../data/raw/202201W1-{f_type}{ID[idx]}.parquet : {nan_total}")

In [None]:
print('The number of null values in each dataframe is:')
for idx, frame in enumerate(f):
    # DataFrame.isnull is an alias of DataFrame.isna; the double sum() collapses
    # the boolean mask to a single total per file.
    null_total = frame.isnull().sum().sum()
    print(f"File ../data/raw/202201W1-{f_type}{ID[idx]}.parquet : {null_total}")

## 5c: Establish the ranges of each variable, i.e., get the min and max for each field.

In [None]:
# Print the min and max of every orderable field in each file.
for i, file in enumerate(f):
    print(str(i+1) +': File ../data/raw/202201W1-' + f_type + ID[i] + '.parquet ' + '-'*50)
    # Select all numeric and datetime columns rather than float64 only, so that
    # integer, float32 and timestamp fields are not silently left out of the
    # range report.
    ranged_fields = file.select_dtypes(include=['number', 'datetime', 'datetimetz'])
    print(ranged_fields.agg(['min', 'max']), '\n')

## 5d: Get the rate at which data arrives. Use the timestamp field in the dataset.

In [None]:
# Print the average number of entries per second for each file, derived from
# the 'timestamp' column (assumed to hold datetime values -- TODO confirm).
for i, file in enumerate(f):
    print(str(i+1) +': File ../data/raw/202201W1-' + f_type + ID[i] + '.parquet ' + '-'*50)
    timestamps = file['timestamp']
    # Use (max - min) / (count - 1) instead of the mean of consecutive diffs:
    # the mean of diffs reduces to (last row - first row) / (n - 1), which is
    # the covered time span only when rows are sorted chronologically.
    span_seconds = (timestamps.max() - timestamps.min()).total_seconds()
    intervals = int(timestamps.count()) - 1
    if intervals > 0 and span_seconds > 0:
        rate = intervals / span_seconds
    else:
        # Fewer than two timestamps, or all timestamps identical: the rate is
        # undefined, so report NaN rather than dividing by zero.
        rate = float('nan')
    print('Average rate of incoming data = ' +
          str(rate) + ' entries per second', '\n')