# Verify Tables in the STEDI Human Balance Analytics Project

In this notebook, we are going to verify the number of rows in all stages of the tables in this project.

The rubric requires you to have the tables at various stages to have the following number of rows:

- Landing
  - Customer: 956
  - Accelerometer: 81273
  - Step Trainer: 28680
- Trusted
  - Customer: 482
  - Accelerometer: 40981
  - Step Trainer: 14460
- Curated
  - Customer: 482
  - Machine Learning: 38403
 
Let's now see if the numbers of rows are correct when combined via pandas.

In [12]:
import os
import pandas as pd
import numpy as np

cust_dir = "starter/customer/landing"
acc_dir = "starter/accelerometer/landing"
st_dir = "starter/step_trainer/landing"

def load(path):
    df = pd.DataFrame()'int64'
    for filename in os.listdir(path):
        if filename.endswith('.json'):
            file_path = os.path.join(path, filename)
            df_ = pd.read_json(file_path, lines=True)
            df = pd.concat([df, df_], ignore_index=True)
    return df
cdf = load(cust_dir)
adf = load(acc_dir)
sdf = load(st_dir)

In [2]:
display(adf.head())
adf.info()

Unnamed: 0,user,timestamp,x,y,z
0,Craig.Wu@test.com,2022-06-18 15:00:55.341,1,1,0
1,Craig.Wu@test.com,2022-06-18 15:00:34.143,0,-1,-1
2,Craig.Wu@test.com,2022-06-18 15:00:12.945,1,0,0
3,Craig.Staples@test.com,2022-06-18 15:01:02.272,-1,0,0
4,Craig.Staples@test.com,2022-06-18 15:00:59.006,-1,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81273 entries, 0 to 81272
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user       81273 non-null  object        
 1   timestamp  81273 non-null  datetime64[ns]
 2   x          81273 non-null  int64         
 3   y          81273 non-null  int64         
 4   z          81273 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 3.1+ MB


In [3]:
print("Number of customer landing rows:", cdf.shape[0])
print("Number of accelerometer landing rows:", adf.shape[0])
print("Number of step trainer landing rows:", sdf.shape[0])

Number of customer landing rows: 956
Number of accelerometer landing rows: 81273
Number of step trainer landing rows: 28680


In [4]:
# customer trusted
cdf_t = cdf[~cdf['shareWithResearchAsOfDate'].isna()]
print("Number of customer trusted rows:", cdf_t.shape[0])

Number of customer trusted rows: 482


In [5]:
# accelerometer trusted
adf_t = adf.merge(cdf_t, how='inner', left_on='user', right_on='email')
adf_t = adf_t[adf.columns]
print("Number of accelerometer trusted rows:", adf_t.shape[0])

Number of accelerometer trusted rows: 40981


In [6]:
# customer curated
cdf_c = cdf_t.merge(adf_t, how='inner', left_on='email', right_on='user')
cdf_c = cdf_c[cdf.columns].drop_duplicates()
print("Number of customer curated rows:", cdf_c.shape[0])

Number of customer curated rows: 482


In [7]:
# step trainer trusted
sdf_t = sdf.merge(cdf_c, how='inner', left_on='serialNumber', right_on='serialNumber')
sdf_t = sdf_t[sdf.columns]
sdf_t = sdf_t.drop_duplicates()
print("Number of step trainer trusted rows:", sdf_t.shape[0])

Number of step trainer trusted rows: 14460


In [23]:
adf_t = adf.merge(cdf_t, how='inner', left_on='user', right_on='email')
adf_t = adf_t[adf.columns]
print("Number of accelerometer trusted rows:", adf_t.shape[0])

Number of accelerometer trusted rows: 40981


In [24]:
# machine learning curated
adf_t['timestamp1'] = (adf_t['timestamp'].astype(np.int64)/1000000).astype(np.int64)
mdf = sdf_t.merge(adf_t, how='inner', left_on='sensorReadingTime', right_on='timestamp1')
print("Number of machine learning curated rows:", mdf.shape[0])

Number of machine learning curated rows: 38403


In [25]:
sdf_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14460 entries, 0 to 14459
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sensorReadingTime   14460 non-null  int64 
 1   serialNumber        14460 non-null  object
 2   distanceFromObject  14460 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 339.0+ KB


In [26]:
adf_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40981 entries, 0 to 40980
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user        40981 non-null  object        
 1   timestamp   40981 non-null  datetime64[ns]
 2   x           40981 non-null  int64         
 3   y           40981 non-null  int64         
 4   z           40981 non-null  int64         
 5   timestamp1  40981 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 1.9+ MB


In [27]:
adf_t.drop_duplicates('timestamp').info()

<class 'pandas.core.frame.DataFrame'>
Index: 14005 entries, 0 to 40971
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user        14005 non-null  object        
 1   timestamp   14005 non-null  datetime64[ns]
 2   x           14005 non-null  int64         
 3   y           14005 non-null  int64         
 4   z           14005 non-null  int64         
 5   timestamp1  14005 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 765.9+ KB


In [29]:
sdf_t.drop_duplicates('sensorReadingTime').info()

<class 'pandas.core.frame.DataFrame'>
Index: 14005 entries, 0 to 14459
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sensorReadingTime   14005 non-null  int64 
 1   serialNumber        14005 non-null  object
 2   distanceFromObject  14005 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 437.7+ KB


In [30]:
sdf_t.drop_duplicates('sensorReadingTime') \
    .merge(adf_t.drop_duplicates('timestamp'), 
           how='inner', left_on='sensorReadingTime', 
           right_on='timestamp1')

Unnamed: 0,sensorReadingTime,serialNumber,distanceFromObject,user,timestamp,x,y,z,timestamp1
0,1655564172120,98ec1419-44b7-4016-a584-777f96115cce,246,Ben.Davis@test.com,2022-06-18 14:56:12.120,1,0,-1,1655564172120
1,1655564157056,98ec1419-44b7-4016-a584-777f96115cce,230,Ben.Davis@test.com,2022-06-18 14:55:57.056,1,-1,-1,1655564157056
2,1655564115630,98ec1419-44b7-4016-a584-777f96115cce,237,Ben.Davis@test.com,2022-06-18 14:55:15.630,0,0,0,1655564115630
3,1655564209780,98ec1419-44b7-4016-a584-777f96115cce,273,Ben.Davis@test.com,2022-06-18 14:56:49.780,1,-1,-1,1655564209780
4,1655564202248,98ec1419-44b7-4016-a584-777f96115cce,255,Ben.Davis@test.com,2022-06-18 14:56:42.248,1,0,-1,1655564202248
...,...,...,...,...,...,...,...,...,...
12314,1655564417351,de5eb3c3-0c41-41f4-b2d3-1ced6c233327,219,Jacob.Davis@test.com,2022-06-18 15:00:17.351,0,1,-1,1655564417351
12315,1655564485244,de5eb3c3-0c41-41f4-b2d3-1ced6c233327,282,Jacob.Davis@test.com,2022-06-18 15:01:25.244,-1,-1,-1,1655564485244
12316,1655564436749,de5eb3c3-0c41-41f4-b2d3-1ced6c233327,266,Jacob.Davis@test.com,2022-06-18 15:00:36.749,1,1,0,1655564436749
12317,1655564475545,de5eb3c3-0c41-41f4-b2d3-1ced6c233327,219,Jacob.Davis@test.com,2022-06-18 15:01:15.545,1,1,0,1655564475545


In [20]:
adf_t.sort_values(['user', 'timestamp']).head()

Unnamed: 0,user,timestamp,x,y,z
17854,Angie.Abram@test.com,1655564379262,-1,-1,0
17863,Angie.Abram@test.com,1655564379262,-1,-1,-1
17877,Angie.Abram@test.com,1655564379262,0,1,-1
17799,Angie.Abram@test.com,1655564382595,0,1,0
17875,Angie.Abram@test.com,1655564382595,0,-1,-1


In [17]:
sdf_t.head()

Unnamed: 0,sensorReadingTime,serialNumber,distanceFromObject
0,1655564172120,98ec1419-44b7-4016-a584-777f96115cce,246
1,1655564157056,98ec1419-44b7-4016-a584-777f96115cce,230
2,1655564115630,98ec1419-44b7-4016-a584-777f96115cce,237
3,1655564213546,98ec1419-44b7-4016-a584-777f96115cce,216
4,1655564209780,98ec1419-44b7-4016-a584-777f96115cce,273


In [18]:
sdf_t.

serialNumber
98ec1419-44b7-4016-a584-777f96115cce    30
03d010f1-20dd-45dd-b406-357b1ba0564a    30
466bb1b9-5dc3-4607-8f76-e0bcdcf6b9c7    30
b5f6b144-0260-4c4f-83e1-469150f9055b    30
dbfde3b7-69bf-4c56-a9e8-dfd48af6df89    30
                                        ..
facde9be-1073-4959-bbb9-72040129a29b    30
358dea74-8e8a-43be-b48c-f4e799802888    30
abe1cdc7-94f5-4e17-aa55-1342bd7de357    30
23fcbd51-9a56-48d1-8d4a-46a7426f4eb5    30
de5eb3c3-0c41-41f4-b2d3-1ced6c233327    30
Name: count, Length: 482, dtype: int64