# Split Final Data

- Split final/reduced data
- Store them into multiple JSON files
- Create appropriate folder structure for them

## Setup, Load, Explore

In [1]:
import pandas as pd
import os
import shutil

data_path = 'final-data.csv'
customer_fields = ['customerName', 'email', 'phone', 'birthDay', 'serialNumber',
                   'registrationDate', 'lastUpdateDate', 'shareWithResearchAsOfDate',
                   'shareWithPublicAsOfDate', 'shareWithFriendsAsOfDate']
accelerometer_fields = ['user', 'timestamp', 'x', 'y', 'z']
step_trainer_fields = ['sensorReadingTime', 'serialNumber', 'distanceFromObject']
reduced_data_path = '../final_data/'

# How many rows does each file have?
batch_size = 10000

In [2]:
df = pd.read_csv(data_path)
display(df.head())
display(df.info())

Unnamed: 0,customername,email,phone,birthday,serialnumber,registrationdate,lastupdatedate,sharewithresearchasofdate,sharewithpublicasofdate,sharewithfriendsasofdate,row_num,user,timestamp,x,y,z,sensorreadingtime,serialnumber.1,distancefromobject
0,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564444103,1.0,-1.0,-1.0,1655564444103,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,218
1,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564440170,0.0,1.0,-1.0,1655564440170,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,230
2,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564440170,-1.0,0.0,-1.0,1655564440170,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,230
3,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564416572,1.0,0.0,0.0,1655564416572,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,268
4,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564404773,-1.0,-1.0,0.0,1655564404773,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,244


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81273 entries, 0 to 81272
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customername               81273 non-null  object 
 1   email                      81273 non-null  object 
 2   phone                      81273 non-null  int64  
 3   birthday                   81273 non-null  object 
 4   serialnumber               81273 non-null  object 
 5   registrationdate           81273 non-null  int64  
 6   lastupdatedate             81273 non-null  int64  
 7   sharewithresearchasofdate  40981 non-null  float64
 8   sharewithpublicasofdate    41766 non-null  float64
 9   sharewithfriendsasofdate   43233 non-null  float64
 10  row_num                    81273 non-null  int64  
 11  user                       81273 non-null  object 
 12  timestamp                  81273 non-null  int64  
 13  x                          81273 non-null  flo

None

In [3]:
# There are 200 emails
df['email'].value_counts()

email
Angie.Mitra@test.com        90
Angie.Ahmed@test.com        90
Danny.Davis@test.com        90
Craig.Fibonnaci@test.com    90
Jaya.Mitra@test.com         89
                            ..
Edward.Clark@test.com       78
Spencer.Harris@test.com     78
Neeraj.Staples@test.com     78
Jacob.Jones@test.com        78
Jaya.Anandh@test.com        77
Name: count, Length: 956, dtype: int64

## Split the data into three DataFrame objects

In [4]:
fields_lower = [f.lower() for f in customer_fields]
c_df = df[fields_lower]
c_df = c_df.drop_duplicates(subset='email')
c_df = c_df.rename(columns=dict(zip(fields_lower, customer_fields)))
display(c_df.head())
display(c_df.info())
print(f"Num of rows with null shareWithResearchAsOfDate: {c_df[c_df['shareWithResearchAsOfDate'].isna()].shape[0]}")

Unnamed: 0,customerName,email,phone,birthDay,serialNumber,registrationDate,lastUpdateDate,shareWithResearchAsOfDate,shareWithPublicAsOfDate,shareWithFriendsAsOfDate
0,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0
11,Ben Khatib,Ben.Khatib@test.com,8015551212,1399-01-01,20400202-c1da-4328-8e94-5bacc7d61ecb,1655564414415,1655564414415,1655564000000.0,,1655564000000.0
20,Frank Hansen,Frank.Hansen@test.com,8015551212,1323-01-01,454a7430-d4ff-47cf-8666-7428f8a9894d,1655564131452,1655564131452,,,1655564000000.0
33,David Jones,David.Jones@test.com,8015551212,1111-01-01,fe784255-4faf-47ab-96f7-ea79c6e87f0c,1655564435763,1655564435763,,,
44,Edward Clayton,Edward.Clayton@test.com,8015551212,1754-01-01,3d798cef-bc66-4128-85af-2c9d36c251b9,1655564387419,1655564387419,,1655564000000.0,1655564000000.0


<class 'pandas.core.frame.DataFrame'>
Index: 956 entries, 0 to 31111
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerName               956 non-null    object 
 1   email                      956 non-null    object 
 2   phone                      956 non-null    int64  
 3   birthDay                   956 non-null    object 
 4   serialNumber               956 non-null    object 
 5   registrationDate           956 non-null    int64  
 6   lastUpdateDate             956 non-null    int64  
 7   shareWithResearchAsOfDate  482 non-null    float64
 8   shareWithPublicAsOfDate    491 non-null    float64
 9   shareWithFriendsAsOfDate   508 non-null    float64
dtypes: float64(3), int64(3), object(4)
memory usage: 82.2+ KB


None

Num of rows with null shareWithResearchAsOfDate: 474


In [5]:
fields_lower = [f.lower() for f in accelerometer_fields]
a_df = df[fields_lower]
# a_df = a_df.drop_duplicates(subset=['user', 'timestamp'])
a_df = a_df.drop_duplicates()
a_df = a_df.rename(columns=dict(zip(fields_lower, accelerometer_fields)))
display(a_df.head())
display(a_df.info())

Unnamed: 0,user,timestamp,x,y,z
0,Santosh.Clayton@test.com,1655564444103,1.0,-1.0,-1.0
1,Santosh.Clayton@test.com,1655564440170,0.0,1.0,-1.0
2,Santosh.Clayton@test.com,1655564440170,-1.0,0.0,-1.0
3,Santosh.Clayton@test.com,1655564416572,1.0,0.0,0.0
4,Santosh.Clayton@test.com,1655564404773,-1.0,-1.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81273 entries, 0 to 81272
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user       81273 non-null  object 
 1   timestamp  81273 non-null  int64  
 2   x          81273 non-null  float64
 3   y          81273 non-null  float64
 4   z          81273 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 3.1+ MB


None

In [6]:
fields_lower = [f.lower() for f in step_trainer_fields]
st_df = df[fields_lower]
# st_df = st_df.drop_duplicates(subset=['sensorreadingtime', 'serialnumber'])
st_df = st_df.drop_duplicates()
st_df = st_df.rename(columns=dict(zip(fields_lower, step_trainer_fields)))
display(st_df.head())
display(st_df.info())

Unnamed: 0,sensorReadingTime,serialNumber,distanceFromObject
0,1655564444103,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,218
1,1655564440170,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,230
3,1655564416572,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,268
4,1655564404773,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,244
5,1655564400840,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,219


<class 'pandas.core.frame.DataFrame'>
Index: 28680 entries, 0 to 81270
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sensorReadingTime   28680 non-null  int64 
 1   serialNumber        28680 non-null  object
 2   distanceFromObject  28680 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 896.2+ KB


None

## Verify Correctness

Build the combined table back from the DataFrame objects to verify their correctness

In [7]:
df['user'].value_counts()

user
Angie.Mitra@test.com        90
Angie.Ahmed@test.com        90
Danny.Davis@test.com        90
Craig.Fibonnaci@test.com    90
Jaya.Mitra@test.com         89
                            ..
Edward.Clark@test.com       78
Spencer.Harris@test.com     78
Neeraj.Staples@test.com     78
Jacob.Jones@test.com        78
Jaya.Anandh@test.com        77
Name: count, Length: 956, dtype: int64

In [8]:
# Perform the first join
first_merged_df = c_df.merge(a_df, left_on='email', right_on='user', how='inner')

# Perform the second join
merged_df = first_merged_df.merge(st_df,
                                  left_on=['serialNumber', 'timestamp'],
                                  right_on=['serialNumber', 'sensorReadingTime'], how='inner')

# Get the count of the filtered rows
count = len(merged_df)

print(count)

81273


In [9]:
merged_df.head()

Unnamed: 0,customerName,email,phone,birthDay,serialNumber,registrationDate,lastUpdateDate,shareWithResearchAsOfDate,shareWithPublicAsOfDate,shareWithFriendsAsOfDate,user,timestamp,x,y,z,sensorReadingTime,distanceFromObject
0,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,Santosh.Clayton@test.com,1655564444103,1.0,-1.0,-1.0,1655564444103,218
1,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,Santosh.Clayton@test.com,1655564444103,-1.0,0.0,-1.0,1655564444103,218
2,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,Santosh.Clayton@test.com,1655564444103,1.0,-1.0,0.0,1655564444103,218
3,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,Santosh.Clayton@test.com,1655564440170,0.0,1.0,-1.0,1655564440170,230
4,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,Santosh.Clayton@test.com,1655564440170,-1.0,0.0,-1.0,1655564440170,230


In [10]:
df.head()

Unnamed: 0,customername,email,phone,birthday,serialnumber,registrationdate,lastupdatedate,sharewithresearchasofdate,sharewithpublicasofdate,sharewithfriendsasofdate,row_num,user,timestamp,x,y,z,sensorreadingtime,serialnumber.1,distancefromobject
0,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564444103,1.0,-1.0,-1.0,1655564444103,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,218
1,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564440170,0.0,1.0,-1.0,1655564440170,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,230
2,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564440170,-1.0,0.0,-1.0,1655564440170,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,230
3,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564416572,1.0,0.0,0.0,1655564416572,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,268
4,Santosh Clayton,Santosh.Clayton@test.com,8015551212,1900-01-01,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,1655564376361,1655564376361,1655564000000.0,1655564000000.0,1655564000000.0,1,Santosh.Clayton@test.com,1655564404773,-1.0,-1.0,0.0,1655564404773,50f7b4f3-7af5-4b07-a421-7b902c8d2b7c,244


## Store Data

In [11]:
# How many rows does the old file have?

old_file_path = "../cleaned_data/step_trainer/landing"

def get_first_file(directory):
    """
    Return the first file in the specified directory.
    """
    # List all entries in the directory
    entries = os.listdir(directory)

    # Filter out directories, leaving only files
    files = [f for f in entries if os.path.isfile(os.path.join(directory, f))]

    # Return the first file, or None if there are no files
    return files[0] if files else None

first_file = get_first_file(old_file_path)
if first_file:
    print(f"The first file in the directory is: {first_file}")
    num_rows = pd.read_csv(os.path.join(old_file_path, first_file)).shape[0]
    print(f"Number of rows: {num_rows}")
else:
    print("The directory contains no files.")

The first file in the directory is: step_trainer-1655564446759.json
Number of rows: 29969


In [12]:
import time
print(int(time.time()*1000))

1691348231377


In [13]:
def clear_directory(directory):
    """Delete all files and folders in the specified directory."""

    if not os.path.exists(directory):
        return
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        
        # If the item is a file, delete it
        if os.path.isfile(file_path):
            os.remove(file_path)
        
        # If the item is a directory, delete it and its contents
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)

clear_directory(reduced_data_path)

def save_df(df, batch_size, filepath_prefix):
    """
    Save a Pandas DataFrame to multiple files based on the given batch size.

    Parameters:
    - df (DataFrame): The Pandas DataFrame to save.
    - batch_size (int): The number of rows per file.
    - filepath_prefix (str): The path for the output filenames.

    Returns:
    None
    """
    # Ensure the directory exists
    directory = os.path.dirname(filepath_prefix)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
        
    num_batches = len(df) // batch_size + (1 if len(df) % batch_size else 0)

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size

        # Slicing the dataframe
        subset = df.iloc[start_idx:end_idx]

        # Constructing the filename
        filepath = f"{filepath_prefix}{int(time.time()*1000)}.json"

        print(f"Write {start_idx} to {end_idx} to file {filepath}")

        # Saving the subset to a file
        subset.to_json(filepath, orient='records', lines=True)

save_df(c_df, batch_size, os.path.join(reduced_data_path, 'customer/landing/customer-'))
save_df(a_df, batch_size, os.path.join(reduced_data_path, 'accelerometer/landing/accelerometer-'))
save_df(st_df, batch_size, os.path.join(reduced_data_path, 'step_trainer/landing/step_trainer-'))

Write 0 to 10000 to file ../final_data/customer/landing/customer-1691348231425.json
Write 0 to 10000 to file ../final_data/accelerometer/landing/accelerometer-1691348231445.json
Write 10000 to 20000 to file ../final_data/accelerometer/landing/accelerometer-1691348231495.json
Write 20000 to 30000 to file ../final_data/accelerometer/landing/accelerometer-1691348231576.json
Write 30000 to 40000 to file ../final_data/accelerometer/landing/accelerometer-1691348231724.json
Write 40000 to 50000 to file ../final_data/accelerometer/landing/accelerometer-1691348231810.json
Write 50000 to 60000 to file ../final_data/accelerometer/landing/accelerometer-1691348231881.json
Write 60000 to 70000 to file ../final_data/accelerometer/landing/accelerometer-1691348231931.json
Write 70000 to 80000 to file ../final_data/accelerometer/landing/accelerometer-1691348231983.json
Write 80000 to 90000 to file ../final_data/accelerometer/landing/accelerometer-1691348232031.json
Write 0 to 10000 to file ../final_data