In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import multiprocess as mp
from tqdm import tqdm
import time

import tensorflow as tf
print('TF: {}'.format(tf.__version__))

import tensorflow_data_validation as tfdv
print('Data Validation: {}'.format(tfdv.__version__))

# Project imports
from molicel_cycle_life_prediction.dataset.p_series_discharge_cpd_dataset import PSeriesDischargeCPDDataset
from molicel_cycle_life_prediction.dataset.p_series_charge_cc_cvc_dataset import PSeriesChargeCCCVCDataset
from molicel_cycle_life_prediction.feature_generator.incremental_calculation import IncrementalCalculationFeatureGenerator
from molicel_cycle_life_prediction.feature_generator.envelope_based import EnvelopeFeatureGenerator
from molicel_cycle_life_prediction.feature_generator.discharge_sum_feature_generator import DischargeSumFeatureGenerator
from molicel_cycle_life_prediction.feature_generator.severson_nature import SeversonNature


TF: 2.11.0
Data Validation: 1.12.0


In [4]:
# build list of files 
p42_data_path = '/Users/jcheung/Library/CloudStorage/OneDrive-SharedLibraries-BEYONDLIMITS,INC/' \
                       'Molicel - General/06 Results/IPYNB_input_files/Raw_files_csvs/P42/'
p28_data_path = '/Users/jcheung/Library/CloudStorage/OneDrive-SharedLibraries-BEYONDLIMITS,INC/' \
                       'Molicel - General/02 Data/raw/18650-P28A/'
p42_files = {}
p28_files = {}
for file_type in ['train', 'test']:
    p42_files[file_type] = [f'{p42_data_path}/{file_type}/{x}' for x in os.listdir(f'{p42_data_path}/{file_type}/')]
    p28_files[file_type] = [f'{p28_data_path}/{file_type}/{x}' for x in os.listdir(f'{p28_data_path}/{file_type}/')]

### 1.1 Dataset building

In [5]:
LABEL_COLUMN = 'Retention(%)' 
def features_and_labels(row_data):
    label = row_data.pop(LABEL_COLUMN)
    features=row_data
    
    return features, label
    
def create_dataset(filename):
    discharge_class = PSeriesDischargeCPDDataset()
    di_fe = DischargeSumFeatureGenerator()
    en_fe = EnvelopeFeatureGenerator()
    sn = SeversonNature(model_type="full")  # set model_type to 'variance', 'discharge', or 'full'
  
    # instantiate top level class
    discharge = discharge_class.load_data(data_filepaths=filename, para_filepaths=[])
    df = discharge[0]
    
    # generate features from sub-level feature generators
    discharge_sum, discharge_sum_meta = di_fe.generate_features(df)
    env_fe, env_fe_meta = en_fe.generate_features(df)
    severson_fe, severson_meta = sn.generate_features(df)
    
    # compile feature generator outputs to dataframe
    discharge_df = pd.DataFrame(discharge_sum, columns=discharge_sum_meta['column_headers'])
    severson_df = pd.DataFrame(data=severson_fe, columns=severson_meta["column_headers"])
    env_df = pd.DataFrame(env_fe, columns=env_fe_meta['column_headers'])

    # package generated features into one dataframe
    discharge_df_features = ['File Name', 'Cyc#', 'R', 'mAH', 'Retention(%)']
    raw_df = env_df.merge(discharge_df[discharge_df_features], on=['File Name', 'Cyc#'])
    raw_df = raw_df.merge(severson_df, on=['File Name'])
    raw_df['Cyc#'] = raw_df['Cyc#'].astype(int)
    
    return raw_df

def divide_chunks(l, num_chunks):    
    # looping till length l
    for i in range(0, num_chunks):
        yield l[i::num_chunks]

def df_to_tf_dataset(df):
    tf_ds = tf.data.Dataset.from_tensor_slices(dict(df.drop(columns=['File Name']).astype(float)))
    tf_ds = tf_ds.map(features_and_labels)
    return tf_ds

In [7]:
# divide files up equally into lists based on cpus
all_files = {}
cpus_to_use = mp.cpu_count() - 1
for file_type in ['train', 'test']:
    p28 = list(divide_chunks(p28_files[file_type], num_chunks=cpus_to_use))
    p42 = list(divide_chunks(p42_files[file_type], num_chunks=cpus_to_use))
    all_files[file_type] = [p28[x] + p42[x] for x in range(0, cpus_to_use)]

In [8]:
# running in parallel completes in 418s
# running in serial completes in 1660s
# 4x speed increase by running in parallel 
start = time.time()
final_df = {}
with mp.Pool(processes=cpus_to_use) as pool:
    for file_type in ['train', 'test']:
        results = pool.map(create_dataset, all_files['train'])
        final_df[file_type] = pd.concat(results)
        
# tf_ds = df_to_tf_dataset(pd.concat(results))
end_parallel = time.time() - start

Iterating over samples for base feature generation...
Iterating over samples for base feature generation...
Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...

  0%|                                                     | 0/13 [00:00<?, ?it/s]




  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...
Iterating over samples for base feature generation...

  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...



  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...Iterating over samples for base feature generation...

  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...



  0%|                                                     | 0/13 [00:00<?, ?it/s]




100%|████████████████████████████████████████████| 12/12 [01:32<00:00,  7.67s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:14<00:00, 10.32s/it]
 69%|███████████████████████████████▏             | 9/13 [02:14<00:20,  5.05s/it]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:15<00:00, 10.45s/it]
 92%|████████████████████████████████████████▌   | 12/13 [02:16<00:02,  2.67s/it]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:17<00:00, 10.55s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:18<00:00, 10.66s/it]
100%|████████████████████████████████████████████| 13/13 [02:18<00:00, 10.68s/it]


Iterating over samples for envelope-based feature generation...


 15%|██████▉                                      | 2/13 [00:04<00:24,  2.24s/it]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:19<00:00, 10.71s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:20<00:00, 10.77s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 12/12 [01:03<00:00,  5.31s/it]
 62%|███████████████████████████▋                 | 8/13 [00:16<00:10,  2.09s/it]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 12/12 [00:00<00:00, 146.67it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 12/12 [00:01<00:00, 10.00it/s]
 69%|███████████████████████████████▏             | 9/13 [00:18<00:08,  2.09s/it]

Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 12/12 [00:00<00:00, 160.31it/s]
100%|████████████████████████████████████████████| 13/13 [00:26<00:00,  2.03s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 151.21it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 12.39it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 178.74it/s]
100%|████████████████████████████████████████████| 13/13 [00:27<00:00,  2.09s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 160.30it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:26<00:00,  2.06s/it]
 92%|████████████████████████████████████████▌   | 12/13 [00:24<00:02,  2.03s/it]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 182.65it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 11.96it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 197.54it/s]
100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 12.09it/s]


Iterating over samples for capacity change feature generation...


100%|████████████████████████████████████████████| 13/13 [00:26<00:00,  2.05s/it]
100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 176.92it/s]
100%|████████████████████████████████████████████| 13/13 [00:26<00:00,  2.07s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 179.04it/s]


Iterating over samples for cycle slope and intercept calculation feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for dQV difference calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 161.07it/s]
  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:26<00:00,  2.05s/it]
 31%|█████████████▊                               | 4/13 [00:00<00:00, 12.62it/s]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 191.27it/s]
 46%|████████████████████▊                        | 6/13 [00:00<00:00, 13.16it/s]

Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:26<00:00,  2.05s/it]
 92%|████████████████████████████████████████▌   | 12/13 [00:00<00:00, 13.87it/s]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 13.44it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 176.07it/s]
100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 190.79it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 12.83it/s]


Iterating over samples for capacity change feature generation...


 54%|████████████████████████▏                    | 7/13 [02:47<01:01, 10.20s/it]
100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 13.35it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 219.04it/s]
100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 15.15it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 261.64it/s]
100%|████████████████████████████████████████████| 13/13 [02:52<00:00, 13.30s/it]
 92%|████████████████████████████████████████▌   | 12/13 [02:53<00:02,  2.59s/it]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:53<00:00, 13.36s/it]
100%|████████████████████████████████████████████| 13/13 [02:53<00:00, 13.38s/it]


Iterating over samples for envelope-based feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [00:14<00:00,  1.10s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 274.98it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:14<00:00,  1.08s/it]
100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 21.69it/s]


Iterating over samples for capacity change feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 286.72it/s]
100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 264.63it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 21.39it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 315.96it/s]
100%|████████████████████████████████████████████| 13/13 [00:32<00:00,  2.47s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 250.32it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 16.92it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 275.16it/s]


Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for base feature generation...


100%|████████████████████████████████████████████| 12/12 [01:32<00:00,  7.74s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:15<00:00, 10.43s/it]
 92%|████████████████████████████████████████▌   | 12/13 [02:15<00:02,  2.52s/it]

Iterating over samples for envelope-based feature generation...


 85%|█████████████████████████████████████▏      | 11/13 [02:17<00:06,  3.28s/it]
100%|████████████████████████████████████████████| 13/13 [02:17<00:00, 10.58s/it]
 92%|████████████████████████████████████████▌   | 12/13 [02:17<00:02,  2.28s/it]

Iterating over samples for envelope-based feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:18<00:00, 10.68s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:19<00:00, 10.76s/it]
100%|████████████████████████████████████████████| 13/13 [02:19<00:00, 10.77s/it]
  8%|███▍                                         | 1/13 [00:02<00:25,  2.16s/it]

Iterating over samples for envelope-based feature generation...


  0%|                                                     | 0/13 [00:00<?, ?it/s]

Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:20<00:00, 10.81s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 12/12 [01:04<00:00,  5.41s/it]
 62%|███████████████████████████▋                 | 8/13 [00:17<00:10,  2.19s/it]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 12/12 [00:00<00:00, 153.55it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 12/12 [00:01<00:00,  8.01it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 12/12 [00:00<00:00, 146.30it/s]
100%|████████████████████████████████████████████| 13/13 [00:27<00:00,  2.13s/it]
 92%|████████████████████████████████████████▌   | 12/13 [00:25<00:02,  2.14s/it]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 119.79it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 11.15it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 134.47it/s]
100%|████████████████████████████████████████████| 13/13 [00:28<00:00,  2.18s/it]

 92%|████████████████████████████████████████▌   | 12/13 [00:25<00:02,  2.18s/it]

Iterating over samples for cycle slope and intercept calculation feature generation...
Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 170.55it/s]



Iterating over samples for dQV difference calculation feature generation...Iterating over samples for dQV difference calculation feature generation...



100%|████████████████████████████████████████████| 13/13 [00:28<00:00,  2.17s/it]
100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 12.04it/s]


Iterating over samples for capacity change feature generation...


100%|████████████████████████████████████████████| 13/13 [00:01<00:00, 11.51it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 172.95it/s]
100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 173.60it/s]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 167.73it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:27<00:00,  2.15s/it]
 31%|█████████████▊                               | 4/13 [00:00<00:00, 12.52it/s]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 205.48it/s]
100%|████████████████████████████████████████████| 13/13 [00:28<00:00,  2.15s/it]
 46%|████████████████████▊                        | 6/13 [00:00<00:00, 12.57it/s]

Iterating over samples for dQV difference calculation feature generation...


 62%|███████████████████████████▋                 | 8/13 [00:00<00:00, 12.85it/s]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 175.25it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 13.04it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 195.21it/s]
100%|████████████████████████████████████████████| 13/13 [00:28<00:00,  2.17s/it]
 46%|████████████████████▊                        | 6/13 [00:00<00:00, 13.40it/s]

Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 193.02it/s]
 92%|████████████████████████████████████████▌   | 12/13 [00:00<00:00, 13.39it/s]

Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 13.78it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 219.50it/s]
100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 13.54it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 243.33it/s]
100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 14.93it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 236.16it/s]
100%|████████████████████████████████████████████| 13/13 [02:55<00:00, 13.47s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:55<00:00, 13.49s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [02:56<00:00, 13.55s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 13/13 [00:14<00:00,  1.10s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...

100%|████████████████████████████████████████████| 13/13 [00:14<00:00,  1.08s/it]




100%|████████████████████████████████████████████| 13/13 [00:14<00:00,  1.09s/it]
100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 270.97it/s]


Iterating over samples for dQV difference calculation feature generation...
Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 267.20it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 19.95it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 294.15it/s]
100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 19.57it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 277.52it/s]
100%|████████████████████████████████████████████| 13/13 [00:32<00:00,  2.47s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 233.97it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 13/13 [00:00<00:00, 14.82it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 243.94it/s]


In [19]:
start = time.time()
final_df = {}
for file_type in ['train', 'test']:
    results = create_dataset(p42_files[file_type] + p28_files[file_type])
    final_df[file_type] = results
end_serial = time.time() - start

Iterating over samples for base feature generation...


100%|██████████████████████████████████████████| 142/142 [13:14<00:00,  5.60s/it]


Iterating over samples for envelope-based feature generation...


100%|██████████████████████████████████████████| 142/142 [07:02<00:00,  2.98s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|█████████████████████████████████████████| 142/142 [00:01<00:00, 117.33it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|██████████████████████████████████████████| 142/142 [00:47<00:00,  3.00it/s]


Iterating over samples for capacity change feature generation...


100%|█████████████████████████████████████████| 142/142 [00:01<00:00, 111.23it/s]


Iterating over samples for base feature generation...


100%|████████████████████████████████████████████| 35/35 [05:40<00:00,  9.73s/it]


Iterating over samples for envelope-based feature generation...


100%|████████████████████████████████████████████| 35/35 [00:36<00:00,  1.04s/it]


Iterating over samples for cycle slope and intercept calculation feature generation...


100%|███████████████████████████████████████████| 35/35 [00:00<00:00, 231.30it/s]


Iterating over samples for dQV difference calculation feature generation...


100%|████████████████████████████████████████████| 35/35 [00:03<00:00, 10.57it/s]


Iterating over samples for capacity change feature generation...


100%|███████████████████████████████████████████| 35/35 [00:00<00:00, 245.79it/s]


### 1.2 Dataset Validation

In [30]:
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=final_df['train'])
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=final_df['test'])

tfdv.visualize_statistics(
  lhs_statistics=train_stats, lhs_name='TRAIN_DATASET',
  rhs_statistics=test_stats, rhs_name='TEST_DATASET')


In [None]:
training_dir = '/Users/jcheung/Library/CloudStorage/OneDrive-SharedLibraries-BEYONDLIMITS,INC/' \
                       'Molicel - General/06 Results/IPYNB_input_files/Raw_files_csvs/P42/mini_train'

CSV_COL_TYPE = {'Step time': tf.float32, 
          'V': tf.float32, 'I': tf.float32, 
          'P': tf.float32, 'R': tf.float32, 
          'T': tf.float32, 'mAH': tf.float32,
          'WH': tf.float32, 'End Status': tf.string,
          'Action': tf.string, 'Advance Cycle': tf.int32}

def create_dataset(pattern, mode='eval'):
    # batch default set to 1 so we can filter after
    dataset = tf.data.experimental.make_csv_dataset(file_pattern = pattern,
                                                    batch_size=1,
                                                    header=True,
                                                    select_columns = CSV_COL_TYPE.keys(),
                                                    column_defaults=CSV_COL_TYPE.values())
    if mode == 'train':
        dataset = dataset.shuffle(1000).repeat()
        
    return dataset

In [None]:
# Define reducer
# Reducer requires 3 functions - init_func, reduce_func, finalize_func. 
# init_func - to define initial value
# reducer_func - operation to perform on values with same key
# finalize_func - value to return in the end.
def init_func(_):
    return 0.0

def reduce_func(state, value):
    return state + value['features']

def finalize_func(state):
    return state

retention_reducer = tf.data.experimental.Reducer(init_func, reduce_func, finalize_func)

In [None]:

dataset = create_dataset(pattern=f"{training_dir}/*.csv", mode='train')
dataset = dataset.unbatch().filter(lambda x: x["Action"] == 'CP\D' and (x["End Status"] == 'ET') or (x["End Status"] == 'EV'))

# dataset.map(lambda x: tf.group_by_reducer(x['Advance Cycle']: ))


# dataset = dataset.filter(lambda x: tf.reduce_all(tf.equal(x['End Status'], ['EV', 'ET'])))
dataset = dataset.batch(5)


In [None]:
for x in dataset.take(1):
    

In [None]:
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

In [None]:
# print sample from the csv dataset
for features in dataset.take(1):
    for i, (name, value) in enumerate(features.items()):
        if i > 10:
            break
        print(f"{name:20s}: {value}")
    print(f"[total: {len(features)} features]")

### 1.2 Dataset mode preprocessing


In [None]:
data

In [None]:
    df_origin_sum_ccd = df_origin_ccd.copy()[
        (df_origin_ccd['End Status'] == 'EV') | (df_origin_ccd['End Status'] == 'ET')]
    df_origin_cpd = row_data[(row_data['Action'] == 'CP\D')]
    df_origin_sum_cpd = df_origin_cpd.copy()[
        (df_origin_cpd['End Status'] == 'EV') | (df_origin_cpd['End Status'] == 'ET')]
    df_origin_cccvc = row_data[(row_data['Action'] == 'CC-CV\C')]
    df_origin_sum_cccvc = df_origin_cccvc.copy()[(df_origin_cccvc['End Status'] == 'EC')]  # Cut-off Current (EC)



### 1.2 TF Data Validation

In [None]:
train_dataset