In [3]:
import json

with open('batch_timings_no_dask.json', 'r') as f:
    timings = json.load(f)




In [4]:
len(timings)

10

In [5]:
len(timings[0])

6

In [6]:
timings[0].keys()

dict_keys(['batch', 'read_and_filter', 'timing_articles', 'process_articles', 'write_articles', 'batch_time'])

In [7]:
with open('batch_timings_dask.json', 'r') as f:
    timings = json.load(f)

In [8]:
len(timings)

10

In [9]:
timings[1]

{'batch': 1,
 'read_and_filter': 0.001463174819946289,
 'timing_articles': ['[{"load_models": 3.4263627529144287}, {"segment": 0.06183314323425293, "entire_article": [{"tokenize_and_tensor": 0.02331233024597168, "sentence_prediction": 0.8606388568878174, "realign": 0.0003037452697753906, "sentence_get_entities": 0.1118166446685791, "sentence_result_json": 2.384185791015625e-07}, {"tokenize_and_tensor": 0.0006234645843505859, "sentence_prediction": 1.8398139476776123, "realign": 0.00024366378784179688, "sentence_get_entities": 0.0003440380096435547, "sentence_result_json": 2.384185791015625e-07}, {"tokenize_and_tensor": 0.0006351470947265625, "sentence_prediction": 0.033391475677490234, "realign": 0.0004944801330566406, "sentence_get_entities": 0.0024437904357910156, "sentence_result_json": 2.384185791015625e-07}, {"tokenize_and_tensor": 0.0007305145263671875, "sentence_prediction": 0.033454179763793945, "realign": 0.00043892860412597656, "sentence_get_entities": 0.0018553733825683594, 

In [27]:
import json
import numpy as np

def process_timing_data(timings):
    # Create lists to store the timing data
    segment_times = []
    process_article_times = []
    write_article_times = []
    load_models_times = []
    entity_prediction_times = []
    batch_times = []
    sentence_times = []

    # Loop over each batch's timings
    for batch in timings:
        process_article_times.append(batch.get('process_articles', 0))
        write_article_times.append(batch.get('write_articles', 0))
        load_models_times.append(batch.get('load_models', 0))
        batch_times.append(batch.get('batch_time', 0))

        # Extract prediction times from timing_articles
        timing_articles = batch.get('timing_articles', [])
        for article_timing in timing_articles:
            article_data = json.loads(article_timing)

            if 'segment' in article_data:
                segment_times.append(article_data['segment'])
            elif 'load_models' in article_data:
                load_models_times.append(article_data['load_models'])
#             print(len(article_data))
            
            for article in article_data:
                if 'entire_article' in article:
#                     article = json.loads(article)
                    for sentence in article['entire_article']:
                        sentence_times.extend(sentence.values())
                        entity_prediction_time = sentence['sentence_prediction']
                        entity_prediction_times.append(entity_prediction_time)

    # Convert to numpy arrays for easier mathematical operations
    segment_times = np.array(segment_times)
    process_article_times = np.array(process_article_times)
    write_article_times = np.array(write_article_times)
    load_models_times = np.array(load_models_times)
    entity_prediction_times = np.array(entity_prediction_times)
    batch_times = np.array(batch_times)
    sentence_times = np.array(sentence_times)

    # Total number of files, articles, and sentences
    total_files = len(timings)
    total_articles = len(segment_times) + len(load_models_times)
    total_sentences = len(sentence_times)

    # Calculate statistics
    avg_segment_time = np.mean(segment_times)
    avg_process_article_time = np.mean(process_article_times)
    avg_write_article_time = np.mean(write_article_times)
    avg_load_models_time = np.mean(load_models_times)
    avg_entity_prediction_time = np.mean(entity_prediction_times)
    avg_sentence_time = np.mean(sentence_times)
    avg_batch_time = np.mean(batch_times)
    total_time = np.sum(batch_times)

    # Print statistics
    print(f"Total number of files: {total_files}")
    print(f"Total number of articles: {total_articles}")
    print(f"Total number of sentences: {total_sentences}")

    print(f"Average segmentation time per article: {avg_segment_time:.6f} seconds")
    print(f"Average time to process an article: {avg_process_article_time:.6f} seconds")
    print(f"Average time to write an article: {avg_write_article_time:.6f} seconds")
    print(f"Average time to load models: {avg_load_models_time:.6f} seconds")
    print(f"Average time for entity prediction: {avg_entity_prediction_time:.6f} seconds")
    print(f"Average time per sentence: {avg_sentence_time:.6f} seconds")
    print(f"Average time per batch: {avg_batch_time:.6f} seconds")
    print(f"Total processing time: {total_time:.2f} seconds, {total_time/60:.2f} minutes")

# Example usage
import json
import numpy as np


for item in [('No Dask', 'batch_timings_no_dask.json'),
            ('Dask + Model', 'batch_timings_dask.json'),
            ('Dask + TorchServe', 'batch_timings_dask_ts.json'),
            ('Dask + TorchServe + Batch', 'batch_timings_dask_ts_batch.json')]:
    detail, file = item
    print('--'*10, detail, '--'*10, '\n')
    # Load timings data from JSON
    with open(file, 'r') as f:
        timings = json.load(f)
    process_timing_data(timings)

-------------------- No Dask -------------------- 

Total number of files: 10
Total number of articles: 10
Total number of sentences: 10115
Average segmentation time per article: nan seconds
Average time to process an article: 24.058415 seconds
Average time to write an article: 3.503594 seconds
Average time to load models: 0.000000 seconds
Average time for entity prediction: 0.045121 seconds
Average time per sentence: 0.009668 seconds
Average time per batch: 27.564041 seconds
Total processing time: 275.64 seconds, 4.59 minutes
-------------------- Dask + Model -------------------- 

Total number of files: 10
Total number of articles: 10
Total number of sentences: 10115
Average segmentation time per article: nan seconds
Average time to process an article: 23.852114 seconds
Average time to write an article: 3.368521 seconds
Average time to load models: 0.000000 seconds
Average time for entity prediction: 0.045865 seconds
Average time per sentence: 0.009825 seconds
Average time per batch: