In [27]:
len(timings)

10

In [28]:
timings

[{'read_and_filter': 0.002782583236694336,
  'timing_articles': ['{"load_models": 3.2682816982269287}',
   '{"segment": 0.022104740142822266, "entire_article": [{"tokenize_and_tensor": 0.02461385726928711, "sentence_prediction": 0.6678199768066406, "realign": 0.0006096363067626953, "sentence_get_entities": 0.13004708290100098, "sentence_result_json": 2.384185791015625e-07}, {"tokenize_and_tensor": 0.0009357929229736328, "sentence_prediction": 1.8331937789916992, "realign": 0.00048089027404785156, "sentence_get_entities": 0.003199338912963867, "sentence_result_json": 4.76837158203125e-07}, {"tokenize_and_tensor": 0.0008454322814941406, "sentence_prediction": 0.03347659111022949, "realign": 0.00035858154296875, "sentence_get_entities": 0.0013875961303710938, "sentence_result_json": 2.384185791015625e-07}, {"tokenize_and_tensor": 0.0007135868072509766, "sentence_prediction": 0.033489227294921875, "realign": 0.0003325939178466797, "sentence_get_entities": 0.0009908676147460938, "sentence_r

In [37]:
import json
import numpy as np

def process_timing_data(timings):
    # Create lists to store the timing data
    segment_times = []
    process_article_times = []
    write_article_times = []
    load_models_times = []
    entity_prediction_times = []
    batch_times = []
    sentence_times = []

    # Loop over each batch's timings
    for batch in timings:
        process_article_times.append(batch.get('process_articles', 0))
        write_article_times.append(batch.get('write_articles', 0))
        load_models_times.append(batch.get('load_models', 0))
        batch_times.append(batch.get('batch_time', 0))

        # Extract prediction times from timing_articles
        timing_articles = batch.get('timing_articles', [])
        for article_timing in timing_articles:
            article_data = json.loads(article_timing)

            if 'segment' in article_data:
                segment_times.append(article_data['segment'])
            elif 'load_models' in article_data:
                load_models_times.append(article_data['load_models'])

            for sentence in article_data.get('entire_article', []):
                sentence_times.extend(sentence.values())
                entity_prediction_time = sentence.get('sentence_prediction', 0)
                entity_prediction_times.append(entity_prediction_time)

    # Convert to numpy arrays for easier mathematical operations
    segment_times = np.array(segment_times)
    process_article_times = np.array(process_article_times)
    write_article_times = np.array(write_article_times)
    load_models_times = np.array(load_models_times)
    entity_prediction_times = np.array(entity_prediction_times)
    batch_times = np.array(batch_times)
    sentence_times = np.array(sentence_times)

    # Total number of files, articles, and sentences
    total_files = len(timings)
    total_articles = len(segment_times) + len(load_models_times)
    total_sentences = len(sentence_times)

    # Calculate statistics
    avg_segment_time = np.mean(segment_times)
    avg_process_article_time = np.mean(process_article_times)
    avg_write_article_time = np.mean(write_article_times)
    avg_load_models_time = np.mean(load_models_times)
    avg_entity_prediction_time = np.mean(entity_prediction_times)
    avg_sentence_time = np.mean(sentence_times)
    avg_batch_time = np.mean(batch_times)
    total_time = np.sum(batch_times)

    # Print statistics
    print(f"Total number of files: {total_files}")
    print(f"Total number of articles: {total_articles}")
    print(f"Total number of sentences: {total_sentences}")

    print(f"Average segmentation time per article: {avg_segment_time:.6f} seconds")
    print(f"Average time to process an article: {avg_process_article_time:.6f} seconds")
    print(f"Average time to write an article: {avg_write_article_time:.6f} seconds")
    print(f"Average time to load models: {avg_load_models_time:.6f} seconds")
    print(f"Average time for entity prediction: {avg_entity_prediction_time:.6f} seconds")
    print(f"Average time per sentence: {avg_sentence_time:.6f} seconds")
    print(f"Average time per batch: {avg_batch_time:.6f} seconds")
    print(f"Total processing time: {total_time:.2f} seconds, {total_time/60:.2f} minutes")

# Example usage
import json
import numpy as np

# Load timings data from JSON
with open('batch_timings.json', 'r') as f:
    timings = json.load(f)
    
process_timing_data(timings)

Total number of files: 61
Total number of articles: 790
Total number of sentences: 62545
Average segmentation time per article: 0.019474 seconds
Average time to process an article: 22.077229 seconds
Average time to write an article: 3.823938 seconds
Average time to load models: 1.572068 seconds
Average time for entity prediction: 0.044304 seconds
Average time per sentence: 0.009495 seconds
Average time per batch: 25.903057 seconds
Total processing time: 1580.09 seconds, 26.33 minutes
