In [1]:
import glob
import multiprocessing
import os
import sys
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from pandas.core.common import flatten
import uuid
from tqdm import tqdm
from datetime import datetime
import json

import pandas as pd
from pandas.core.frame import DataFrame

# Output locations: per-batch datasets and per-batch measurement statistics.
OUTPUT_BATCH_PATH = "data/batches"
OUTPUT_MEASUREMENT_STATS_PATH = "data/measurements"

# Create both output directories up front; already existing ones are left as-is.
for output_dir in (OUTPUT_BATCH_PATH, OUTPUT_MEASUREMENT_STATS_PATH):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

In [2]:
# The experiment name is fixed for this notebook; an earlier revision read it
# from sys.argv[1] or prompted for it with input().
experimentname = "gtbf"
extracted_path = os.path.join("./data/extract/", experimentname)

# Take one snapshot of the extract directory, then split the entries into
# sub-directories (data folders) and plain files (invocation files).
scanned_dir = list(os.scandir(extracted_path))
datafolders = [entry for entry in scanned_dir if entry.is_dir()]
invocation_files = [entry for entry in scanned_dir if entry.is_file()]

# Size of the thread pool used when processing a batch.
workers = multiprocessing.cpu_count()

In [3]:
# Create week batches: group the invocation files by ISO calendar week.
# Keys have the form "<ISO year>-<ISO week>", values are lists of file entries.
batches = {}
for invocation_file in invocation_files:
    file_name = Path(invocation_file).stem
    # The first eight characters of the file name are parsed as the date (YYYYMMDD).
    invocation_date = datetime.strptime(file_name[:8], "%Y%m%d")
    # Pair the ISO week with the ISO year (not the calendar year): around New
    # Year the two differ, e.g. 2022-01-01 belongs to ISO week 52 of 2021, and
    # mixing them would merge days from different years into one batch.
    iso_year, cw = invocation_date.isocalendar()[:2]
    batch_key = f"{iso_year}-{cw}"
    batches.setdefault(batch_key, []).append(invocation_file)

In [4]:
# Load the deployment configuration of the experiment.
with open('src/driver/deploy_config_full.json') as json_file:
    experiment_data = json.load(json_file)

# One "<provider>_<region>" key per configured provider location.
valid_regions = [
    f"{pr['provider']}_{pr['region']}"
    for pr in experiment_data['experiment-provider-locations']
]

In [5]:
# Display the week batches built above (batch key -> list of invocation file entries).
batches

{'2021-33': [<DirEntry '20210822T080004402.json'>,
  <DirEntry '20210816T210004302.json'>,
  <DirEntry '20210817T133003781.json'>,
  <DirEntry '20210817T193005584.json'>,
  <DirEntry '20210822T180003595.json'>,
  <DirEntry '20210822T083003088.json'>,
  <DirEntry '20210819T113004110.json'>,
  <DirEntry '20210816T190003776.json'>,
  <DirEntry '20210818T123003104.json'>,
  <DirEntry '20210817T043003166.json'>,
  <DirEntry '20210818T000004491.json'>,
  <DirEntry '20210821T163002646.json'>,
  <DirEntry '20210816T100003443.json'>,
  <DirEntry '20210819T173003176.json'>,
  <DirEntry '20210816T023003375.json'>,
  <DirEntry '20210822T093003598.json'>,
  <DirEntry '20210818T063003526.json'>,
  <DirEntry '20210820T130004363.json'>,
  <DirEntry '20210816T213003715.json'>,
  <DirEntry '20210822T143003641.json'>,
  <DirEntry '20210822T213004412.json'>,
  <DirEntry '20210819T120003647.json'>,
  <DirEntry '20210821T050028547.json'>,
  <DirEntry '20210819T023004159.json'>,
  <DirEntry '20210818T1300049

In [6]:
def measure_error(folders):
    """Count how many of the given folders belong to each valid region.

    Each folder name (its path stem) is stripped of its first 19 characters
    and lower-cased; the remainder is looked up in the module-level
    ``valid_regions`` list. The 19-character prefix is presumably the
    "<invocation>_" part of the folder name - confirm against the data layout.

    Args:
        folders: Iterable of path-like objects naming the data folders.

    Returns:
        dict mapping every entry of ``valid_regions`` to the number of folders
        whose remainder matches it; regions without a folder map to 0, and
        folders that match no valid region are ignored.
    """
    counts = dict.fromkeys(valid_regions, 0)
    region_keys = (Path(folder).stem[19:].lower() for folder in folders)
    for region_key in region_keys:
        if region_key in counts:
            counts[region_key] += 1
    return counts

In [7]:
# Module-level placeholder kept from the original cell; process_batch works on
# its own local list and never reads this one.
folders = []
def process_batch(batch_file):
    """Collect the result CSVs of all data folders belonging to one invocation file.

    Data folders are taken from the module-level ``datafolders`` list and are
    matched by name prefix: the first 12 characters of the invocation file
    name. Folder names are expected to look like
    "<invocation>_<provider>_<region>".

    Args:
        batch_file: Entry of the driver invocation file to process; only its
            ``.name`` attribute is used.

    Returns:
        dict with two keys:
          "df": DataFrame holding the rows of every matching folder, each
              prefixed with the columns driver_invocation, workload_invocation,
              provider, region, dirty_measurement and folder_uuid. A folder
              without the expected CSV contributes a single row whose "error"
              column is 'missing csv file'. Empty when no folder matches.
          "measurements": per-region folder counts from ``measure_error``.
    """
    # Use the file handed to this call. Reading a shared global here would make
    # every worker process the same (first) file of the batch.
    batch_filename = batch_file.name
    pattern = batch_filename[:12]
    folders = [df for df in datafolders if df.name.startswith(pattern)]

    measurement = measure_error(folders)
    # NOTE(review): the flag is True when every valid region has at least one
    # folder, which reads like "complete" rather than "dirty". The original
    # value is preserved here - confirm the intended polarity.
    dirty_measurement = all(measurement.values())

    dfs = []
    for folder in folders:
        # maxsplit=2 keeps any further underscores inside the region part.
        invocation, provider, region = Path(folder).stem.split("_", 2)
        file = os.path.join(folder, "saafdemo-basicExperiment-0MBs-run0.csv")
        if os.path.exists(file):
            df = pd.read_csv(file, skiprows=4)
            # Drop last row --> contains metadata
            df = df.iloc[:-1, :]
        else:
            # A one-row frame keeps the error marker; inserting a scalar into
            # an empty frame would yield zero rows and silently drop it.
            df = pd.DataFrame({"error": ["missing csv file"]})

        df.insert(0, "folder_uuid", str(uuid.uuid1()))
        df.insert(0, "dirty_measurement", dirty_measurement)
        df.insert(0, "region", region)
        df.insert(0, "provider", provider)
        df.insert(0, "workload_invocation", invocation)
        df.insert(0, "driver_invocation", batch_filename)
        dfs.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    combined = pd.concat(dfs) if dfs else pd.DataFrame()
    return {"df": combined, "measurements": measurement}

In [None]:
# Process in batches: every week batch is turned into two parquet files, the
# combined dataset and the per-invocation measurement statistics.
for batch_no, (batch, batch_files) in enumerate(batches.items(), start=1):
    batch_file_length = len(batch_files)
    batch_id = f"{batch}-{batch_file_length}"
    batch_parquet = f"{batch_id}.parquet"
    batch_output = os.path.join(OUTPUT_BATCH_PATH, batch_parquet)
    measurement_output = os.path.join(OUTPUT_MEASUREMENT_STATS_PATH, batch_parquet)
    print(f"Processing batch {batch_id} - {batch_no} of {len(batches)}")

    # Skip only when BOTH outputs exist. The measurement file is written after
    # the batch file, so a run interrupted between the two writes would
    # otherwise never produce the measurement statistics.
    if os.path.exists(batch_output) and os.path.exists(measurement_output):
        print('batch already processed, skipping')
        continue

    # Invocation files are processed concurrently; map() keeps the input order.
    with ThreadPoolExecutor(max_workers=workers) as tpe:
        batch_frames = list(tqdm(tpe.map(process_batch, batch_files), total=batch_file_length))
    batch_dataset = pd.concat([bf['df'] for bf in batch_frames])
    measurement_dataset = pd.DataFrame([bf['measurements'] for bf in batch_frames])
    batch_dataset = batch_dataset.sort_values(by=['driver_invocation', 'workload_invocation', 'provider', 'region', '1_run_id', '2_thread_id'])
    batch_dataset.to_parquet(batch_output)
    measurement_dataset.to_parquet(measurement_output)

Processing batch 2021-33-336 - 1 of 11


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 336/336 [04:35<00:00,  1.22it/s]


Processing batch 2021-35-336 - 2 of 11


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 336/336 [04:40<00:00,  1.20it/s]


Processing batch 2021-36-336 - 3 of 11


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 336/336 [04:36<00:00,  1.21it/s]


Processing batch 2021-37-336 - 4 of 11


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 336/336 [04:49<00:00,  1.16it/s]


Processing batch 2021-34-336 - 5 of 11


 69%|███████████████████████████████████████████████████████████████████████████████████████▋                                       | 232/336 [03:13<00:58,  1.79it/s]