In [1]:
import os
import pandas as pd
import numpy as np
import glob
import time
from pandas.errors import EmptyDataError

In [2]:
def clean(path):
    """Rewrite the file at ``path`` in place, keeping only well-formed rows.

    The first line is always kept. Every later line is kept only when the
    text before its first comma consists solely of digits; all other lines
    are dropped.
    """
    with open(path, 'r') as source:
        rows = source.readlines()

    # Index 0 is the first line of the file and is kept unconditionally.
    kept = [
        row
        for position, row in enumerate(rows)
        if position == 0 or row.split(",")[0].isdigit()
    ]

    with open(path, 'w') as sink:
        sink.writelines(kept)

In [31]:
def compute(path):
    """Load one result file and annotate it with run metadata and per-name statistics.

    The last three components of ``path`` are read as
    ``<cluster>/<compiler>/<file>``; both ``/`` and ``\\`` are accepted as
    separators. The file name without its extension must consist of six
    dash-separated fields: ``node-benchmark-cores-run-iterations-delay``.

    The file is read as CSV and must provide the columns ``iteration``,
    ``time`` and ``name`` (leading spaces in header names are ignored).
    A trailing integer in ``name`` is moved into a new ``chunk size`` column.
    The columns ``mean``, ``median``, ``std``, ``min`` and ``max`` hold the
    statistic of ``time`` over all rows sharing the same ``name``.

    Parameters
    ----------
    path : str
        Location of the result file.

    Returns
    -------
    pandas.DataFrame or None
        The annotated table, or ``None`` when the file is empty (a message
        is printed in that case).

    Raises
    ------
    ValueError
        If the path has fewer than three components or the file name does
        not split into exactly six dash-separated fields.
    """
    # glob yields "\\"-separated paths on Windows and "/"-separated ones
    # elsewhere; normalise so the same notebook runs on both.
    parts = path.replace("\\", "/").split("/")
    cluster, compiler, filename = parts[-3:]
    filename = os.path.splitext(filename)[0]  # drop the extension
    node, benchmark, cores, run, iterations, delay = filename.split("-")

    try:
        table = pd.read_csv(path)
    except EmptyDataError:
        print("Empty file %s" % path)
        return None

    # Header names may carry leading spaces after the comma separator.
    table.columns = table.columns.str.lstrip()

    # Metadata taken from the directory layout and the file name.
    table["cluster"] = cluster
    table["compiler"] = compiler
    table["node"] = node
    table["benchmark"] = benchmark
    table["cores"] = cores
    table["run"] = run
    table["iterations"] = iterations
    table["delay"] = delay

    # Coerce types; unparsable values become NaN rather than raising.
    table["iteration"] = pd.to_numeric(table["iteration"], errors='coerce')
    table["delay"] = pd.to_numeric(table["delay"], errors='coerce')
    table["time"] = pd.to_numeric(table["time"], errors='coerce')
    table["name"] = table["name"].astype(str).str.strip()

    # Split "<name> <chunk size>" into its two parts; the numeric suffix is
    # optional, in which case the chunk size is NaN.
    parsed = table["name"].str.extract(r"((?:[ ]?[a-zA-Z_]+)+)(?: (\d+))?")
    table["name"] = parsed[0]
    table["chunk size"] = pd.to_numeric(parsed[1], errors='coerce')

    # String kernel names select pandas' own groupby implementations
    # (sample standard deviation for "std"), which is what passing the
    # NumPy / builtin callables resolved to, without the deprecation.
    times_by_name = table.groupby("name")["time"]
    for stat in ("mean", "median", "std", "min", "max"):
        table[stat] = times_by_name.transform(stat)
    return table

In [29]:
# Every .txt file below the working directory, searched recursively,
# in sorted order.
paths = glob.glob('./**/*.txt', recursive=True)
paths.sort()

In [32]:
savepath = "archer2-1000i-schedbench.csv"

# Write the header only when the output file does not exist yet; otherwise
# append to it. Assigning the flag unconditionally keeps it defined when
# the file is already present.
new = not os.path.exists(savepath)

for path in paths:
    clean(path)  # rewrites the input file in place before it is parsed
    df = compute(path)
    if df is None:
        # compute() returns None for an empty input file: nothing to save.
        continue
    if new:
        df.to_csv(savepath, index=False, mode='w')
        new = False
    else:
        df.to_csv(savepath, index=False, mode='a', header=False)