# Data Processing

In [30]:
import os
import pandas as pd
from dir_paths import DATA_DIR, WORKLOADS_PKL_PATH, RESULTS_PKL_PATH, RESULTS_DIR, PROCESSED_DATA_DIR
import re
from math import floor, log10

os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

## Workloads DataFrame Creation

In [31]:
data_files_info = []

for fname in os.listdir(DATA_DIR):
    fpath = os.path.join(DATA_DIR, fname)
    if os.path.isfile(fpath):
        with open(fpath, 'r') as f:
            match = re.match(r'alpha_([0-9.]+)_objects_(\d+)_requests_(\d+)(?:\((\d+)\))?', fname)
            alpha, objects_theoretical, requests, file_nr = match.groups()
            items = set(line.strip() for line in f if line.strip())
            data_files_info.append({
                'alpha' : float(alpha),
                'objects_theoretical' : int(objects_theoretical),
                'requests' : int(requests),
                'file_nr' : int(file_nr) if file_nr else 0,
                'objects_actual': len(items)
            })

workloads_df = pd.DataFrame(data_files_info)
workloads_df.to_pickle(WORKLOADS_PKL_PATH)

print(f"Alpha values: {sorted(workloads_df['alpha'].unique())}")
display(workloads_df)

Alpha values: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]


Unnamed: 0,alpha,objects_theoretical,requests,file_nr,objects_actual
0,0.6,30000,100000,2,26195
1,1.4,5000,100000,8,2620
2,1.0,4000,100000,4,3956
3,1.2,20000,100000,14,7712
4,0.2,5000,100000,2,5000
...,...,...,...,...,...
1595,1.0,4000,100000,0,3950
1596,0.4,20000,100000,9,19610
1597,1.6,50000,100000,4,1770
1598,1.6,5000,100000,14,1458


## Results DataFrame Creation

In [32]:
def round_to_sigfig(x, sigfig=1):
    if x == 0:
        return 0
    return round(x, -int(floor(log10(abs(x)))) + (sigfig - 1))


results_data = []

for filename in os.listdir(RESULTS_DIR):
    match = re.match(
        r'alpha_([0-9.]+)_objects_(\d+)_requests_(\d+)(?:\((\d+)\))?', filename)
    if not match:
        raise Exception(
            f"Filename does not match expected pattern: {filename}")
    alpha, objects_theoretical, requests, file_nr = match.groups()
    file_nr = int(file_nr) if file_nr else 0
    objects_actual = workloads_df.loc[
        (workloads_df['alpha'] == float(alpha)) &
        (workloads_df['objects_theoretical'] == int(objects_theoretical)) &
        (workloads_df['file_nr'] == int(file_nr))
    ]['objects_actual'].values[0]

    if not objects_actual:
        raise Exception(
            f"Something went wrong trying to get the actual number of objects. "
            f"Filename: {filename}, "
            f"Alpha: {alpha}, "
            f"Objects theoretical: {objects_theoretical}, "
            f"File nr: {file_nr}"
        )

    filepath = os.path.join(RESULTS_DIR, filename)
    with open(filepath, "r") as f:
        for line in f:
            # Example line:
            # mySIEVE cache size      400B, 100000 req, miss ratio 0.5632, byte miss ratio 0.5632
            parts = line.strip().split(", ")

            if len(parts) != 4:
                raise Exception(
                    f"Something went wrong. Filename {filename}, Parts: {parts} (should have 4 parts)")

            m = re.match(
                r'.*?(\w+)\s+cache size\s+(\d+)B,\s+(\d+)\s+req,\s+miss ratio\s+([0-9.]+),',
                line
            )

            if not m:
                raise Exception(
                    f"Regex did not match. Line: {line}")

            algorithm, cache_size, reqs, miss_ratio = m.groups()

            results_data.append({
                "algorithm": str(algorithm),
                "cache_size_absolute": int(cache_size),
                "cache_size_relative_rounded": round_to_sigfig(int(cache_size) / int(objects_actual), 1),
                "alpha": float(alpha),
                "objects_actual": int(objects_actual),
                "objects_theoretical": int(objects_theoretical),
                "requests": int(reqs),
                "miss_ratio": float(miss_ratio),
                "file_nr": int(file_nr)
            })

results_df = pd.DataFrame(results_data)
results_df.to_pickle(RESULTS_PKL_PATH)

print(f"Algorithms: {results_df['algorithm'].unique()}")
print(f"Relative cache sizes: {sorted(results_df['cache_size_relative_rounded'].unique())}")

display(results_df)

Algorithms: ['myFIFO' 'myLRU' 'mySIEVE']
Relative cache sizes: [0.0, 5e-05, 6e-05, 7e-05, 8e-05, 9e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.002, 0.003, 0.004, 0.007, 0.008, 0.01, 0.02, 0.03, 0.04, 0.08, 0.1, 0.2, 0.3, 0.4, 0.8]


Unnamed: 0,algorithm,cache_size_absolute,cache_size_relative_rounded,alpha,objects_actual,objects_theoretical,requests,miss_ratio,file_nr
0,myFIFO,0,0.0000,1.2,4069,5000,100000,1.0000,1
1,myFIFO,1,0.0002,1.2,4069,5000,100000,0.9382,1
2,myFIFO,3,0.0007,1.2,4069,5000,100000,0.8525,1
3,myFIFO,4,0.0010,1.2,4069,5000,100000,0.8205,1
4,myFIFO,8,0.0020,1.2,4069,5000,100000,0.7293,1
...,...,...,...,...,...,...,...,...,...
79606,mySIEVE,200,0.2000,0.8,1000,1000,100000,0.3930,16
79607,mySIEVE,300,0.3000,0.8,1000,1000,100000,0.3061,16
79608,mySIEVE,400,0.4000,0.8,1000,1000,100000,0.2432,16
79609,mySIEVE,800,0.8000,0.8,1000,1000,100000,0.0730,16
