# Data Processing

In [2]:
import os
import pandas as pd
from dir_paths import DATA_DIR, WORKLOADS_PKL_PATH, RESULTS_PKL_PATH, RESULTS_DIR, PROCESSED_DATA_DIR
import re
from math import floor, log10

# Create the processed-data directory if it is missing; exist_ok=True makes
# re-running this cell a no-op when the directory is already there.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

## Workloads DataFrame Creation

In [73]:
# Build one row per workload file found directly inside DATA_DIR.
# File names are expected to look like
#   alpha_<float>_objects_<int>_requests_<int>            -> file_nr 0
#   alpha_<float>_objects_<int>_requests_<int>(<int>)     -> file_nr <int>
data_files_info = []

for fname in os.listdir(DATA_DIR):
    fpath = os.path.join(DATA_DIR, fname)
    if not os.path.isfile(fpath):
        # Sub-directories are not workload files.
        continue

    # Validate the name before opening the file, so an unexpected file yields
    # a clear error instead of an AttributeError from calling .groups() on None.
    match = re.match(r'alpha_([0-9.]+)_objects_(\d+)_requests_(\d+)(?:\((\d+)\))?', fname)
    if not match:
        raise Exception(
            f"Filename does not match expected pattern: {fname}")
    alpha, objects_theoretical, requests, file_nr = match.groups()

    with open(fpath, 'r') as f:
        # The number of distinct non-blank lines is what gets stored as
        # objects_actual below.
        items = set(line.strip() for line in f if line.strip())

    data_files_info.append({
        'alpha' : float(alpha),
        'objects_theoretical' : int(objects_theoretical),
        'requests' : int(requests),
        'file_nr' : int(file_nr) if file_nr else 0,
        'objects_actual': len(items)
    })

# Persist the table so later cells can reload it without re-reading every file.
workloads_df = pd.DataFrame(data_files_info)
workloads_df.to_pickle(WORKLOADS_PKL_PATH)

print(f"Alpha values: {sorted(workloads_df['alpha'].unique())}")
display(workloads_df)

Alpha values: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]


Unnamed: 0,alpha,objects_theoretical,requests,file_nr,objects_actual
0,0.6,30000,100000,2,26195
1,1.4,5000,100000,8,2620
2,1.0,4000,100000,4,3956
3,1.2,20000,100000,14,7712
4,0.2,5000,100000,2,5000
...,...,...,...,...,...
1595,1.0,4000,100000,0,3950
1596,0.4,20000,100000,9,19610
1597,1.6,50000,100000,4,1770
1598,1.6,5000,100000,14,1458


In [3]:
# Reload the workloads table from its pickle and list the distinct values of
# each workload parameter.
workloads_df = pd.read_pickle(WORKLOADS_PKL_PATH)

print("Details about the workloads:")
for heading, column in (
    ("Alpha values used:", 'alpha'),
    ("Theoretical objects used:", 'objects_theoretical'),
    ("Numbers of requests used:", 'requests'),
):
    print(heading, sorted(workloads_df[column].unique()))

Details about the workloads:
Alpha values used: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]
Theoretical objects used: [1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 40000, 50000]
Numbers of requests used: [100000]


## Results DataFrame Creation

In [72]:
def round_to_sigfig(x, sigfig=1):
    """Round ``x`` to ``sigfig`` significant figures.

    Args:
        x: Number to round.
        sigfig: How many significant figures to keep (default 1).

    Returns:
        ``0`` when ``x`` is zero, otherwise ``x`` rounded with the built-in
        ``round`` at the decimal position of its ``sigfig``-th leading digit.
    """
    if x == 0:
        # log10(0) is undefined, so zero is returned directly.
        return 0

    # Decimal exponent of the leading digit, e.g. 3 for 1234 and -3 for 0.00123.
    magnitude = int(floor(log10(abs(x))))
    ndigits = -magnitude + (sigfig - 1)
    return round(x, ndigits)

# Recover the relative cache size behind an absolute one: every candidate in
# relative_theoretical is mapped to floor(relative * working_set) and the
# absolute size is looked up in that mapping.
# NOTE(review): this assumes the simulations derived the absolute size by the
# same truncation ("rounding issue in simulations") — confirm against the simulator.
relative_theoretical = [0.0001, 0.0002, 0.0004, 0.0008, 0.001, 0.002, 0.003, 0.004, 0.008, 0.01, 0.02, 0.03, 0.04, 0.08, 0.1, 0.2, 0.3, 0.4, 0.8]
absolute_to_relative = {} # (absolute, working set, requests) : relative

def get_relative_size(absolute_size, working_set, requests):
    """Return the candidate relative cache size matching an absolute size.

    Results are memoised in ``absolute_to_relative`` under the key
    ``(absolute_size, working_set, requests)``.

    Args:
        absolute_size: Absolute cache size to translate.
        working_set: Number of objects the relative sizes are multiplied with.
        requests: Only used as part of the memoisation key.

    Returns:
        The entry of ``relative_theoretical`` whose
        ``floor(relative * working_set)`` equals ``absolute_size``. When
        several candidates truncate to the same absolute size (small working
        sets), the last one in ``relative_theoretical`` is returned.

    Raises:
        KeyError: If no candidate maps to ``absolute_size``.
    """
    key = (absolute_size, working_set, requests)
    if key not in absolute_to_relative:
        # Later candidates overwrite earlier ones on collisions (last wins).
        prod = {floor(relative * working_set) : relative for relative in relative_theoretical}
        if absolute_size not in prod:
            # Same exception type as the plain dict lookup, but with enough
            # context to identify the offending simulation.
            raise KeyError(
                f"No relative cache size in relative_theoretical maps to "
                f"absolute size {absolute_size} for working set {working_set} "
                f"({requests} requests)")
        absolute_to_relative[key] = prod[absolute_size]

    return absolute_to_relative[key]

# Parse every simulation result file in RESULTS_DIR into one row per
# (algorithm, cache size) line and persist the combined table.
workloads_df = pd.read_pickle(WORKLOADS_PKL_PATH)
results_data = []

for filename in os.listdir(RESULTS_DIR):
    match = re.match(
        r'alpha_([0-9.]+)_objects_(\d+)_requests_(\d+)(?:\((\d+)\))?', filename)
    if not match:
        raise Exception(
            f"Filename does not match expected pattern: {filename}")
    alpha, objects_theoretical, requests, file_nr = match.groups()
    file_nr = int(file_nr) if file_nr else 0

    # Look up the workload this result file belongs to. `requests` is part of
    # the file name and of workloads_df, so it is included in the match to
    # avoid picking a workload that differs only in its request count.
    matching_workloads = workloads_df.loc[
        (workloads_df['alpha'] == float(alpha)) &
        (workloads_df['objects_theoretical'] == int(objects_theoretical)) &
        (workloads_df['requests'] == int(requests)) &
        (workloads_df['file_nr'] == file_nr),
        'objects_actual'
    ]

    # Check for "no matching workload" first: indexing an empty selection
    # would raise an IndexError before the descriptive error below could fire.
    if matching_workloads.empty or not matching_workloads.iloc[0]:
        raise Exception(
            f"Something went wrong trying to get the actual number of objects. "
            f"Filename: {filename}, "
            f"Alpha: {alpha}, "
            f"Objects theoretical: {objects_theoretical}, "
            f"File nr: {file_nr}"
        )
    objects_actual = matching_workloads.iloc[0]

    filepath = os.path.join(RESULTS_DIR, filename)
    with open(filepath, "r") as f:
        for line in f:
            # Example line:
            # mySIEVE cache size      400B, 100000 req, miss ratio 0.5632, byte miss ratio 0.5632
            parts = line.strip().split(", ")

            if len(parts) != 4:
                raise Exception(
                    f"Something went wrong. Filename {filename}, Parts: {parts} (should have 4 parts)")

            m = re.match(
                r'.*?(\w+)\s+cache size\s+(\d+)B,\s+(\d+)\s+req,\s+miss ratio\s+([0-9.]+),',
                line
            )

            if not m:
                raise Exception(
                    f"Regex did not match. Line: {line}")

            algorithm, cache_size, reqs, miss_ratio = m.groups()
            relative_size = get_relative_size(int(cache_size), int(objects_actual), int(reqs))

            # Remove only a leading "my". str.strip("my") treats its argument
            # as a character set and would also eat trailing 'm'/'y'
            # characters of the algorithm name itself.
            if algorithm.startswith("my"):
                algorithm = algorithm[len("my"):]

            results_data.append({
                "algorithm": algorithm.upper(),
                "cache_size_absolute": int(cache_size),
                "cache_size_relative_rounded": relative_size,
                "alpha": float(alpha),
                "objects_actual": int(objects_actual),
                "objects_theoretical": int(objects_theoretical),
                "requests": int(reqs),
                "miss_ratio": float(miss_ratio),
                "file_nr": int(file_nr)
            })

results_df = pd.DataFrame(results_data)
results_df.to_pickle(RESULTS_PKL_PATH)

display(results_df)

Unnamed: 0,algorithm,cache_size_absolute,cache_size_relative_rounded,alpha,objects_actual,objects_theoretical,requests,miss_ratio,file_nr
0,FIFO,0,0.0002,1.2,4069,5000,100000,1.0000,1
1,FIFO,1,0.0004,1.2,4069,5000,100000,0.9382,1
2,FIFO,3,0.0008,1.2,4069,5000,100000,0.8525,1
3,FIFO,4,0.0010,1.2,4069,5000,100000,0.8205,1
4,FIFO,8,0.0020,1.2,4069,5000,100000,0.7293,1
...,...,...,...,...,...,...,...,...,...
79366,SIEVE,100,0.1000,0.8,1000,1000,100000,0.5013,16
79367,SIEVE,200,0.2000,0.8,1000,1000,100000,0.3930,16
79368,SIEVE,300,0.3000,0.8,1000,1000,100000,0.3061,16
79369,SIEVE,400,0.4000,0.8,1000,1000,100000,0.2432,16


In [68]:
# Reload the persisted results and report which algorithms and relative cache
# sizes occur in them.
results_df = pd.read_pickle(RESULTS_PKL_PATH)

summary_lines = (
    "Details about the simulation results:",
    f"Algorithms: {results_df['algorithm'].unique()}",
    f"Relative cache sizes: {sorted(results_df['cache_size_relative_rounded'].unique())}",
)
print(*summary_lines, sep="\n")

Details about the simulation results:
Algorithms: ['myFIFO' 'myLRU' 'mySIEVE']
Relative cache sizes: [0.0001, 0.0002, 0.0004, 0.0008, 0.001, 0.002, 0.004, 0.008, 0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.4, 0.8]


In [62]:
# Count the result rows of every (alpha, objects_theoretical, file_nr) run,
# then flag per (alpha, objects_theoretical) whether all of its runs share the
# same row count. Only the combinations where they do NOT are displayed.
print(
    "\nThese are the combinations for alpha and objects_theoretical where each synthetic workload had a different number of simulations.",
    "This is due to the objects_actual being different!",
    sep="\n",
)

grouped = (
    results_df
    .groupby(['alpha', 'objects_theoretical', 'file_nr'])
    .size()
    .reset_index(name='count')
)
# True where every file_nr of the combination produced the same number of rows.
check = grouped.groupby(['alpha', 'objects_theoretical'])['count'].nunique().eq(1)
check_df = check.reset_index()
display(check_df.loc[~check_df['count']])


These are the combinations for alpha and objects_theoretical where each synthetic workload had a different number of simulations.
This is due to the objects_actual being different!


Unnamed: 0,alpha,objects_theoretical,count
5,0.2,10000,False
23,0.6,4000,False
24,0.6,5000,False
32,0.8,3000,False
33,0.8,4000,False
41,1.0,2000,False
50,1.2,1000,False
59,1.2,50000,False
66,1.4,20000,False
67,1.4,30000,False


In [63]:
# Smallest relative cache size present in the results for each theoretical
# object count.
min_relative_cache_df = (
    results_df
    .groupby('objects_theoretical')['cache_size_relative_rounded']
    .min()
    .rename('min_relative_cache_size')
    .reset_index()
)
display(min_relative_cache_df)

Unnamed: 0,objects_theoretical,min_relative_cache_size
0,1000,0.0008
1,2000,0.0004
2,3000,0.0002
3,4000,0.0002
4,5000,0.0001
5,10000,0.0001
6,20000,0.0001
7,30000,0.0001
8,40000,0.0001
9,50000,0.0001


In [65]:
# Smallest relative cache size present in the results for each actual object
# count; shown once as a rich table and once in full as plain text.
min_relative_cache_df = (
    results_df
    .groupby('objects_actual')['cache_size_relative_rounded']
    .min()
    .rename('min_relative_cache_size')
    .reset_index()
)
display(min_relative_cache_df)
print(min_relative_cache_df.to_string())

Unnamed: 0,objects_actual,min_relative_cache_size
0,807,0.0010
1,813,0.0010
2,819,0.0010
3,821,0.0010
4,822,0.0010
...,...,...
990,42686,0.0001
991,42688,0.0001
992,42718,0.0001
993,42724,0.0001


     objects_actual  min_relative_cache_size
0               807                   0.0010
1               813                   0.0010
2               819                   0.0010
3               821                   0.0010
4               822                   0.0010
5               824                   0.0010
6               826                   0.0010
7               827                   0.0010
8               829                   0.0010
9               833                   0.0010
10              835                   0.0010
11              836                   0.0010
12              837                   0.0010
13              838                   0.0010
14              853                   0.0010
15              967                   0.0010
16              971                   0.0010
17              972                   0.0010
18              973                   0.0010
19              974                   0.0010
20              976                   0.0010
21        