# Data Processing

In [2]:
import os
import pandas as pd
from dir_paths import DATA_DIR, PROCESSED_DATA_DIR
import re

# Collect one metadata record per trace file found in DATA_DIR.
# The generation parameters are parsed from the file name, which must start with
# "alpha_<float>_objects_<int>_requests_<int>" optionally followed by "(<int>)".
# When the "(<int>)" suffix is absent, file_nr is recorded as 0.
data_files_info = []

for fname in os.listdir(DATA_DIR):
    fpath = os.path.join(DATA_DIR, fname)
    if not os.path.isfile(fpath):
        continue

    # Validate the name before touching the file so that an unexpected file
    # produces a clear error instead of "'NoneType' object has no attribute 'groups'".
    match = re.match(r'alpha_([0-9.]+)_objects_(\d+)_requests_(\d+)(?:\((\d+)\))?', fname)
    if not match:
        raise Exception(
            f"Filename does not match expected pattern: {fname}")
    alpha, objects_theoretical, requests, file_nr = match.groups()

    # objects_actual = number of distinct non-empty (stripped) lines in the file.
    # The file is closed as soon as the set has been built.
    with open(fpath, 'r') as f:
        items = set(line.strip() for line in f if line.strip())

    data_files_info.append({
        'alpha' : float(alpha),
        'objects_theoretical' : int(objects_theoretical),
        'requests' : int(requests),
        'file_nr' : int(file_nr) if file_nr else 0,
        'objects_actual': len(items)
    })

# Persist the table so it can be reloaded without re-reading the raw files.
data_df = pd.DataFrame(data_files_info)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
data_df.to_pickle(os.path.join(PROCESSED_DATA_DIR, "data_df.pkl"))
data_df

Unnamed: 0,alpha,objects_theoretical,requests,file_nr,objects_actual
0,0.6,30000,100000,2,26195
1,1.4,5000,100000,8,2620
2,1.0,4000,100000,4,3956
3,1.2,20000,100000,14,7712
4,0.2,5000,100000,2,5000
...,...,...,...,...,...
1595,1.0,4000,100000,0,3950
1596,0.4,20000,100000,9,19610
1597,1.6,50000,100000,4,1770
1598,1.6,5000,100000,14,1458


Create a DataFrame that contains all result data in a structured form.

In [3]:
import os
import pandas as pd
from dir_paths import RESULTS_DIR, PROCESSED_DATA_DIR
import re
from math import floor, log10


def round_to_sigfig(x, sigfig=1):
    """Round ``x`` to ``sigfig`` significant figures.

    Args:
        x: Number to round.
        sigfig: Number of significant figures to keep (default 1).

    Returns:
        ``x`` rounded with the built-in ``round``; ``0`` when ``x == 0``
        (log10 is undefined there, so zero is handled separately).
    """
    if x == 0:
        return 0
    # Position of the leading digit: 0 for [1, 10), -1 for [0.1, 1), 3 for [1000, 10000), ...
    magnitude = int(floor(log10(abs(x))))
    # Decimal places to keep; negative values round to the left of the decimal point.
    decimals = -magnitude + (sigfig - 1)
    return round(x, decimals)


# Parse every simulator result file in RESULTS_DIR into one row per
# (file, algorithm, cache size) and join in the measured number of distinct
# objects from data_df (built in the previous cell).
results_data = []

for filename in os.listdir(RESULTS_DIR):
    match = re.match(
        r'alpha_([0-9.]+)_objects_(\d+)_requests_(\d+)(?:\((\d+)\))?', filename)
    if not match:
        raise Exception(
            f"Filename does not match expected pattern: {filename}")
    alpha, objects_theoretical, requests, file_nr = match.groups()
    file_nr = int(file_nr) if file_nr else 0

    # Look up the trace this result belongs to. `requests` is part of the key so
    # that traces differing only in request count cannot be mixed up.
    matching_rows = data_df.loc[
        (data_df['alpha'] == float(alpha)) &
        (data_df['objects_theoretical'] == int(objects_theoretical)) &
        (data_df['requests'] == int(requests)) &
        (data_df['file_nr'] == file_nr),
        'objects_actual'
    ]
    # An empty selection is mapped to 0 so that a missing trace raises the
    # descriptive exception below instead of a bare IndexError.
    objects_actual = int(matching_rows.iloc[0]) if not matching_rows.empty else 0

    # Also guards the division by objects_actual further down.
    if not objects_actual:
        raise Exception(
            f"Something went wrong trying to get the actual number of objects. "
            f"Filename: {filename}, "
            f"Alpha: {alpha}, "
            f"Objects theoretical: {objects_theoretical}, "
            f"File nr: {file_nr}"
        )

    filepath = os.path.join(RESULTS_DIR, filename)
    with open(filepath, "r") as f:
        for line in f:
            # Example line:
            # mySieve cache size      400B, 100000 req, miss ratio 0.5632, byte miss ratio 0.5632
            parts = line.strip().split(", ")

            if len(parts) != 4:
                raise Exception(
                    f"Something went wrong. Filename {filename}, Parts: {parts} (should have 4 parts)")

            m = re.match(
                r'.*?(\w+)\s+cache size\s+(\d+)B,\s+(\d+)\s+req,\s+miss ratio\s+([0-9.]+),',
                line
            )

            if not m:
                raise Exception(
                    f"Regex did not match. Line: {line}")

            algorithm, cache_size, reqs, miss_ratio = m.groups()

            results_data.append({
                "algorithm": str(algorithm),
                "cache_size_absolute": int(cache_size),
                # Cache size as a fraction of the distinct objects in the trace,
                # rounded to one significant figure so repetitions share a value.
                "cache_size_relative_rounded": round_to_sigfig(int(cache_size) / objects_actual, 1),
                "alpha": float(alpha),
                "objects_actual": objects_actual,
                "objects_theoretical": int(objects_theoretical),
                "requests": int(reqs),
                "miss_ratio": float(miss_ratio),
                "file_nr": file_nr
            })

# Persist the table so it can be reloaded without re-parsing the result files.
results_df = pd.DataFrame(results_data)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
results_df.to_pickle(os.path.join(PROCESSED_DATA_DIR, "results_df.pkl"))

relative_cache_sizes = sorted(results_df['cache_size_relative_rounded'].unique())
print(f"Relative cache sizes: {relative_cache_sizes}")
results_df

Relative cache sizes: [0.003, 0.01, 0.03, 0.1, 0.2, 0.4, 0.8]


Unnamed: 0,algorithm,cache_size_absolute,cache_size_relative_rounded,alpha,objects_actual,objects_theoretical,requests,miss_ratio,file_nr
0,mySieve,122,0.03,1.2,4069,5000,100000,0.2277,1
1,mySieve,406,0.10,1.2,4069,5000,100000,0.1451,1
2,mySieve,813,0.20,1.2,4069,5000,100000,0.1066,1
3,mySieve,1627,0.40,1.2,4069,5000,100000,0.0713,1
4,mySieve,3255,0.80,1.2,4069,5000,100000,0.0444,1
...,...,...,...,...,...,...,...,...,...
23353,myLRU,800,0.80,0.8,1000,1000,100000,0.0801,16
23354,myFIFO,100,0.10,0.8,1000,1000,100000,0.6658,16
23355,myFIFO,200,0.20,0.8,1000,1000,100000,0.5261,16
23356,myFIFO,400,0.40,0.8,1000,1000,100000,0.3420,16


Compute summary metrics (count, mean, std, min, max) of the miss ratio for each experiment configuration

In [139]:
# Columns that are not grouping keys: the aggregated value itself plus the
# per-row columns (file number, absolute cache size, measured object count).
non_key_columns = ('miss_ratio', "file_nr", "cache_size_absolute", "objects_actual")
group_keys = [name for name in results_df.columns if name not in non_key_columns]

# Summarise the miss ratio over all rows that share the same grouping keys.
miss_ratio_by_config = results_df.groupby(group_keys)['miss_ratio']
metrics_df = miss_ratio_by_config.agg(['count', 'mean', 'std', 'min', 'max']).reset_index()

metrics_df

Unnamed: 0,algorithm,cache_size_relative_rounded,alpha,objects_theoretical,requests,count,mean,std,min,max
0,myFIFO,0.003,0.2,40000,100000,20,0.997125,0.000265,0.9968,0.9977
1,myFIFO,0.003,0.2,50000,100000,20,0.997305,0.000179,0.9970,0.9976
2,myFIFO,0.003,0.4,40000,100000,20,0.995860,0.000154,0.9956,0.9961
3,myFIFO,0.003,0.4,50000,100000,20,0.996080,0.000265,0.9955,0.9966
4,myFIFO,0.003,0.6,50000,100000,20,0.986740,0.000303,0.9862,0.9872
...,...,...,...,...,...,...,...,...,...,...
1168,mySieve,0.800,1.6,10000,100000,20,0.016260,0.000291,0.0158,0.0168
1169,mySieve,0.800,1.6,20000,100000,20,0.017255,0.000336,0.0167,0.0179
1170,mySieve,0.800,1.6,30000,100000,20,0.017680,0.000284,0.0169,0.0179
1171,mySieve,0.800,1.6,40000,100000,20,0.017840,0.000260,0.0173,0.0183
