# Dataset Creation

This notebook transforms the raw eviction traces from policy model training (evictions.txt files) into a form that can provide meaningful information to the RAG implemented in RAG_application.ipynb. The final dataset is structured as a dictionary where each trace name is a key that indexes both a pandas dataframe containing numerical data on that trace and textual metadata, which includes overall performance metrics for the trace (e.g. hit rate) and descriptions of the policy model architecture and workload.

## Function to Generate Dataframe for a Single Trace

In [1]:
import pandas as pd
import json
import numpy as np
import re
import os

def load_and_modify_data(file_path, max_cache_set_size, disassembly_file, source_directory):

    def get_prefix(pc):
        # pc is likely in form "0x401a0f", we extract "401a0"
        return pc[2:2+5]

    def find_function_code(func_name, directory):
        # Regex to find where the function definition starts
        pattern = re.compile(r'^\s*[a-zA-Z_][a-zA-Z0-9_*\s]*\b' + re.escape(func_name) + r'\s*\(', re.MULTILINE)
        for filename in os.listdir(directory):
            if filename.endswith('.c') or filename.endswith('.h'):
                filepath = os.path.join(directory, filename)
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                    # Find function start matches
                    for match in pattern.finditer(content):
                        start_index = match.start()
                        brace_pos = content.find('{', start_index)
                        if brace_pos == -1:
                            continue

                        # Count braces to determine function end
                        brace_count = 0
                        end_pos = brace_pos
                        for i, ch in enumerate(content[brace_pos:], start=brace_pos):
                            if ch == '{':
                                brace_count += 1
                            elif ch == '}':
                                brace_count -= 1
                                if brace_count == 0:
                                    end_pos = i
                                    break

                        function_code = content[start_index:end_pos+1].strip()
                        return function_code
        return None

    # Read eviction data
    rows = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                row = json.loads(line.strip())  # Parse each line as JSON
                rows.append(row)
            except json.JSONDecodeError:
                continue  # Skip malformed lines

    df = pd.DataFrame(rows)

    # Rename columns for clarity
    df.rename(columns={
        'pc': 'program_counter',
        'address': 'memory_address',
        'set_id': 'cache_set_id',
        'cache_lines': 'current_cache_lines',
        'access_history': 'recent_access_history',
        'evict': 'evict',
        'cache_line_scores': 'cache_line_eviction_scores'
    }, inplace=True)

    # Modify 'evict' column
    df['evict'] = df['evict'].apply(lambda x: 'Cache Miss' if x else 'Cache Hit')

    # Extract current cache line addresses
    def extract_cache_line_addresses(cache_lines):
        if isinstance(cache_lines, list) and len(cache_lines) > 0:
            return [entry[0] for entry in cache_lines]  # Extract only the addresses
        return []

    df['current_cache_line_addresses'] = df['current_cache_lines'].apply(extract_cache_line_addresses)

    # Preprocessing for recency and reuse distance
    last_access_time = {}  
    address_access_indices = {}
    for idx, address in enumerate(df['memory_address']):
        address_access_indices.setdefault(address, []).append(idx)

    # Build next access time mapping
    next_access_time_mapping = {}
    for address, indices in address_access_indices.items():
        for i, current_idx in enumerate(indices):
            if i + 1 < len(indices):
                next_idx = indices[i + 1]
                reuse_distance = next_idx - current_idx
            else:
                reuse_distance = 'not accessed again'
            next_access_time_mapping[current_idx] = reuse_distance

    # Mapping from cache_set_id to indices
    cache_set_access_indices = {}
    for idx, cache_set_id in enumerate(df['cache_set_id']):
        cache_set_access_indices.setdefault(cache_set_id, []).append(idx)

    evicted_addresses = []
    accessed_address_recency = []
    accessed_address_reuse_distance = []
    evicted_address_reuse_distance = []
    miss_types = []

    for idx, row in df.iterrows():
        memory_address = row['memory_address']
        cache_set_id = row['cache_set_id']
        evict_status = row['evict']  
        current_cache_lines = row['current_cache_line_addresses']

        # Determine recency
        if memory_address in last_access_time:
            recency = idx - last_access_time[memory_address]
        else:
            recency = 'First Access'
        accessed_address_recency.append(recency)

        # Update last access time
        last_access_time[memory_address] = idx

        # Get reuse distance for accessed address
        reuse_distance = next_access_time_mapping.get(idx, 'not accessed again')
        accessed_address_reuse_distance.append(reuse_distance)

        evicted_address = None
        evicted_reuse_distance = None
        miss_type = None

        if evict_status == 'Cache Miss':
            # Determine miss type
            num_cache_lines = len(current_cache_lines) if isinstance(current_cache_lines, list) else 0
            if num_cache_lines >= max_cache_set_size:
                miss_type = 'Capacity Miss'
            else:
                miss_type = 'Conflict Miss'

            cache_set_indices = cache_set_access_indices.get(cache_set_id, [])
            current_idx_position = cache_set_indices.index(idx)
            if current_idx_position + 1 < len(cache_set_indices):
                next_idx = cache_set_indices[current_idx_position + 1]
                next_cache_lines = df.at[next_idx, 'current_cache_line_addresses']
                evicted_candidates = set(current_cache_lines) - set(next_cache_lines) if next_cache_lines else set()
                if evicted_candidates:
                    evicted_address = evicted_candidates.pop()
            else:
                evicted_address = None

            # Evicted address reuse distance
            if evicted_address:
                future_accesses = address_access_indices.get(evicted_address, [])
                future_indices = [i for i in future_accesses if i > idx]
                if future_indices:
                    evicted_reuse_distance = future_indices[0] - idx
                else:
                    evicted_reuse_distance = 'not accessed again'
        else:
            miss_type = None

        evicted_addresses.append(evicted_address)
        evicted_address_reuse_distance.append(evicted_reuse_distance)
        miss_types.append(miss_type)

    # Add new columns
    df['evicted_address'] = evicted_addresses
    df['accessed_address_recency'] = accessed_address_recency
    df['accessed_address_reuse_distance'] = accessed_address_reuse_distance
    df['evicted_address_reuse_distance'] = evicted_address_reuse_distance
    df['miss_type'] = miss_types

    # Parse disassembly and track functions
    disassembly_dict = {}
    current_function = None
    prefix = None

    function_header_regex = re.compile(r'^([0-9a-fA-F]{16}) <([^>]+)>:$')
    address_line_regex = re.compile(r'^\s*([0-9a-fA-F]{6}):')

    with open(disassembly_file, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            func_match = function_header_regex.match(line)
            if func_match:
                current_function = func_match.group(2)
                continue
            addr_match = address_line_regex.match(line)
            if addr_match:
                address_hex = addr_match.group(1)
                prefix = address_hex[:5]
                if prefix not in disassembly_dict:
                    disassembly_dict[prefix] = {
                        "lines": [],
                        "function_name": current_function
                    }
                disassembly_dict[prefix]["lines"].append(line)
            else:
                if line.strip() and prefix and prefix in disassembly_dict:
                    disassembly_dict[prefix]["lines"].append(line)

    # Cache for function codes
    function_code_cache = {}

    # Add assembly_code, function_name, function_code columns
    assembly_codes = []
    function_names = []
    function_codes = []

    for pc in df['program_counter']:
        pfx = get_prefix(pc)
        entry = disassembly_dict.get(pfx, {"lines": [], "function_name": None})
        assembly_code = "\n".join(entry["lines"])
        func_name = entry["function_name"]

        # Retrieve function code if we have a function name
        func_code = None
        if func_name:
            if func_name in function_code_cache:
                func_code = function_code_cache[func_name]
            else:
                func_code = find_function_code(func_name, source_directory)
                function_code_cache[func_name] = func_code

        assembly_codes.append(assembly_code)
        function_names.append(func_name)
        function_codes.append(func_code)

    df['assembly_code'] = assembly_codes
    df['function_name'] = function_names
    df['function_code'] = function_codes

    # Reorder columns for better readability (optional)
    fixed_columns = [
        'program_counter', 'memory_address', 'cache_set_id', 'evict',
        'miss_type', 'evicted_address', 'accessed_address_recency',
        'accessed_address_reuse_distance', 'evicted_address_reuse_distance',
        'function_name', 'function_code', 'assembly_code'
    ]
    remaining_cols = [c for c in df.columns if c not in fixed_columns]
    df = df[fixed_columns + remaining_cols]

    return df


## Function to Load and Process Multiple Traces

In [2]:
def process_and_generate_metadata(file_paths, cache_set_sizes, policy_descriptions, workload_descriptions, disassembly_files, source_files):
    """
    Processes multiple cache trace files and generates metadata and descriptions for each.
    
    Args:
    - file_paths (list of str): List of file paths to the cache trace files.
    - cache_set_sizes (list of int): List of max cache set sizes for each file path.
    - policy_descriptions (list of str): List of replacement policy descriptions for each file.
    - workload_descriptions (list of str): List of workload descriptions for each file.
    
    Returns:
    - dict: A dictionary where each key is a file name (without extension) and each value is a dictionary
            containing the DataFrame, metadata, and combined description for each file.
    """
    processed_data = {}

    for file_path, max_cache_set_size, policy_desc, workload_desc, disassembly, source in zip(file_paths, cache_set_sizes, policy_descriptions, workload_descriptions, disassembly_files, source_files):
        # Extract the base name of the file without extension for a meaningful dictionary key
        file_name = file_path.split('/')[-1].split('.')[0]
        
        # Process the file to create the DataFrame
        df_modified = load_and_modify_data(file_path, max_cache_set_size, disassembly, source)
        
        # Generate the metadata
        metadata_text = generate_metadata(df_modified)
        
        # Combine the policy and workload descriptions
        combined_description = f"Replacement Policy: {policy_desc}\nWorkload: {workload_desc}"
        
        # Store the processed DataFrame, metadata, and description in the dictionary
        processed_data[file_name] = {
            "data_frame": df_modified,
            "metadata": metadata_text,
            "description": combined_description
        }

    return processed_data

def generate_metadata(df_modified):
    """
    Generates metadata text from the processed DataFrame.

    Args:
    - df_modified (DataFrame): The DataFrame for which metadata needs to be generated.

    Returns:
    - str: Metadata text summarizing cache performance.
    """
    # 1. Overall Cache Miss Rate
    total_accesses = len(df_modified)
    total_misses = df_modified['evict'].value_counts().get('Cache Miss', 0)
    miss_rate = (total_misses / total_accesses) * 100

    # 2. Miss Type Distribution (Capacity vs. Conflict Misses)
    misses_df = df_modified[df_modified['evict'] == 'Cache Miss']
    miss_type_counts = misses_df['miss_type'].value_counts()
    capacity_misses = miss_type_counts.get('Capacity Miss', 0)
    conflict_misses = miss_type_counts.get('Conflict Miss', 0)
    total_misses_count = capacity_misses + conflict_misses
    capacity_miss_percentage = (capacity_misses / total_misses_count) * 100 if total_misses_count > 0 else 0
    conflict_miss_percentage = (conflict_misses / total_misses_count) * 100 if total_misses_count > 0 else 0

    # 3. Eviction Efficiency Metrics
    reuse_distance_threshold = 1000  # Define threshold
    max_reuse_distance = df_modified[['accessed_address_reuse_distance', 'evicted_address_reuse_distance']].replace('not accessed again', np.nan).dropna().astype(float).max().max()
    max_reuse_distance = max_reuse_distance + 1000 if not np.isnan(max_reuse_distance) else reuse_distance_threshold + 1000

    df_modified['evicted_address_reuse_distance_numeric'] = df_modified['evicted_address_reuse_distance'].replace('not accessed again', max_reuse_distance).fillna(max_reuse_distance).astype(float)
    df_modified['accessed_address_reuse_distance_numeric'] = df_modified['accessed_address_reuse_distance'].replace('not accessed again', max_reuse_distance).fillna(max_reuse_distance).astype(float)

    total_evictions = df_modified['evicted_address'].notna().sum()
    long_reuse_evictions = df_modified[df_modified['evicted_address_reuse_distance_numeric'] > reuse_distance_threshold]
    percentage_long_reuse_evictions = (len(long_reuse_evictions) / total_evictions) * 100 if total_evictions > 0 else 0

    # 4. Correlation Between Recency and Cache Misses
    df_modified['is_miss'] = df_modified['evict'].apply(lambda x: 1 if x == 'Cache Miss' else 0)
    df_modified['accessed_address_recency_numeric'] = df_modified['accessed_address_recency'].replace('First Access', np.nan).astype(float)
    valid_indices = df_modified['accessed_address_recency_numeric'].dropna().index
    recency_data = df_modified.loc[valid_indices, 'accessed_address_recency_numeric']
    is_miss_data = df_modified.loc[valid_indices, 'is_miss']
    correlation = recency_data.corr(is_miss_data) if not recency_data.empty else np.nan

    # 5. Evictions where the evicted address has a lower reuse distance than the accessed address
    evictions_df = df_modified[df_modified['evicted_address'].notna()].copy()
    evictions_with_lower_reuse = evictions_df[evictions_df['evicted_address_reuse_distance_numeric'] < evictions_df['accessed_address_reuse_distance_numeric']]
    num_evictions_with_lower_reuse = len(evictions_with_lower_reuse)
    percentage_evictions_with_lower_reuse = (num_evictions_with_lower_reuse / total_evictions) * 100 if total_evictions > 0 else 0

    # Prepare the metadata text
    metadata_text = f"""
    Cache Performance Summary: {total_accesses} total accesses, {total_misses} total misses, {miss_rate:.2f}% miss rate,
    {capacity_miss_percentage:.2f}% capacity misses, {conflict_miss_percentage:.2f}% conflict misses, {total_evictions} total evictions, 
    {num_evictions_with_lower_reuse} ({percentage_evictions_with_lower_reuse:.2f}%) wrong evictions where evicted line has lower reuse distance. 
    The correlation between accessed address recency and cache misses is {correlation:.2f}.
    """
    return metadata_text

## Set arguments and create dataset

In [5]:
#file_paths = [
#    './eviction_traces/bzip_evictions_mlp.txt',
#    './eviction_traces/bzip_evictions_parrot.txt',
#    './eviction_traces/bzip_evictions_lru.txt',
#    './eviction_traces/astar_evictions_lru.txt',
#    './eviction_traces/ibm_evictions_mlp.txt'
#]
file_paths = [
    './evictions/astar_evictions_lru.txt',
    './evictions/astar_evictions_belady.txt',
    './evictions/astar_evictions_mlp.txt',
    './evictions/astar_evictions_parrot.txt',
    './evictions/lbm_evictions_lru.txt',
    './evictions/lbm_evictions_belady.txt',
    './evictions/lbm_evictions_mlp.txt',
    './evictions/lbm_evictions_parrot.txt',
    './evictions/mcf_evictions_lru.txt',
    './evictions/mcf_evictions_belady.txt',
    './evictions/mcf_evictions_mlp.txt',
    './evictions/mcf_evictions_parrot.txt'
#    './evictions/gems_evictions_parrot.txt',
#    './evictions/lbm_evictions_parrot.txt',
#    './evictions/leslie3d_evictions_parrot.txt',
#    './evictions/milc_evictions_parrot.txt',
]

#cache_set_sizes = [16, 16, 16, 16, 16, 16, 16, 16, 16]  # Associativity for each trace #Why 9?
cache_set_sizes = [16, 16, 16, 16, 16, 16, 16, 16, 16]  # Associativity for each trace

#policy_descriptions = [
#    "MLP: learned policy using a simple multi-layer perceptron to approximate belady's optimal policy",
#    "PARROT: learned policy using attention to approximate belady's optimal policy",
#    "LRU: evict the least recently used cache line",
#    "LRU: evict the least recently used cache line",
#    "MLP: learned policy using a simple multi-layer perceptron to approximate belady's optimal policy"
#    ""
#]
policy_descriptions = [
    "LRU: evict the least recently used cache line",
    "Belady: optimal replacement policy. Evict the cache line with highest reuse distance",
    "MLP: learned policy using a simple multi-layer perceptron to approximate belady's optimal policy",
    "PARROT: learned policy using attention to approximate belady's optimal policy",
    "LRU: evict the least recently used cache line",
    "Belady: optimal replacement policy. Evict the cache line with highest reuse distance",
    "MLP: learned policy using a simple multi-layer perceptron to approximate belady's optimal policy",
    "PARROT: learned policy using attention to approximate belady's optimal policy",
    "LRU: evict the least recently used cache line",
    "Belady: optimal replacement policy. Evict the cache line with highest reuse distance",
    "MLP: learned policy using a simple multi-layer perceptron to approximate belady's optimal policy",
    "PARROT: learned policy using attention to approximate belady's optimal policy"
#    "PARROT: learned policy using attention to approximate belady's optimal policy",
#    """
#    "PARROT: learned policy using attention to approximate belady's optimal policy",
#    "PARROT: learned policy using attention to approximate belady's optimal policy",
#    "PARROT: learned policy using attention to approximate belady's optimal policy",
#    """
    ""
]
    
#workload_descriptions = [
#    "The bzip workload has six parts: two JPEGs, a binary, a tar file with source code, an HTML file, and an archive with both highly and minimally compressible files. Each is compressed and decompressed at three blocking factors, with the decompressed output compared to the original.",
#    "The bzip workload has six parts: two JPEGs, a binary, a tar file with source code, an HTML file, and an archive with both highly and minimally compressible files. Each is compressed and decompressed at three blocking factors, with the decompressed output compared to the original.",
#    "The bzip workload has six parts: two JPEGs, a binary, a tar file with source code, an HTML file, and an archive with both highly and minimally compressible files. Each is compressed and decompressed at three blocking factors, with the decompressed output compared to the original.",
#    "astar is a 2D path-finding library used in game AI. It includes three A* algorithms: one for passable/non-passable maps, another modified for varied terrain and move speed, and a third for graph-based maps with regional relationships. It also includes functions for determining map regions.",
#    "The lbm workload simulates fluid dynamics and involves large, regular memory accesses, creating high memory bandwidth demands. This pattern results in frequent cache misses and high cache eviction rates, stressing the cache hierarchy as it continually loads new data. Consequently, lbm heavily tests a cache’s ability to manage large data sets with low temporal locality, making it an effective benchmark for evaluating cache replacement policies."    
#]
workload_descriptions = [
    "astar is derived from a portable 2D path-finding library that is used in game's AI. This library implements three different path-finding algorithms: First is the well known A* algorithm for maps with passable and non-passable terrain types. Second is a modification of the A* path finding algorithm for maps with different terrain types and different move speed. Third is an implementation of A* algorithm for graphs. This is formed by map regions with neighborhood relationship. The library also includes pseudo-intellectual functions for map region determination.",
    "astar is derived from a portable 2D path-finding library that is used in game's AI. This library implements three different path-finding algorithms: First is the well known A* algorithm for maps with passable and non-passable terrain types. Second is a modification of the A* path finding algorithm for maps with different terrain types and different move speed. Third is an implementation of A* algorithm for graphs. This is formed by map regions with neighborhood relationship. The library also includes pseudo-intellectual functions for map region determination.",
    "astar is derived from a portable 2D path-finding library that is used in game's AI. This library implements three different path-finding algorithms: First is the well known A* algorithm for maps with passable and non-passable terrain types. Second is a modification of the A* path finding algorithm for maps with different terrain types and different move speed. Third is an implementation of A* algorithm for graphs. This is formed by map regions with neighborhood relationship. The library also includes pseudo-intellectual functions for map region determination.",
    "astar is derived from a portable 2D path-finding library that is used in game's AI. This library implements three different path-finding algorithms: First is the well known A* algorithm for maps with passable and non-passable terrain types. Second is a modification of the A* path finding algorithm for maps with different terrain types and different move speed. Third is an implementation of A* algorithm for graphs. This is formed by map regions with neighborhood relationship. The library also includes pseudo-intellectual functions for map region determination.",
    "lbm workload implements the so-called Lattice Boltzmann Method (LBM) to simulate incompressible fluids in 3D. It is the computationally most important part of a larger code which is used in the field of material science to simulate the behavior of fluids with free surfaces, in particluar the formation and movement of gas bubbles in metal foams. For benchmarking purposes and easy optimization for different architectures, the code makes extensive use of macros which hide the details of the data access.",
    "lbm workload implements the so-called Lattice Boltzmann Method (LBM) to simulate incompressible fluids in 3D. It is the computationally most important part of a larger code which is used in the field of material science to simulate the behavior of fluids with free surfaces, in particluar the formation and movement of gas bubbles in metal foams. For benchmarking purposes and easy optimization for different architectures, the code makes extensive use of macros which hide the details of the data access.",
    "lbm workload implements the so-called Lattice Boltzmann Method (LBM) to simulate incompressible fluids in 3D. It is the computationally most important part of a larger code which is used in the field of material science to simulate the behavior of fluids with free surfaces, in particluar the formation and movement of gas bubbles in metal foams. For benchmarking purposes and easy optimization for different architectures, the code makes extensive use of macros which hide the details of the data access.",
    "lbm workload implements the so-called Lattice Boltzmann Method (LBM) to simulate incompressible fluids in 3D. It is the computationally most important part of a larger code which is used in the field of material science to simulate the behavior of fluids with free surfaces, in particluar the formation and movement of gas bubbles in metal foams. For benchmarking purposes and easy optimization for different architectures, the code makes extensive use of macros which hide the details of the data access.",
    "mcf is a C-based benchmark derived from MCF, a program for single-depot vehicle scheduling in public transportation. It uses integer arithmetic to solve large-scale minimum-cost flow problems with a network simplex algorithm. The goal is to minimize fleet size and operational costs by scheduling timetabled trips efficiently. The benchmark employs MCF Version 1.2, integrating it with column generation to accelerate the solution process, relying heavily on pointer and integer arithmetic.",
    "mcf is a C-based benchmark derived from MCF, a program for single-depot vehicle scheduling in public transportation. It uses integer arithmetic to solve large-scale minimum-cost flow problems with a network simplex algorithm. The goal is to minimize fleet size and operational costs by scheduling timetabled trips efficiently. The benchmark employs MCF Version 1.2, integrating it with column generation to accelerate the solution process, relying heavily on pointer and integer arithmetic.",
    "mcf is a C-based benchmark derived from MCF, a program for single-depot vehicle scheduling in public transportation. It uses integer arithmetic to solve large-scale minimum-cost flow problems with a network simplex algorithm. The goal is to minimize fleet size and operational costs by scheduling timetabled trips efficiently. The benchmark employs MCF Version 1.2, integrating it with column generation to accelerate the solution process, relying heavily on pointer and integer arithmetic.",
    "mcf is a C-based benchmark derived from MCF, a program for single-depot vehicle scheduling in public transportation. It uses integer arithmetic to solve large-scale minimum-cost flow problems with a network simplex algorithm. The goal is to minimize fleet size and operational costs by scheduling timetabled trips efficiently. The benchmark employs MCF Version 1.2, integrating it with column generation to accelerate the solution process, relying heavily on pointer and integer arithmetic."

#    "GemsFDTD is a 3D FDTD solver for Maxwell’s equations, used to compute the radar cross section (RCS) of a conducting object. It follows three steps: initialization, timestepping (99% of execution time), and post-processing. It employs the Yee scheme on a staggered grid, Huygens' surfaces for wave generation, and Uni-axial Perfectly Matched Layer (UPML) for boundary absorption. The RCS is computed using near-to-far-field transformation with Fast Fourier Transforms (FFT).",
#    """
#    "lbm workload implements the so-called Lattice Boltzmann Method (LBM) to simulate incompressible fluids in 3D. It is the computationally most important part of a larger code which is used in the field of material science to simulate the behavior of fluids with free surfaces, in particluar the formation and movement of gas bubbles in metal foams. For benchmarking purposes and easy optimization for different architectures, the code makes extensive use of macros which hide the details of the data access.",
#    "leslie3d benchmark is a Computational Fluid Dynamics solver based on LESlie3d, used to study turbulence phenomena like mixing and combustion. For CPU2006, it solves a temporal mixing layer problem, a key benchmark for turbulent mixing physics. It employs a finite-volume algorithm with MacCormack Predictor-Corrector time integration, achieving fourth-order spatial and second-order temporal accuracy. The benchmark version, 437.leslie3d, minimizes file I/O to focus on CPU and memory performance.",
#    "MILC is a set of C-based codes developed by the MIMD Lattice Computation (MILC) collaboration for simulating SU(3) lattice gauge theory on parallel machines. It is widely used in DOE and NSF supercomputer centers. The SPEC CPU2006 benchmark 433.milc runs the serial version of the su3imp program, essential for optimizing parallel performance. It generates gauge fields for lattice gauge theory, enabling the study of quarks and gluons in quantum field theory."
#    """
]

#disassembly_files = [
#    "./workload_source/bzip2_src/bzip3_disassembly.txt",
#    "./workload_source/bzip2_src/bzip3_disassembly.txt",
#    "./workload_source/bzip2_src/bzip3_disassembly.txt",
#    "./workload_source/astar_src/astar_disassembly.txt",
#    "./workload_source/lbm_src/lbm_disassembly.txt"
#]
disassembly_files = [
    "./workload_disassembly/astar.txt",
    "./workload_disassembly/astar.txt",
    "./workload_disassembly/astar.txt",
    "./workload_disassembly/astar.txt",
    "./workload_disassembly/lbm.txt",
    "./workload_disassembly/lbm.txt",
    "./workload_disassembly/lbm.txt",
    "./workload_disassembly/lbm.txt",
    "./workload_disassembly/mcf.txt",
    "./workload_disassembly/mcf.txt",
    "./workload_disassembly/mcf.txt",
    "./workload_disassembly/mcf.txt"
#    """
#    "./workload_disassembly/lbm/lbm.txt",
#    "./workload_disassembly/leslie3d/leslie3d.txt",
#    "./workload_disassembly/milc/milc.txt"
#    """
]

#source_files = [
#    "./workload_source/bzip2_src",
#    "./workload_source/bzip2_src",
#    "./workload_source/bzip2_src",
#    "./workload_source/astar_src",
#    "./workload_source/lbm_src"
#]
source_files = [
    "./workload_source/astar",
    "./workload_source/astar",
    "./workload_source/astar",
    "./workload_source/astar",
    "./workload_source/lbm",
    "./workload_source/lbm",
    "./workload_source/lbm",
    "./workload_source/lbm",
    "./workload_source/mcf",
    "./workload_source/mcf",
    "./workload_source/mcf",
    "./workload_source/mcf"
#    """
#    "./workload_source/lbm",
#    "./workload_source/leslie3d",
#    "./workload_source/milc"
#    """
]

# Process all files and store the results
processed_data = process_and_generate_metadata(file_paths, cache_set_sizes, policy_descriptions, workload_descriptions, disassembly_files, source_files)

## Save and Load Data

In [6]:
import pickle

# Save processed_data to a file
with open('processed_data.pkl', 'wb') as f:
    pickle.dump(processed_data, f)


In [7]:
import pickle

# Load processed_data from a file
with open('processed_data.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# loaded_data now contains the same structure and data as processed_data

## Verification

In [8]:
# Sample some rows

#Adjust display options to avoid truncation
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [9]:
# Sample some metadata
#loaded_data['bzip_evictions_mlp']['metadata']
loaded_data['astar_evictions_parrot']['metadata']

'\n    Cache Performance Summary: 140704 total accesses, 92843 total misses, 65.98% miss rate,\n    100.00% capacity misses, 0.00% conflict misses, 92802 total evictions, \n    57057 (61.48%) wrong evictions where evicted line has lower reuse distance. \n    The correlation between accessed address recency and cache misses is 0.34.\n    '

## Make Finetuning Dataset - Only if finetuning LLaMa

In [10]:
import json

def generate_llm_training_data_from_dataframe(processed_data):
    """
    Generates training data for an LLM from a unified DataFrame that includes metadata and descriptions.
    
    Args:
    - processed_data (dict): Dictionary where each key is a file name and each value includes the DataFrame, metadata, and description.

    Returns:
    - list: A list of dictionaries with 'prompt' and 'completion' pairs for LLM training.
    """
    training_data = []
    
    for file_name, file_info in processed_data.items():
        df = file_info["data_frame"]
        policy_description = file_info["description"].split("\n")[0]  # Replacement policy description
        workload_description = file_info["description"].split("\n")[1]  # Workload description
        
        for _, row in df.iterrows():
            # Create a prompt based on the requested fields
            prompt = (
                f"{policy_description}\n"
                f"{workload_description}\n"
                f"Program Counter: {row['program_counter']}\n"
                f"Memory Address: {row['memory_address']}\n"
                f"Cache State: {row['current_cache_line_addresses']}\n"
                f"Access History: {row['recent_access_history']}\n"
                f"Assembly Code: {row['assembly_code']}\n"
                f"Source Function: {row['function_name']}\n"
            )

            # Create the expected completion based on the requested fields
            completion = (
                f"Evict: {row['evict']}\n"
                f"Miss Type: {row['miss_type'] if pd.notna(row['miss_type']) else 'N/A'}\n"
                f"Evicted Address: {row['evicted_address'] if pd.notna(row['evicted_address']) else 'N/A'}\n"
                f"Cache Line Scores: {row['cache_line_eviction_scores']}\n"
            )

            # Append the prompt and completion as a dictionary
            training_data.append({
                "prompt": prompt,
                "completion": completion
            })

    return training_data

# Example usage
llm_training_data = generate_llm_training_data_from_dataframe(loaded_data)

# Optionally, save to a JSON file for training
with open('llm_training_data_with_program_semantics.json', 'w') as f:
    json.dump(llm_training_data, f, indent=2)