Generate random RocksDB configuration samples and record their measured benchmark performance for various workloads (training data for Random Forest modelling and feature selection).

In [1]:
import sys
import os
import csv
import random
import re
import numpy as np
import pandas as pd

In [2]:

def read_metric(file_path = './db_bench_output.txt'):
    """Extract throughput and latency from a saved benchmark output file.

    The file is searched for the first occurrence of text shaped like
    ``<float> micros/op <int> ops/sec``.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A tuple ``(ops_per_sec, micros_per_op)`` where ``ops_per_sec`` is an
        ``int`` and ``micros_per_op`` is a ``float``.

    Raises:
        KeyError: If the file contains no ``micros/op ... ops/sec`` text.
            The exception type is kept as ``KeyError`` so existing callers
            that already handle it keep working; the message now names the
            file and the missing pattern.
        OSError: If the file cannot be opened.
    """
    micros_op_pattern = re.compile(r"(\d+\.\d+) micros/op (\d+) ops/sec")

    with open(file_path, 'r') as file:
        file_content = file.read()

    micros_op_match = micros_op_pattern.search(file_content)
    if micros_op_match is None:
        # Fail with an explanation instead of an opaque missing-key error.
        raise KeyError(
            f"no '<float> micros/op <int> ops/sec' line found in {file_path!r}"
        )

    micros_per_op = float(micros_op_match.group(1))
    ops_per_sec = int(micros_op_match.group(2))
    return ops_per_sec, micros_per_op


In [3]:
# Inclusive [low, high] sampling bounds for each tunable option.
# Key order matters: it determines the column order of the generated CSV.
parameter_ranges = {
    'max_background_compactions': [1, 8],
    'max_background_flushes': [1, 8],
    'write_buffer_size': [4 << 20, 1 << 30],            # 4 MiB .. 1 GiB
    'max_write_buffer_number': [2, 8],
    'min_write_buffer_number_to_merge': [1, 5],
    'max_bytes_for_level_multiplier': [2, 16],
    'block_size': [4 << 10, 128 << 10],                 # 4 KiB .. 128 KiB
    'level0_file_num_compaction_trigger': [2, 16],
    'level0_slowdown_writes_trigger': [2, 32],
    'level0_stop_writes_trigger': [2, 32],
    'target_file_size_multiplier': [1, 8],
    'target_file_size_base': [32 << 20, 128 << 20],     # 32 MiB .. 128 MiB
}

In [4]:

# Add the parent directory (resolved against the current working directory)
# to sys.path. The membership check keeps the cell idempotent: re-running it
# does not pile up duplicate entries.
if os.path.abspath('../') not in sys.path:
    sys.path.append(os.path.abspath('../'))

# Now you can import the module
# NOTE(review): presumably rocksdb_module lives in that parent directory — confirm.
import rocksdb_module

In [5]:
# Function to generate random samples within the given ranges and collect performance metrics
def generate_random_samples(n_samples=1500, parameter_ranges=parameter_ranges, csv_file='./rocksdb_performance_data.csv',
                            wltype="fillseq", num_operations=100000, db_path="", seed=None):
    """Sample random option values, run a workload for each, and log results to CSV.

    For every iteration one value per parameter is drawn from its
    ``[low, high]`` range, the values are passed (as strings) to
    ``rocksdb_module.run_workload``, the resulting metrics are read with
    ``read_metric()``, and one row is appended to ``csv_file``. The file is
    reopened and appended to on every iteration, so rows already written
    survive an interruption of a long run.

    Args:
        n_samples: Number of random configurations to run.
        parameter_ranges: Mapping of parameter name to ``[low, high]``.
            Integer bounds are sampled as integers with ``high`` inclusive;
            any other bounds are sampled uniformly as floats.
        csv_file: Path of the CSV file to append to. A header row is written
            first if the file does not exist or is empty.
        wltype: Workload name forwarded to ``rocksdb_module.run_workload``.
        num_operations: Operation count forwarded to ``run_workload``.
        db_path: Database path forwarded to ``run_workload``.
        seed: Optional seed for reproducible sampling. With ``None`` (the
            default) the global ``np.random`` functions are used, exactly as
            before this parameter existed.

    Returns:
        None. Results are written to ``csv_file`` and progress is printed.
    """
    # One column list shared by the header and every row keeps them aligned.
    fieldnames = list(parameter_ranges.keys()) + ['ops_per_sec', 'micros_per_op']

    # Write the header when the file is missing *or* present but empty;
    # checking existence alone would leave an empty file without a header.
    if not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0:
        with open(csv_file, mode='w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()

    # A private RandomState is used only when a seed is requested, so the
    # global NumPy random state is never reseeded behind the caller's back.
    sampler = np.random if seed is None else np.random.RandomState(seed)

    for _ in range(n_samples):
        samples = {}
        for param, (low, high) in parameter_ranges.items():
            if isinstance(low, int) and isinstance(high, int):
                # randint excludes the upper bound, hence high + 1.
                samples[param] = sampler.randint(low, high + 1)
            else:
                samples[param] = sampler.uniform(low, high)

        # Load the workload on the rocksdb_module; values are passed as strings.
        rec_temp = {key: str(value) for key, value in samples.items()}
        other_params = {}
        rocksdb_module.run_workload(wltype, num_operations, db_path, rec_temp, other_params)

        # Read the operations and microseconds from the metric function.
        # NOTE(review): read_metric() parses its default output file; presumably
        # run_workload rewrites that file on every run — confirm, otherwise a
        # failed run would silently reuse the previous run's metrics.
        ops, micro = read_metric()
        print(f"ops: {ops}, micro: {micro}")

        # Add the performance metrics to the sample
        samples['ops_per_sec'] = ops
        samples['micros_per_op'] = micro

        # Append the sample to the CSV file
        with open(csv_file, mode='a', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writerow(samples)

    print(f"Data generation complete. The dataset is appended to '{csv_file}'.")

# Generate the dataset and keep appending data to the CSV file.
# With the default arguments this runs 1500 iterations; each one calls
# rocksdb_module.run_workload once and appends one row to the CSV.
generate_random_samples()

RocksDB:    version 8.10.0
Date:       Wed May 29 18:17:26 2024
CPU:        12 * Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
CPUCache:   12288 KB
RocksDB:    version 8.10.0                           
Date:       Wed May 29 18:17:36 2024


run_workload
recommendations
cmd: /mnt/c/Users/edeep/Final_Rocksdb/rocksdb/db_bench --benchmarks=fillseq --num=100000 --compression_type=none --key_size=1024 --value_size=10240 --block_size=15833 --level0_file_num_compaction_trigger=15 --level0_slowdown_writes_trigger=11 --level0_stop_writes_trigger=21 --max_background_compactions=5 --max_background_flushes=3 --max_bytes_for_level_multiplier=15 --max_write_buffer_number=3 --min_write_buffer_number_to_merge=5 --target_file_size_base=61402544 --target_file_size_multiplier=4 --write_buffer_size=594934666100736
ops: 12000, micro: 83.205
run_workload
recommendations
cmd: /mnt/c/Users/edeep/Final_Rocksdb/rocksdb/db_bench --benchmarks=fillseq --num=100000 --compression_type=none --key_size=1024 --value_size=10240 --block_size=22040 --level0_file_num_compaction_trigger=16 --level0_slowdown_writes_trigger=4 --level0_stop_writes_trigger=18 --max_background_compactions=5 --max_background_flushes=1 --max_bytes_for_level_multiplier=8 --max_write_bu

CPU:        12 * Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
CPUCache:   12288 KB
... finished 100000 ops                              

ops: 11858, micro: 84.33
run_workload
recommendations
cmd: /mnt/c/Users/edeep/Final_Rocksdb/rocksdb/db_bench --benchmarks=fillseq --num=100000 --compression_type=none --key_size=1024 --value_size=10240 --block_size=109654 --level0_file_num_compaction_trigger=5 --level0_slowdown_writes_trigger=20 --level0_stop_writes_trigger=2 --max_background_compactions=5 --max_background_flushes=6 --max_bytes_for_level_multiplier=10 --max_write_buffer_number=7 --min_write_buffer_number_to_merge=2 --target_file_size_base=96899329 --target_file_size_multiplier=4 --write_buffer_size=905678295138304
ops: 11106, micro: 90.033
Data generation complete. The dataset is appended to './rocksdb_performance_data.csv'.


RocksDB:    version 8.10.0
Date:       Wed May 29 18:17:46 2024
CPU:        12 * Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
CPUCache:   12288 KB
... finished 100000 ops                              