## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import subprocess

%matplotlib inline

## Global Variables

In [2]:
TEST_PROGRAM_PATH = "../custom_ds/main"

## Functions

In [28]:
def run_command(command):
    sproc = subprocess.Popen(command.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, err = map(lambda byte: byte.decode('utf-8'), sproc.communicate())
    if err:
        raise Exception(err)
    return output.strip()

def compile_cpp(path, **kwargs):
    command = f"g++ {path}.cpp -o {path}.ignoreme"
    return run_command(command.strip())

def execute_cpp(path, **kwargs):
    command = f"./{path}.ignoreme {kwargs.get('file_path', '')} {kwargs.get('kmer_size', '')} {kwargs.get('hash_map_size', '')} {kwargs.get('fp_size', '')} {kwargs.get('use_buffer', '')}"
    return run_command(command.strip())

## Dataframe Helper Functions

In [61]:
def create_dataframe(columns, data = []):
    return pd.DataFrame(columns = columns, data = data)

def plot_graphic(test_name, dataframe, x_axis, y_axis):
    plt.title(test_name)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.plot(dataframe[x_axis], dataframe[y_axis], '-')
    plt.show()

## Test Parameters
We want to define which tests do we want to do (for example, test the k-mer size influence on this algorithm).
Let's define a test suite structure, defining the parameter we want to test. The key should be the test name, and the value is a dict with the parameters we want to pass to our test function:
```python3
test_suite = {
  <test_name>: {
      'test_file_path': str,
      'kmer_size': int,
      'hash_map_size': int,
      'fp_size': int,
  }
}
```

In [59]:
def run_test_kmer(test):
    df = create_dataframe(columns)
    for kmer_size in test['kmer_size']:
        out = execute_cpp(
            TEST_PROGRAM_PATH, 
            file_path = test["file_path"], 
            kmer_size = kmer_size, 
            hash_map_size = test["hash_map_size"],
            fp_size = test["fp_size"])
        df = df.append(create_dataframe(columns, [out.split(' ')]))
    return df

def run_test_hash_map_size(test):
    df = create_dataframe(columns)
    for hash_map_size in test['hash_map_size']:
        out = execute_cpp(
            TEST_PROGRAM_PATH, 
            file_path = test["file_path"], 
            kmer_size = test["kmer_size"], 
            hash_map_size = hash_map_size,
            fp_size = test["fp_size"])
        df = df.append(create_dataframe(columns, [out.split(' ')]))
    return df

def run_test_fp_size_hash_map_size(test):
    df = create_dataframe(columns)
    for hash_map_size in test['hash_map_size']:
        for fp_size in test['fp_size']:
            out = execute_cpp(
                TEST_PROGRAM_PATH, 
                file_path = test["file_path"], 
                kmer_size = test["kmer_size"], 
                hash_map_size = hash_map_size,
                fp_size = fp_size)
            df = df.append(create_dataframe(columns, [out.split(' ')]))
    return df

def run_tests(test):
    df = create_dataframe(columns)
    for hash_map_size in test['hash_map_size']:
        for fp_size in test['fp_size']:
            for use_buffer in test['use_buffer']:
                out = execute_cpp(
                    TEST_PROGRAM_PATH, 
                    file_path = test["file_path"], 
                    kmer_size = test["kmer_size"], 
                    hash_map_size = hash_map_size,
                    fp_size = fp_size,
                    use_buffer = use_buffer)
                test_str = f"hash_map_size {hash_map_size} fp_size {fp_size} use_buffer {use_buffer}"
                df = df.append(create_dataframe(columns, [[test_str] + out.split(' ')]))
    return df

In [63]:
test_suite = {
    "test_kmer_size": {
        "file_path": "../datasets/dna.5MB",
        "kmer_size": [7, 8, 9, 10, 11],
        "hash_map_size": 20,
        "fp_size": 3,
        'use_buffer': [0, 1],
    },
    "test_hash_map_size": {
        "file_path": "../datasets/dna.10MB",
        "kmer_size": 14,
        "hash_map_size": [
            23,
            24,
            25,  
            26,
            28,
            30,
        ],
        "fp_size": 3,
        'use_buffer': [0, 1],
    },
    "test_fp_size_hash_map_size": {
        "file_path": "../datasets/dna.10MB",
        "kmer_size": 14,
        "hash_map_size": [
            23,
            24,
            25,  
        ],
        'fp_size': [
            3,
            4,
            5
        ],
        'use_buffer': [0, 1],
    },
    "test_fp_size_hash_map_size_5": {
        "file_path": "../datasets/dna.5MB",
        "kmer_size": 14,
        "hash_map_size": [
            23,
            24,
        ],
        'fp_size': [
            3,
            4,
            5,
        ],
        'use_buffer': ["0", "1"],
    },
}

## Test

In [42]:
compile_cpp(TEST_PROGRAM_PATH)

Exception: In file included from ../custom_ds/main.cpp:9:
../custom_ds/Hash.cpp:11:15: warning: integer constant is so large that it is unsigned
   11 |     return (((11400714819323198485 * key)) >> (64 - nBits)) & ((1 << nBits) - 1);
      |               ^~~~~~~~~~~~~~~~~~~~
../custom_ds/main.cpp:40:15: warning: integer constant is so large that it is unsigned
   40 |     return (((11400714819323198485 * key)) >> (64 - SHIFT)) & ((1 << SHIFT) - 1);
      |               ^~~~~~~~~~~~~~~~~~~~


In [55]:
columns = [
    "test",
    "true_positives", 
    "true_negatives", 
    "false_positives", 
    "false_negatives", 
    "sensibility", 
    "specificity",
    "found_kmers",
]

In [None]:
test_fp_size_hash_map_size_5 = run_tests(test_suite["test_fp_size_hash_map_size_5"])

In [62]:
test_fp_size_hash_map_size_5

Unnamed: 0,test,true_positives,true_negatives,false_positives,false_negatives,sensibility,specificity,found_kmers
0,hash_map_size 23 fp_size 3 use_buffer 0,16384,0,0,0,1,1,16384
0,hash_map_size 23 fp_size 3 use_buffer 1,16384,0,0,0,1,1,16384
0,hash_map_size 23 fp_size 4 use_buffer 0,16384,0,0,0,1,1,16384
0,hash_map_size 23 fp_size 4 use_buffer 1,16384,0,0,0,1,1,16384
0,hash_map_size 23 fp_size 5 use_buffer 0,16384,0,0,0,1,1,16384
0,hash_map_size 23 fp_size 5 use_buffer 1,16384,0,0,0,1,1,16384
0,hash_map_size 24 fp_size 3 use_buffer 0,16384,0,0,0,1,1,16384
0,hash_map_size 24 fp_size 3 use_buffer 1,16384,0,0,0,1,1,16384
0,hash_map_size 24 fp_size 4 use_buffer 0,16384,0,0,0,1,1,16384
0,hash_map_size 24 fp_size 4 use_buffer 1,16384,0,0,0,1,1,16384


- calcular sens, spec sem considerar o buffer
- [ ]consultar arestas
- [x] hashear fingerprint
- reportar resultados semana que vem (começar a trabalhar no texto)

4194304