# Check Data

This is a set of quick functions to generate simple dummy datasets. These will be used to adjust and "check" the AI anomaly detection's reliability. Once generated, they will be tested in numerous validation runs.

In [2]:
# Preflight, load
from bigstick import LoadedModel as lm
import src.config as c
from string import ascii_uppercase as ABC
import json
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass

In [31]:
# Define outputs


@dataclass
class OutputReport:
    """Reporting
    Provide a JSON file and the subsequent information
    and this will build a report to output.
    """

    job_name: str
    write_dest: str = c.RESULTS_PATH
    script_start_time: datetime = datetime.now()
    heading: str = None

    def __post_init__(self) -> None:
        self.heading = {
            "job_name": self.job_name,
            "script_start_time": self.script_start_time.strftime("%Y-%m-%d %H:%M:%S"),
        }

        Path(self.write_dest).mkdir(parents=True, exist_ok=True)

    def write(self, data: dict, script_finish_time: datetime = datetime.now()) -> Path:
        """Write data report

        Returns:
            Path: a Path object of the written report
        """

        self.heading["script_finish_time"] = script_finish_time.strftime(
            "%Y-%m-%d %H:%M:%S"
        )
        report = {**self.heading, **{"results": data}}
        report_file = f"{self.write_dest}/{self.job_name}_{self.script_start_time.strftime('%Y%m%d-%H%M%S')}.json"

        with open(
            report_file,
            "w+",
        ) as f:
            json.dump(report, f)

        return Path(report_file)

## Small 1-dimensional Array

In [36]:
job_name = "small-1d-array"

In [37]:
sample = {}
for letter in ABC:
    sample[letter] = 0
    
sample['F'] = 1

_Let's check this a few times._

In [38]:
results = {}

output = OutputReport(job_name=job_name, script_start_time=datetime.now())

for i in range(0, c.TRIALS):
    resp = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        Find any anomalies in this data: {json.dumps(sample)}.
        Respond only with JSON containing the following keys and values:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
            
        """
    )
    result = json.loads(resp.model_dump_json())["text"].strip("\n")
    results[i] = json.loads(result)

output.write(data=results, script_finish_time=datetime.now())


PosixPath('results/small-1d-array_20240910-132903.json')

### Parse the results

In [None]:
import json

# trials = c.TRIALS
trials = 1000

results_1d_array_1000 = json.load(open(f"results/abc-1d-array_{trials}.json", "r"))

expected_result_data = {"F": 1}

exact_matches = 0
inexact_matches = 0
inexact_matches_correct_line = 0
non_matches = 0
non_match_records = {}

for i in results_1d_array_1000:
    if all(
        key in list(results_1d_array_1000[i].keys())
        for key in ["data", "explanation", "line"]
    ):
        if (
            results_1d_array_1000[i]["data"] == expected_result_data
            and results_1d_array_1000[i]["line"] == 6
        ):
            exact_matches += 1
        elif (
            results_1d_array_1000[i]["data"] == expected_result_data
            or "F" in results_1d_array_1000[i]["explanation"]
            or all(char in results_1d_array_1000[i]["data"] for char in ["F", ":", "1"])
        ):
            inexact_matches += 1

            if results_1d_array_1000[i]["line"] == 6:
                inexact_matches_correct_line += 1

        else:
            non_matches += 1
            non_match_records[i] = results_1d_array_1000[i]
    elif "anomalies" in list(results_1d_array_1000[i].keys()):
        for anom in results_1d_array_1000[i]["anomalies"]:
            if all(key in list(anom.keys()) for key in ["data", "explanation", "line"]):
                if anom["data"] == expected_result_data and anom["line"] == 6:
                    exact_matches += 1
                elif (
                    anom["data"] == expected_result_data
                    or "F" in anom["explanation"]
                    or all(char in anom["data"] for char in ["F", ":", "1"])
                ):
                    inexact_matches += 1
                else:
                    non_matches += 1
                    non_match_records[i] = results_1d_array_1000[i]
            else:
                non_matches += 1
                non_match_records[i] = results_1d_array_1000[i]
    else:
        non_matches += 1
        non_match_records[i] = results_1d_array_1000[i]


print(
    f"""
    {exact_matches=}
    {inexact_matches=}
    {inexact_matches_correct_line=}
    {non_matches=}
    """
)
print(json.dumps(non_match_records, indent=4))

## Large 1d-array

The small array is performant enough (around 3-5s per query) that I need to see when this performance tapers off.

In [33]:
job_name = "large-1d-array"

In [32]:
import pandas as pd
from string import ascii_uppercase as ABC
import random

data_size = 80192
chunk_size = 64
iterations = data_size // chunk_size
min = 10 ** (len(str(iterations)) - 1)

# Replace a random value
ran_replace = random.choice(range(iterations))
print(f"find the random replacement here: {ran_replace}")

result = []

for i in range(iterations):
    value = (
        "".join(random.choice(ABC) for _ in range(chunk_size))
        if i != ran_replace
        else "0000"
    )
    result.append(value)

container = pd.DataFrame(
    result, columns=["entry"], index=range(min, (len(result) + min))
)
size_of_df = container.memory_usage(index=True).sum()

print(f"Container size in memory: {size_of_df} bytes")

with open(f"llama-data/data/{job_name}.csv", "w+") as f:
    f.write(container.to_csv(index_label="index"))


find the random replacement here: 1196
Container size in memory: 10156 bytes


In [35]:
results = {}

input_data = open(f"{c.DATA_PATH}/{job_name}.csv", "r").read()
output = OutputReport(job_name=job_name, script_start_time=datetime.now())

for i in range(0, c.TRIALS):
    resp = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        Find any anomalies in this data: {input_data}.
        Respond only with JSON containing the following keys and values:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
            
        """
    )
    result = json.loads(resp.model_dump_json())["text"].strip("\n")
    results[i] = json.loads(result)

output.write(data=results, script_finish_time=datetime.now())

PosixPath('results/large-1d-array_20240910-132634.json')

## Second Check for 1d-array
The principle here is to force the LLM to reevaluate the data. This will take the original process and simply add another check to it.

In [7]:
results = {}

for i in range(0, c.TRIALS):
    resp_1 = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        Find any anomalies in this data: {json.dumps(sample)}.
        Respond only with JSON containing the following keys and values:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
            
        """
    )

    result = json.loads(resp_1.model_dump_json())["text"].strip("\n")
    results[i]["resp_1"] = json.loads(result)

    # Second query

    resp_2 = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        {json.dumps(result)}
        
        Reivew this data. It should be formatted as a JSON dataset that contains the following keys:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
        
        Ensure that the formatting is correct.
        Ensure that the values for the keys are correct by comparing to the original entry in this data:
        {json.dumps(sample)}
        """
    )
    
    result = json.loads(resp_2.json())["text"].strip("\n")
    results[i]["resp_2"] = json.loads(result)

with open(f"results/abc-1d-array_{c.TRIALS}.json", "w+") as f:
    json.dump(results, f)

KeyError: 0