# Check Data

This is a set of quick functions to generate simple dummy datasets. These will be used to adjust and "check" the AI anomaly detection's reliability. Once generated, they will be tested in numerous validation runs.

In [1]:
# Preflight, load
from bigstick import LoadedModel as lm
import src.config as c
from string import ascii_uppercase as ABC
import json


## Small 1-dimensional Array

In [2]:
sample = {}
for letter in ABC:
    sample[letter] = 0
    
sample['F'] = 1

_Let's check this a few times._

In [3]:
results = {}

for i in range(0, c.TRIALS):
    resp = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        Find any anomalies in this data: {json.dumps(sample)}.
        Respond only with JSON containing the following keys and values:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
            
        """
    )
    result = json.loads(resp.json())["text"].strip("\n")
    results[i] = json.loads(result)

with open(f"results/abc-1d-array_{c.TRIALS}.json", "w+") as f:
    json.dump(results, f)

### Parse the results

In [10]:
import json

# trials = c.TRIALS
trials = 1000

results_1d_array_1000 = json.load(open(f"results/abc-1d-array_{trials}.json", "r"))

expected_result_data = {"F": 1}

exact_matches = 0
inexact_matches = 0
inexact_matches_correct_line = 0
non_matches = 0
non_match_records = {}

for i in results_1d_array_1000:
    if all(
        key in list(results_1d_array_1000[i].keys())
        for key in ["data", "explanation", "line"]
    ):
        if (
            results_1d_array_1000[i]["data"] == expected_result_data
            and results_1d_array_1000[i]["line"] == 6
        ):
            exact_matches += 1
        elif (
            results_1d_array_1000[i]["data"] == expected_result_data
            or "F" in results_1d_array_1000[i]["explanation"]
            or all(char in results_1d_array_1000[i]["data"] for char in ["F", ":", "1"])
        ):
            inexact_matches += 1

            if results_1d_array_1000[i]["line"] == 6:
                inexact_matches_correct_line += 1

        else:
            non_matches += 1
            non_match_records[i] = results_1d_array_1000[i]
    elif "anomalies" in list(results_1d_array_1000[i].keys()):
        for anom in results_1d_array_1000[i]["anomalies"]:
            if all(key in list(anom.keys()) for key in ["data", "explanation", "line"]):
                if anom["data"] == expected_result_data and anom["line"] == 6:
                    exact_matches += 1
                elif (
                    anom["data"] == expected_result_data
                    or "F" in anom["explanation"]
                    or all(char in anom["data"] for char in ["F", ":", "1"])
                ):
                    inexact_matches += 1
                else:
                    non_matches += 1
                    non_match_records[i] = results_1d_array_1000[i]
            else:
                non_matches += 1
                non_match_records[i] = results_1d_array_1000[i]
    else:
        non_matches += 1
        non_match_records[i] = results_1d_array_1000[i]


print(
    f"""
    {exact_matches=}
    {inexact_matches=}
    {inexact_matches_correct_line=}
    {non_matches=}
    """
)
print(json.dumps(non_match_records, indent=4))


    exact_matches=473
    inexact_matches=524
    inexact_matches_correct_line=0
    non_matches=3
    
{
    "209": {
        "rank": 2,
        "line": 1,
        "data": {
            "A": 0,
            "B": 0,
            "C": 0,
            "D": 0,
            "E": 0,
            "F": 1,
            "G": 0,
            "H": 0,
            "I": 0,
            "J": 0,
            "K": 0,
            "L": 0,
            "M": 0,
            "N": 0,
            "O": 0,
            "P": 0,
            "Q": 0,
            "R": 0,
            "S": 0,
            "T": 0,
            "U": 0,
            "V": 0,
            "W": 0,
            "X": 0,
            "Y": 0,
            "Z": 0
        },
        "explanation": "Anomaly found due to the presence of a single non-zero value in an otherwise uniform distribution of zeros. The rank is set to 2 as it is not completely normal but also not extremely anomalous."
    },
    "655": {
        "anomalies": [
            {
                "r

## Large 1d-array

The small array is performant enough (around 3-5s per query) that I need to see when this performance tapers off.

In [6]:
import pandas as pd
from string import ascii_uppercase as ABC
import random

data_size = 8192
chunk_size = 64
iterations = data_size // chunk_size
min = 10**(len(str(iterations))-1)

# Replace a random value
ran_replace = random.choice(range(iterations))
print(f"find the random replacement here: {ran_replace}")

result = []

for i in range(iterations):
    value = "".join(random.choice(ABC) for _ in range(chunk_size)) if i != ran_replace else '*'
    result.append(value)

container = pd.DataFrame(result,columns=["entry"],index=range(min,(len(result)+min)))
size_of_df = container.memory_usage(index=True).sum()

print(f"Container size in memory: {size_of_df} bytes")
print(container.to_csv(index_label='index'))


find the random replacement here: 15
Container size in memory: 1156 bytes
index,entry
100,UDAMGGBHXEOMJUHXIEZYHKUOGECIYXJESCYWVIQFRTWGHSOZTPDASINHCXSLFOEG
101,HHHIZWASTRNOSHQIELRFZABZQJRKNMBKSYSOLJEHZVVMFYKLVJQBTIMFXTOFIWSR
102,QAQIXDXNHDBBPRJEOOZQSTCBVVVXJJBCHXDWVGORFXOSTOBBEPYRTYAGGCLEWLZI
103,EYRNYTMTWAUVEVBRLDKHSBVEEKZISVMTPLRKALYRAWRAMFNVSXVKSDMWKFLFTQQE
104,WYEUQJWUELJLYAVBVBVXXCGXTPNMESGDGPSVGNJRSPOWGKCPROQQXPPHVRBLKXWJ
105,TUONNQNENJDJYDYVKRUDBJYBCXVCEPNESGJWSBTJCHESITDSJGAWNRZTAVNYYKYU
106,ISOXNUTKOAOAJORRWCZXUIHVFYDIJUFQHMDXDKEHUTBSGDSLICWONUZCJLMOOFWK
107,SBVSVWFIASPJVXOVVDNUIGHHGNEGFUZFMVUROOVROVMOAGJIGFHDHBMMORQLBHOE
108,DOVZWWHBGLMVFNRKKNLXJYHSWHFQEVHYUBZPQTQBOVGVZDCDHRESVWKKRRVRPXHX
109,MZIGVEAPJRKJGUGGOSUUIPOXCXJIRWJHAAVSGYEMJDORKBZSATBWFJZVYBBOHKZB
110,WBXLKKUGQZIVIJSJEXYGYFEWLWDSSHVHYLQVHXODAHROMKUCCNMFJJBTFJQTKNFN
111,BGNXIKBUGVBKGMTKKBJDDISADUXMAOQHCLFICIMIJHMIFTAQHRCWLFZJNAEQMWBD
112,CJQVSROZINKLGWEINXZALZHSMFLWPGWRCWSWTHWXGVUBPMCWSXHPYGUDVIDSQDPA
113,EZBYYWNAFMLQK

In [7]:
results = {}

for i in range(0, c.TRIALS):
    resp = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        Find any anomalies in this data: {container.to_csv(index_label='index')}.
        Respond only with JSON containing the following keys and values:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
            
        """
    )
    result = json.loads(resp.json())["text"].strip("\n")
    results[i] = json.loads(result)

with open(f"results/abc-1d_{size_of_df}B-array_{c.TRIALS}.json", "w+") as f:
    json.dump(results, f)

## Second Check for 1d-array
The principle here is to force the LLM to reevaluate the data. This will take the original process and simply add another check to it.

In [7]:
results = {}

for i in range(0, c.TRIALS):
    resp_1 = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        Find any anomalies in this data: {json.dumps(sample)}.
        Respond only with JSON containing the following keys and values:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
            
        """
    )

    result = json.loads(resp_1.json())["text"].strip("\n")
    results[i]["resp_1"] = json.loads(result)

    # Second query

    resp_2 = lm(json_mode=True, base_url=f"http://{c.GPU_NODE}:11434").Simple(
        query=f"""
        {json.dumps(result)}
        
        Reivew this data. It should be formatted as a JSON dataset that contains the following keys:
            - "rank": <the rank you assigned to the anomaly>,
            - "line": <the line number of the data>,
            - "data": <the relevant data>,
            - "explanation: <the explanation for your choice>
        
        Ensure that the formatting is correct.
        Ensure that the values for the keys are correct by comparing to the original entry in this data:
        {json.dumps(sample)}
        """
    )
    
    result = json.loads(resp_2.json())["text"].strip("\n")
    results[i]["resp_2"] = json.loads(result)

with open(f"results/abc-1d-array_{c.TRIALS}.json", "w+") as f:
    json.dump(results, f)

KeyError: 0