In [1]:
import json

def save_json(file_path, data):
    """
    Serialize ``data`` as JSON and write it to ``file_path``.

    The file is opened in write mode, so existing content is replaced.
    The output is pretty-printed with a four-space indent.

    Args:
        file_path (str): Destination path of the JSON file.
        data (dict): JSON-serializable object to store.
    """
    with open(file_path, 'w') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

def load_json(file_path):
    """
    Load data from a JSON file.

    The file is read in binary mode so that the ``json`` module detects the
    encoding itself (UTF-8, UTF-16 or UTF-32, with or without a byte-order
    mark) instead of decoding with the platform's locale default, which can
    fail or mis-decode non-ASCII content.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        dict: Data loaded from the file (any JSON value, e.g. a list, is
            returned as its Python equivalent).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Binary mode: json.load accepts bytes and picks the right decoder.
    with open(file_path, 'rb') as f:
        return json.load(f)

In [2]:
def sort_list_of_dicts(data, key, reverse=False):
    """
    Return a new list holding the dictionaries of ``data`` ordered by ``key``.

    The input is left untouched; entries with equal values keep their
    relative order.

    Args:
        data (list): Dictionaries to order; each must contain ``key``.
        key (str): Dictionary key whose value is used for ordering.
        reverse (bool): True for descending order, False for ascending.

    Returns:
        list: A new, sorted list of the same dictionaries.
    """
    def field_of(record):
        return record[key]

    ordered = list(data)
    ordered.sort(key=field_of, reverse=reverse)
    return ordered

In [3]:
# Locations of the reference ("correct") JSON and the generated JSON to compare.
correct_json_path = "/data_ssd/mvtec_ad/mvtec-test_llava-onevision.json"
generated_json_path = "/home/omote/experiment_only_program/iam-lmms-finetune/eval_output/eval_llava-onevision_mvtec-ad_2025-05-26T20_43_52/eval_output.json"

correct_data = load_json(correct_json_path)
generated_data = load_json(generated_json_path)

# Both files must hold the same number of entries before they are paired up.
assert len(correct_data) == len(generated_data), "Length of correct and generated data does not match."

# Order both lists by "id" so entries at the same index can be compared.
correct_data = sort_list_of_dicts(correct_data, "id")
generated_data = sort_list_of_dicts(generated_data, "id")


In [4]:


total_data_num = len(correct_data)

# Class counts according to the reference data.
anomaly_data_num = 0
normal_data_num = 0

# Class counts according to the generated data.
model_predict_anomaly_data_num = 0
model_predict_normal_data_num = 0

# Agreement counts: overall, and restricted to anomalous entries.
matched_data_num = 0
anomaly_matched_data_num = 0

# An entry counts as "normal" when the value of its last conversation turn is
# exactly the string "None"; any other value counts as "anomaly".
for i in range(total_data_num):
    reference_entry = correct_data[i]
    generated_entry = generated_data[i]
    assert reference_entry["id"] == generated_entry["id"], f"ID mismatch at index {i}."

    reference_is_normal = reference_entry["conversations"][-1]["value"] == "None"
    if reference_is_normal:
        normal_data_num += 1
    else:
        anomaly_data_num += 1

    generated_is_normal = generated_entry["conversations"][-1]["value"] == "None"
    if generated_is_normal:
        model_predict_normal_data_num += 1
    else:
        model_predict_anomaly_data_num += 1

    # Detection check: both sides agree on normal, or both agree on anomaly.
    if reference_is_normal == generated_is_normal:
        matched_data_num += 1
        if not reference_is_normal:
            # Agreement on an anomalous entry.
            anomaly_matched_data_num += 1

In [5]:
# Report the dataset composition, the prediction split and the agreement
# counts, one value per line.
print(
    f"Total data number: {total_data_num}",
    f"Normal data number: {normal_data_num}",
    f"Anomaly data number: {anomaly_data_num}",
    f"Model predict normal data number: {model_predict_normal_data_num}",
    f"Model predict anomaly data number: {model_predict_anomaly_data_num}",
    f"Matched data number: {matched_data_num}",
    f"Anomaly matched data number: {anomaly_matched_data_num}",
    sep="\n",
)

Total data number: 1725
Normal data number: 467
Anomaly data number: 1258
Model predict normal data number: 1582
Model predict anomaly data number: 143
Matched data number: 610
Anomaly matched data number: 143


In [6]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Each metric falls back to 0.0 instead of raising ZeroDivisionError when its
# denominator is empty (no data, no predicted anomalies, no actual anomalies,
# or precision + recall == 0).
# Accuracy: share of all entries on which both sides agree.
accuracy = _safe_ratio(matched_data_num, total_data_num)
print(f"Accuracy: {accuracy}")
# Precision: share of predicted anomalies that are anomalies in the reference.
precision = _safe_ratio(anomaly_matched_data_num, model_predict_anomaly_data_num)
print(f"Precision: {precision}")
# Recall: share of reference anomalies that were predicted as anomalies.
recall = _safe_ratio(anomaly_matched_data_num, anomaly_data_num)
print(f"Recall: {recall}")
# F1: harmonic mean of precision and recall.
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
print(f"F1 Score: {f1_score}")

Accuracy: 0.3536231884057971
Precision: 1.0
Recall: 0.1136724960254372
F1 Score: 0.2041399000713776
