In [1]:
import os
from mhr.utils.utils import load_json_file,process_jsonl


def read_results_from_file(file_path):
    """Return the ``"results"`` entry of an evaluation log.

    Args:
        file_path: Path to a results file, or to a directory; a directory is
            resolved to the ``results.json`` file inside it.

    Returns:
        The value stored under the ``"results"`` key of whatever
        ``load_json_file`` returns for the resolved path.
    """
    # A run directory stands in for the results.json it contains.
    json_path = os.path.join(file_path, "results.json") if os.path.isdir(file_path) else file_path
    loaded = load_json_file(json_path)
    return loaded["results"]

# Root of the lmms-eval log tree. Set the LMMS_EVAL_LOG_ROOT environment variable to
# point this notebook at a different checkout/machine; the default is the original
# location, so the resolved paths are unchanged when the variable is not set.
LOG_ROOT = os.environ.get(
    "LMMS_EVAL_LOG_ROOT",
    "/mnt/petrelfs/songmingyang/code/tools/lmms-eval/scripts/logs/robustlmm",
)

# Earlier runs, kept for reference (these live under a different log root):
# baseline_file = "/mnt/petrelfs/songmingyang/code/tools/lmms-eval/scripts/logs/llava/robustlmm/dr_algo/0601_2205_eval_all_llava_baseline"
# ptnft0_file = "/mnt/petrelfs/songmingyang/code/tools/lmms-eval/scripts/logs/llava/robustlmm/dr_algo/0604_1109_eval_all_dr_algo_pt0_ft0"
# ptnft1_file = "/mnt/petrelfs/songmingyang/code/tools/lmms-eval/scripts/logs/llava/robustlmm/dr_algo/0603_1918_eval_all_dr_algo_pt0_ft1"
# ptnft2_file = "/mnt/petrelfs/songmingyang/code/tools/lmms-eval/scripts/logs/llava/robustlmm/dr_algo/0604_2258_eval_all_dr_algo_pt0_ft2"

# One run directory per model; read_results_from_file resolves each to its results.json.
share4v_base_file = os.path.join(LOG_ROOT, "sharegpt4v", "0723_0003_share4v_baseline_sharegpt4v_model_args_f37f0e")
baseline_file = os.path.join(LOG_ROOT, "dr_algo", "0605_2135_eval_all_llava_baseline_llava_model_args_ca10f3")
mplug_owl_chat_file = os.path.join(LOG_ROOT, "other_models", "0806_2242_mplug_owl_mplug_owl_model_args_2b40be")

baseline_data = read_results_from_file(baseline_file)
share4v_data = read_results_from_file(share4v_base_file)
mplug_owl_chat_data = read_results_from_file(mplug_owl_chat_file)

In [2]:

# Task-name lists. metric1_list and metric2_list are the column selections passed to
# get_results_to_list for the two tables printed further down.
metric_list = ["vqav2","gqa","vizwiz_vqa","scienceqa_img","pope","mme","seedbench","seedbench-2","refcoco","refcoco+","mmmu"]
metric1_list = ["vqav2","textvqa","ok_vqa","gqa","vizwiz_vqa","scienceqa","scienceqa_img","refcoco","refcoco+","flickr30k","pope","seedbench","mmmu"]
# Extended variant of metric2_list (inactive). It used to be assigned here and then
# immediately overwritten by the line below, so it never had any effect.
# metric2_list = ["pope","mme","seedbench","seedbench-2","mmbench_en_dev","mmbench_cn_dev","llava_bench_coco","llava_in_the_wild","mmmu"]
metric2_list = ["pope","mme","seedbench","seedbench-2","mmmu"]
# Task name -> list of metric keys to read from that task's results entry.
# A task with several keys (e.g. "mme") contributes one table column per key.
# NOTE(review): the keys are looked up verbatim in the loaded results, so spellings such
# as "mme_percetion_score,none" presumably mirror the results file — confirm before "fixing".
metric_dict = {
    "vqav2": ["exact_match,none"],
    "textvqa": ["exact_match,none"],
    "gqa":["exact_match,none"],
    "mme":["mme_percetion_score,none","mme_cognition_score,none"],
    "mmmu":["mmmu_acc,none"],
    "pope":["pope_accuracy,none"],
    "refcoco":["refcoco_CIDEr,none"],
    "refcoco+":["refcoco_CIDEr,none"],
    "scienceqa":["exact_match,none"],
    "scienceqa_img":["exact_match,none"],
    "seedbench":["seed_all,none"],
    "seedbench-2":["seed_all,none"],
    "vizwiz_vqa":["exact_match,none"],
    "flickr30k":["flickr_CIDEr,none"],
    "hallusion_bench_image":["aAcc,none"],
    "llava_bench_coco":["gpt_eval_llava_all,none"],
    "llava_in_the_wild":["gpt_eval_llava_all,none"],
    "mmbench_cn_dev":["gpt_eval_score,none"],
    "mmbench_en_dev":["gpt_eval_score,none"],
    "mmvet":["gpt_eval_score,none"],
    "ok_vqa":["exact_match,none"],
}
# Task name -> reference score. get_results_to_list substitutes these for the first
# metric of a task when the method is named "baseline".
# NOTE(review): presumably the numbers reported in the LLaVA paper (per the name) — confirm.
llava_paper_dict = {
    # "vqav2": 78.5,
    "gqa": 62.0,
    "vizwiz_vqa": 50.0,
    "scienceqa_img": 66.8,
    # "textvqa": 58.2,
    "pope": 85.9,
    "mme": 1510.7,
    "mmbench_en_dev": 64.3,
    "mmbench_cn_dev": 58.3,
    "seedbench": 58.6,
    "llava_in_the_wild": 63.4,
    "mmvet": 30.5,
}


# NOTE(review): presumably training-set sizes in samples (561.6K and 242.2K) for two
# fine-tuning configurations — confirm. Neither name is referenced by the code visible here.
ft0_train_data_length = 561.6*1000
ft1_train_data_length = 242.2*1000




In [3]:

# Row label used in the LaTeX tables -> {"data": loaded results for that model}.
method_dict = {
    label: {"data": results}
    for label, results in (
        ("LLaVA 1.5", baseline_data),
        ("ShareGPT4V", share4v_data),
        ("mPLUG-Owl", mplug_owl_chat_data),
    )
}
# Row order of the tables follows the insertion order above.
method_list = list(method_dict)

In [4]:
def get_results_to_list(method_list,method_dict,metric_list,metric_dict):
    """Collect one flat list of scores per method, in table-column order.

    Args:
        method_list: Method names to include; each must be a key of ``method_dict``.
        method_dict: Maps method name -> ``{"data": results}`` where ``results`` maps
            task name -> mapping of metric key -> value.
        metric_list: Task names to report, in column order.
        metric_dict: Maps task name -> list of metric keys to read for that task.

    Returns:
        A dict mapping each method name to a list holding one value per
        (task, metric key) pair. Values are multiplied by 100 unless the task is in
        the unscaled set below. A metric key that is absent from a task's results
        yields the sentinel ``-1`` (left unscaled).

    Raises:
        KeyError: If a method is missing from ``method_dict``, a task is missing from
            ``metric_dict``, or a task is missing from a method's results.
    """
    # Tasks whose values are reported as stored instead of being multiplied by 100.
    unscaled_tasks = {"mme","mmbench_cn_dev","mmbench_en_dev","llava_bench_coco","llava_in_the_wild"}
    missing = -1

    res = {}
    for method in method_list:
        data = method_dict[method]["data"]
        method_res = []
        for metric in metric_list:
            for metric_idx,metric_item in enumerate(metric_dict[metric]):
                # Only a method literally named "baseline" takes the reference score from
                # the module-level llava_paper_dict, and only for a task's first metric.
                if method == "baseline" and metric_idx == 0 and llava_paper_dict.get(metric,-1) != -1:
                    metric_res = llava_paper_dict[metric]
                else:
                    task_scores = data[metric]
                    if metric_item in task_scores:
                        metric_res = task_scores[metric_item]
                        if metric not in unscaled_tasks:
                            metric_res *= 100
                    else:
                        # Keep the sentinel as-is: scaling it used to turn a missing
                        # metric into -100, which then skewed the table and its average.
                        metric_res = missing
                method_res.append(metric_res)
        res[method] = method_res
    return res

def form_latex_str(res,method_dict):
    """Render per-method score lists as LaTeX table rows.

    Each row is the method name, one cell per score formatted to one decimal, and a
    final cell with the mean of that row's scores. Within a column, every score equal
    to the highest value is wrapped in a bold (textbf) command and every score equal
    to the second entry from the top of the sorted column is wrapped in an underline
    command.

    Args:
        res: Maps method name -> list of numeric scores. The column count is taken
            from the first method's list.
        method_dict: Not used by this function; kept so existing calls keep working.

    Returns:
        One string containing a line per method, each terminated by a LaTeX row break
        and a newline.
    """
    names = list(res)
    n_cols = len(res[names[0]])
    # Ascending scores of all methods for each column: [-1] is the best, [-2] the runner-up.
    ranked_cols = [sorted(res[name][col] for name in names) for col in range(n_cols)]

    def render_cell(score, ranked):
        # The runner-up is looked up only when the score is not the best one, so a
        # table with a single method never indexes past the start of the list.
        if score == ranked[-1]:
            return f"& \\textbf{{{score:.1f}}}"
        if score == ranked[-2]:
            return f"& \\underline{{{score:.1f}}}"
        return f"& {score:.1f}"

    rows = []
    for name in names:
        scores = [res[name][col] for col in range(n_cols)]
        cells = "".join(render_cell(score, ranked) for score, ranked in zip(scores, ranked_cols))
        mean = sum(scores) / len(scores)
        rows.append(f"{name} " + cells + f" & {mean:.1f}" + "\\\\ \n")
    return "".join(rows)
        

In [5]:
# First table: scores for the tasks in metric1_list, one LaTeX row per method in method_list.
res = get_results_to_list(method_list,method_dict,metric1_list,metric_dict)
res_str = form_latex_str(res,method_dict)
print(res_str)

LLaVA 1.5 & \underline{76.6}& \underline{46.0}& \underline{53.2}& \underline{61.9}& \underline{54.2}& \underline{70.4}& \textbf{69.3}& \underline{29.4}& \underline{28.5}& \underline{74.9}& \textbf{86.9}& \underline{60.6}& \textbf{35.3} & 57.5\\ 
ShareGPT4V & \textbf{78.6}& \textbf{50.2}& \textbf{54.0}& \textbf{63.3}& \textbf{59.1}& \textbf{71.4}& \underline{68.9}& \textbf{37.6}& \textbf{34.3}& \textbf{78.5}& \underline{86.8}& \textbf{63.2}& \underline{35.1} & 60.1\\ 
mPLUG-Owl & 27.6& 6.3& 2.4& 28.2& 50.6& 0.1& 0.0& 0.0& 0.0& 2.2& 0.0& 17.6& 24.9 & 12.3\\ 



In [31]:
# Second table: same pipeline as the previous cell, over the tasks in metric2_list.
# NOTE(review): the saved output under this cell (execution count 31) lists methods such as
# "baseline" and "all+$\alpha=1.0$" that are not keys of the current method_dict, so it
# appears to be stale — restart the kernel and re-run all cells to refresh it.
res = get_results_to_list(method_list,method_dict,metric2_list,metric_dict)
res_str = form_latex_str(res,method_dict)
print(res_str)

baseline & 558.0K & 665.0K & 85.9& \underline{1510.7}& \textbf{349.6}& 58.6& \textbf{58.0}& 35.3 & 349.7\\ 
all+$\alpha=1.0$ & 558.0K & 581.7K & \textbf{87.2}& 1470.6& 329.6& 61.0& 57.2& 34.8 & 340.1\\ 
toc+$\alpha=1.0$ & 558.0K & 561.5K & 86.6& 1510.5& 316.8& 60.6& 57.1& 35.2 & 344.5\\ 
all+$\alpha=0.8$+aug & 558.0K & 664.3K & 86.9& 1505.0& 300.7& 60.9& 57.1& 35.2 & 341.0\\ 
toc+$\alpha=0.8$+aug & 558.0K & 665.4K & \underline{87.1}& 1480.6& \underline{347.9}& \underline{61.0}& 57.2& \underline{36.0} & 345.0\\ 
all+$\alpha=1.0$+aug & 558.0K & 665.7K & 86.9& \textbf{1511.3}& 291.8& \textbf{61.3}& \underline{57.4}& \textbf{36.3} & 340.8\\ 



In [None]:
## one aspect