In [1]:
import os
import sys
import json

# Expose only CUDA device 0 to this process. This is set before the swift
# imports below so that it is already in place when they run.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Extra directories that must be importable for the project-local modules
# imported further down in this cell.
dirs = [
    "..",
    "/home/hanlv/workspace/code/research/infodemic/LLM/swift/examples/pytorch/llm/my_inferencing/create_prompt_llm",
]
for _dir in dirs:
    # Re-running the cell must not add the same entry twice.
    if _dir in sys.path:
        continue
    sys.path.append(_dir)

from swift.llm import (
    get_model_tokenizer, get_template, inference, 
)
from swift.tuners import Swift
from custom import CustomModelType, CustomTemplateType
import covmis, liar2
import prompt_rag

# Fine-tuned checkpoint directories, one per dataset.
ckpt_dir_cvomis = "/home/hanlv/workspace/code/research/infodemic/LLM/swift/examples/pytorch/llm/output/covmis/Llama-3-8B-Instruct/with_llama3_info/brave/data1-split=8:2-ratio=1.0/dora-r=8/lr=9e-5-20240626-01:48:13/checkpoint-609"
ckpt_dir_liar2 = "/home/hanlv/workspace/code/research/infodemic/LLM/swift/examples/pytorch/llm/output/liar2/Llama-3-8B-Instruct/with_llama3_info/brave/data1.2-split=8:1:1-ratio=1.0-epochs=1/dora-r=8/lr=1.5e-4-20240723-10:07:49/checkpoint-611"
# Correctly spelled alias; the misspelled name above is kept so that any
# existing reference to it keeps working.
ckpt_dir_covmis = ckpt_dir_cvomis

# Checkpoint used for the rest of the notebook.
ckpt_dir = ckpt_dir_liar2

# Training arguments saved next to the checkpoint. The encoding is given
# explicitly so that decoding does not depend on the machine's locale.
with open(os.path.join(ckpt_dir, "sft_args.json"), "r", encoding="utf-8") as f:
    sft_args = json.load(f)

def get_model_template(max_new_tokens: int = 512):
    """Load the base model with the checkpoint at ``ckpt_dir`` and build its template.

    Reads the module-level ``sft_args`` (parsed ``sft_args.json``) and
    ``ckpt_dir``; both must be defined before this function is called.

    Args:
        max_new_tokens: Value written to ``model.generation_config.max_new_tokens``.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        A ``(model, template)`` tuple: the model returned by
        ``Swift.from_pretrained`` and the template returned by ``get_template``.
    """
    model_type = sft_args["model_type"]
    template_type = sft_args["template_type"]

    model, tokenizer = get_model_tokenizer(
        model_type,
        model_kwargs={'device_map': 'auto'},
        # model_dir=sft_args["model_cache_dir"],
        use_flash_attn=sft_args["use_flash_attn"],
    )

    # Wrap the base model with the tuner weights stored in the checkpoint directory.
    model = Swift.from_pretrained(model, ckpt_dir, inference_mode=True)
    if sft_args["sft_type"] == 'adalora':
        # Re-cast the wrapped model to its own dtype.
        # NOTE(review): presumably aligns adapter weights with the base dtype - confirm.
        model = model.to(model.dtype)

    # Generation settings: bounded output length, sampling switched off.
    model.generation_config.max_new_tokens = max_new_tokens
    # model.generation_config.temperature = None
    model.generation_config.do_sample = False

    template = get_template(template_type, tokenizer)

    return model, template

# Build the model and the prompt template once; the inference loop in a later
# cell passes both to `inference(...)`.
model, template = get_model_template()


[INFO:swift] Successfully registered `/home/hanlv/workspace/code/research/infodemic/LLM/swift/swift/llm/data/dataset_info.json`
[INFO:swift] Loading the model using model_dir: /home/css/models/Meta-Llama-3-8B-Instruct
[INFO:swift] Setting torch_dtype: torch.bfloat16
[INFO:swift] model_config: LlamaConfig {
  "_name_or_path": "/home/css/models/Meta-Llama-3-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[INFO:swift] model.max_model_len: 8192


In [2]:
# Integer class ids for the six LIAR2 verdict strings.
label_convert_liar2 = {
    'pants-fire': 0,
    'false': 1,
    'barely-true': 2,
    'half-true': 3,
    'mostly-true': 4,
    'true': 5,
}

search_engine = "brave"

dataset = "liar2"  # one of: liar2, covmis
data_type = "valid"  # one of: train, test, valid

# Reject an unknown dataset name before anything is loaded.
if dataset not in ("covmis", "liar2"):
    raise Exception("数据集错误")

if dataset == "covmis":
    # The covmis loaders below are called without `data_type`.
    data = covmis.load_train()
    data_search = covmis.load_train_search(search_engine=search_engine)
    data_search_llm = covmis.load_train_llm(search_engine=search_engine)

    claim_key = 'claim'
    claimant_key = 'None'

    LABEL_TRUE = 2
    LABEL_FALSE = 0
    true_labels_original = [LABEL_TRUE]
    false_labels_original = [LABEL_FALSE]

    def save_data(data):
        """Write `data` back through `covmis.save_train`."""
        return covmis.save_train(data)

    def save_search(data):
        """Write search results back for the configured search engine."""
        return covmis.save_train_search(data, search_engine=search_engine)

    def save_search_llm(data):
        """Write LLM-derived search data back for the configured search engine."""
        return covmis.save_train_llm(data, search_engine=search_engine)
else:
    # dataset == "liar2"
    data = liar2.load_data(data_type)
    data_search = liar2.load_data_search(data_type, search_engine)
    data_search_llm = liar2.load_data_llm(data_type, search_engine)

    claim_key = 'statement'
    claimant_key = 'None'

    # Only 'true' counts as true here; 'mostly-true' and 'half-true' are
    # intentionally left out:
    #     label_convert_liar2['mostly-true'],
    #     label_convert_liar2['half-true']
    true_labels_original = [label_convert_liar2['true']]

    # 'false' and 'pants-fire' count as false; 'barely-true' is intentionally
    # left out:
    #     label_convert_liar2['barely-true'],
    false_labels_original = [
        label_convert_liar2['false'],
        label_convert_liar2['pants-fire'],
    ]

    def save_data(data):
        """Write `data` back through `liar2.save_data` for the current split."""
        return liar2.save_data(data, data_type)

    def save_search(data):
        """Write search results back for the current split and search engine."""
        return liar2.save_data_search(data, data_type, search_engine)

    def save_search_llm(data):
        """Write LLM-derived search data back for the current split and search engine."""
        return liar2.save_data_llm(data, data_type, search_engine)

COVMIS

In [3]:
from tqdm.notebook import tqdm

# Path of the COVMIS data file (not referenced by the loop below).
covmis_dir = "/home/hanlv/workspace/data/machine_learning/dataset/research/misinformation_dataset/COVMIS-2024/data.json"

# with open(f"/home/hanlv/workspace/data/machine_learning/dataset/research/misinformation_dataset/COVMIS-2024/data2.json", "w") as f:
#             json.dump(data_covmis, f, indent=4)

# Settings that select which stored prior-knowledge field is read per item.
K = 5
prior_knowledge_version = "1"
model_name = "llama3"

# Loop-invariant values, computed once instead of on every iteration.
prior_knowledge_key = f"prior_knowledge_{model_name}_v{prior_knowledge_version}_K={K}"
search_results_key = f"{search_engine}_search_results"
binary_labels_original = true_labels_original + false_labels_original

# Give every item a binary "label2":
#   - items whose original label is already in the true/false lists are mapped
#     to the first entry of the matching list;
#   - all other items are classified by the model, whose answer must start with
#     "TRUE" or "FALSE".
for i in tqdm(range(len(data_search_llm))):
    item = data_search_llm[i]

    if data[i]["id"] != item["id"]:
        raise Exception("data 与 data_search_llm 的 id 不匹配！")

    # Normalise the label once so every comparison below uses the same int.
    # Previously only the first test converted it, so a string label such as
    # "5" passed that test and then matched neither list in the fallback branch.
    label = int(data[i]["label"])

    if label not in binary_labels_original:
        prompt = prompt_rag.get_prompt_with_prior_knowledge(
            data[i][claim_key],
            search_engine,
            data_search[i][search_results_key],
            item[prior_knowledge_key],
            K=K,
            claim_date=data[i]["date"],
            # claimant=data[i].get(claimant_key), # required when data_version is *.1
            justification=data[i].get('justification'),
            known_info=True,
            rag_info=True,
            justify_info=False,
            ids=None
        )
        pred_raw = inference(model, template, prompt)[0].strip()

        if pred_raw.startswith("TRUE"):
            data[i]["label2"] = true_labels_original[0]
        elif pred_raw.startswith("FALSE"):
            data[i]["label2"] = false_labels_original[0]
        else:
            raise Exception(f"Error label: {pred_raw}")
    elif label in true_labels_original:
        data[i]["label2"] = true_labels_original[0]
    else:
        # Only labels from false_labels_original can reach this branch.
        data[i]["label2"] = false_labels_original[0]
        

  0%|          | 0/2297 [00:00<?, ?it/s]



In [None]:
# save_data(data)