# Generate Instruction for Graph Judgement
Source: `target.source`, Output: `train_instructions_llama.json`
```json
[
    {
        "instruction": "Is this true: Philippine one hundred-peso note face value 100?",
        "input": "",
        "output": "Yes, this is true."
    },
    {
        "instruction": "Is this true: Philippine one hundred-peso note face value Philippine president?",
        "input": "",
        "output": "No, this is not true."
    }
]
```

In [None]:
"""
Target: formulate json data for kg completion model
Format: 
    [
        {
            "instruction": "Is this true: ...?",
            "input": "",
            "output": "Yes, this is true." or "No, this is not true."
        },
        ..
    ]
"""

import os
import time
import json
import functools
import ast
import random
from tqdm import tqdm

# Fixed answers used as the "output" field of every training example.
_POSITIVE_OUTPUT = "Yes, this is true."
_NEGATIVE_OUTPUT = "No, this is not true."


def build_instructions(triple_list, choose=random.choice):
    """Build the graph-judgement training examples for one document.

    Args:
        triple_list: The triples of a single document, each a sequence of
            the form ``[head, relation, tail]``. Entries with fewer than two
            elements are skipped; two-element entries only yield a positive
            example.
        choose: Callable that picks one element from a non-empty list of
            candidate tails. Defaults to ``random.choice``.

    Returns:
        A list of ``{"instruction", "input", "output"}`` dicts. Every usable
        entry yields one positive example; every entry with three or more
        elements additionally yields one negative example when a suitable
        replacement tail exists in the same document.
    """
    examples = []
    # Replacement tails are drawn from the last element of every non-empty
    # entry of the same document.
    tails = [entry[-1] for entry in triple_list if entry]
    # Triples known to be true in this document. Kept as lists and compared
    # with ``in`` so that unhashable elements do not matter.
    known = [list(entry[:3]) for entry in triple_list if len(entry) >= 3]

    for entry in triple_list:
        if len(entry) < 2:
            continue
        if len(entry) == 2:
            # The trailing "?" keeps this in the same "Is this true: ...?"
            # format as every other instruction.
            examples.append({
                "instruction": f"Is this true: {entry[0]} {entry[1]}?",
                "input": "",
                "output": _POSITIVE_OUTPUT,
            })
            continue

        head, relation, tail = entry[0], entry[1], entry[2]

        # positive instance
        examples.append({
            "instruction": f"Is this true: {head} {relation} {tail}?",
            "input": "",
            "output": _POSITIVE_OUTPUT,
        })

        # negative instance: swap in a tail from the same document, but never
        # one that would reproduce a triple the document states as true
        # (this also excludes the entry's own tail). Otherwise the same
        # question could be labelled both "Yes" and "No".
        candidates = [t for t in tails if [head, relation, t] not in known]
        if candidates:
            examples.append({
                "instruction": f"Is this true: {head} {relation} {choose(candidates)}?",
                "input": "",
                "output": _NEGATIVE_OUTPUT,
            })
    return examples


# In a notebook cell or a script run `__name__` is "__main__", so the section
# below executes as usual; the guard only keeps the file I/O from running
# when the helper above is imported.
if __name__ == "__main__":
    triples = []
    dataset_path = './GPT3.5_result_rebel_sub/' # ./GPT3.5_result_rebel/ ./GPT3.5_result_webnlg/

    # read triples: one Python-literal list of triples per line
    with open(dataset_path + 'train.source', 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # a blank (e.g. trailing) line carries no triples
                continue
            triples.append(ast.literal_eval(line))

    # generate training data
    res_list = []
    for triple_list in tqdm(triples):
        res_list.extend(build_instructions(triple_list))

    # write into file
    with open(dataset_path + 'train_instructions_llama.json', 'w', encoding='utf-8') as f:
        json.dump(res_list, f, indent=4)

# Generate test data for Graph Judgement

Generate test data from the graphs generated in the i-th iteration.

Example:
```csv
prompt,response
Is this true: Coburg Peak instance of Rocky peak?,**
Is this true: Coburg Peak located in Erul Heights?,**
```

In [None]:
"""
Target: formulate the generated graphs in the text data format used by the KG completion model.
"""

import csv
import os
import time
import json
import functools
import ast
import random
from tqdm import tqdm

def triple_to_prompt(triple):
    """Render one generated triple as a graph-judgement question.

    Every element is converted with ``str`` and the elements are joined with
    single spaces. Entries that are not exactly three items long, or that
    contain non-string values such as numbers, therefore still produce a
    prompt instead of raising ``TypeError``.

    Args:
        triple: Sequence of triple elements, normally
            ``[subject, predicate, object]``.

    Returns:
        The prompt string ``"Is this true: <elements>?"``.
    """
    return f"Is this true: {' '.join(str(part) for part in triple)}?"


# In a notebook cell or a script run `__name__` is "__main__", so the section
# below executes as usual; the guard only keeps the file I/O from running
# when the helper above is imported.
if __name__ == "__main__":
    # read triples generated
    triples = []
    dataset_path = './GPT3.5_result_GenWiki-Hard/'
    Iteration = 3

    # read triples: each line is a Python-literal list of triples; the lists
    # of all lines are flattened into one list of triples
    with open(dataset_path + f'Graph_Iteration{Iteration}/test_generated_graphs.txt', 'r', encoding='utf-8') as f:
        for line in f:
            triples.extend(ast.literal_eval(line.strip()))

    # write to the CSV file
    with open(dataset_path + f'Graph_Iteration{Iteration}/test_instructions_llama2_7b_itr{Iteration}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['prompt', 'response']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # write the header row
        writer.writeheader()

        # one row per triple; "**" is the placeholder response
        for triple in tqdm(triples):
            writer.writerow({'prompt': triple_to_prompt(triple), 'response': "**"})

    print("CSV 文件已创建！")

In [None]:
"""
Target: formulate the generated graphs in the text data format used by the KG completion model.
"""

import csv
import os
import time
import json
import functools
import ast
import random
from tqdm import tqdm

# Errors that signal a malformed line: the ones `ast.literal_eval` documents,
# plus the TypeError raised when the parsed value is not iterable.
_PARSE_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)


def parse_generated_triples(line):
    """Parse one line of the generated-graph file into a list of triples.

    Args:
        line: Text of one line, expected to be a Python-literal list of
            triples.

    Returns:
        The parsed triples as a list. If the line cannot be parsed, or the
        parsed value is not iterable, a single placeholder triple
        ``['none', 'none', 'none']`` is returned so that the line still
        contributes one row.
    """
    try:
        return list(ast.literal_eval(line.strip()))
    except _PARSE_ERRORS:
        return [['none', 'none', 'none']]


def triple_to_prompt(triple):
    """Render one generated triple as a graph-judgement question.

    Every element is converted with ``str`` and the elements are joined with
    single spaces, whatever the length of the entry.

    Args:
        triple: Sequence of triple elements, normally
            ``[subject, predicate, object]``.

    Returns:
        The prompt string ``"Is this true: <elements>?"``.
    """
    return f"Is this true: {' '.join(str(part) for part in triple)}?"


# In a notebook cell or a script run `__name__` is "__main__", so the section
# below executes as usual; the guard only keeps the file I/O from running
# when the helpers above are imported.
if __name__ == "__main__":
    # read triples generated
    triples = []
    dataset_path = './GPT3.5_result_GenWiki-Hard/'

    # read triples, flattening the per-line lists into one list of triples
    with open(dataset_path + 'gpt_baseline/test_generated_graphs.txt', 'r', encoding='utf-8') as f:
        for line in f:
            triples.extend(parse_generated_triples(line))

    # write to the CSV file
    with open(dataset_path + 'gpt_baseline/test_instructions_llama2_7b_gpt.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['prompt', 'response']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # write the header row
        writer.writeheader()

        # one row per triple; "**" is the placeholder response
        for triple in tqdm(triples):
            writer.writerow({'prompt': triple_to_prompt(triple), 'response': "**"})

    print("CSV 文件已创建！")

# Filter the generated graphs with Judgement result
This is where we get the final result.

In [None]:
"""
Target: Remove the incorrect triples generated from text.
"""
import csv
import os
import time
import json
import functools
import ast
import random
import pandas as pd
from tqdm import tqdm

# A filtered graph is only used when it keeps at least this many triples;
# otherwise the unfiltered graph is written instead.
# NOTE(review): presumably this avoids emitting near-empty graphs - confirm.
_MIN_KEPT_TRIPLES = 4


def is_accepted(response):
    """Decide whether a judgement response accepts the triple.

    Args:
        response: Text generated by the judgement model. Non-string values
            (for example the NaN pandas uses for an empty cell) count as a
            rejection.

    Returns:
        ``False`` when the first 10 characters of the lower-cased response
        contain ``"no"`` or ``"false"`` or when the value is not a string;
        ``True`` otherwise.
    """
    try:
        head = response.lower()[:10]
    except AttributeError:
        return False
    return not ('no' in head or 'false' in head)


def filter_triples(graphs, verdicts, min_kept):
    """Drop rejected triples from every graph.

    Args:
        graphs: One sequence of triples per document.
        verdicts: Flat list of booleans, one per triple, in the order the
            triples appear when the graphs are read one after another.
        min_kept: If fewer than this many triples of a graph survive, the
            original graph is kept unchanged.

    Returns:
        A list with one graph per input graph.

    Raises:
        ValueError: If there are fewer verdicts than triples.
    """
    total = sum(len(graph) for graph in graphs)
    if len(verdicts) < total:
        raise ValueError(
            f"got {len(verdicts)} judgement results for {total} triples"
        )

    filtered = []
    cursor = 0
    for graph in graphs:
        flags = verdicts[cursor:cursor + len(graph)]
        cursor += len(graph)
        kept = [triple for triple, ok in zip(graph, flags) if ok]
        filtered.append(graph if len(kept) < min_kept else kept)
    return filtered


# In a notebook cell or a script run `__name__` is "__main__", so the section
# below executes as usual; the guard only keeps the file I/O from running
# when the helpers above are imported.
if __name__ == "__main__":
    triples = []
    dataset_path = './GPT3.5_result_GenWiki-Hard/'
    Iteration = 3

    # read triples: one Python-literal list of triples per line (per document)
    with open(dataset_path + f'Graph_Iteration{Iteration}/test_generated_graphs.txt', 'r', encoding='utf-8') as f:
        for line in f:
            triples.append(ast.literal_eval(line.strip()))

    pred_res = pd.read_csv(dataset_path + f'Graph_Iteration{Iteration}/pred_instructions_llama2_7b_itr{Iteration}.csv', header=0, sep=',')
    # A missing 'generated' column now raises KeyError instead of silently
    # marking every triple as rejected.
    res_list = [is_accepted(response) for response in tqdm(pred_res['generated'])]

    new_triples = filter_triples(triples, res_list, _MIN_KEPT_TRIPLES)

    with open(dataset_path + f'Graph_Iteration{Iteration}/test_generated_graphs_final.txt', 'w', encoding='utf-8') as f:
        for doc in new_triples:
            f.write(str(doc).replace('\n', '') + '\n')

In [None]:
"""
Target: Remove the incorrect triples generated from text.
"""
import csv
import os
import time
import json
import functools
import ast
import random
import pandas as pd
from tqdm import tqdm

# A filtered graph is only used when it keeps at least this many triples;
# otherwise the unfiltered graph is written instead.
# NOTE(review): presumably this avoids emitting near-empty graphs - confirm.
_MIN_KEPT_TRIPLES = 5

# Errors `ast.literal_eval` is documented to raise on malformed input.
_PARSE_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)


def parse_generated_graph(line):
    """Parse one line of the generated-graph file into one graph.

    Args:
        line: Text of one line, expected to be a Python-literal list of
            triples.

    Returns:
        The parsed value, or ``[['none', 'none', 'none']]`` (a graph holding
        one placeholder triple) when the line cannot be parsed.
    """
    try:
        return ast.literal_eval(line.strip())
    except _PARSE_ERRORS:
        return [['none', 'none', 'none']]


def is_accepted(response):
    """Decide whether a judgement response accepts the triple.

    Args:
        response: Text generated by the judgement model. Non-string values
            (for example the NaN pandas uses for an empty cell) count as a
            rejection.

    Returns:
        ``False`` when the first 10 characters of the lower-cased response
        contain ``"no"`` or ``"false"`` or when the value is not a string;
        ``True`` otherwise.
    """
    try:
        head = response.lower()[:10]
    except AttributeError:
        return False
    return not ('no' in head or 'false' in head)


def filter_triples(graphs, verdicts, min_kept):
    """Drop rejected triples from every graph.

    Args:
        graphs: One sequence of triples per document.
        verdicts: Flat list of booleans, one per triple, in the order the
            triples appear when the graphs are read one after another.
        min_kept: If fewer than this many triples of a graph survive, the
            original graph is kept unchanged.

    Returns:
        A list with one graph per input graph.

    Raises:
        ValueError: If there are fewer verdicts than triples.
    """
    total = sum(len(graph) for graph in graphs)
    if len(verdicts) < total:
        raise ValueError(
            f"got {len(verdicts)} judgement results for {total} triples"
        )

    filtered = []
    cursor = 0
    for graph in graphs:
        flags = verdicts[cursor:cursor + len(graph)]
        cursor += len(graph)
        kept = [triple for triple, ok in zip(graph, flags) if ok]
        filtered.append(graph if len(kept) < min_kept else kept)
    return filtered


# In a notebook cell or a script run `__name__` is "__main__", so the section
# below executes as usual; the guard only keeps the file I/O from running
# when the helpers above are imported.
if __name__ == "__main__":
    triples = []
    dataset_path = './GPT3.5_result_GenWiki-Hard/'

    # read triples: one graph per line; unparsable lines become a graph with
    # a single placeholder triple
    with open(dataset_path + 'gpt_baseline/test_generated_graphs.txt', 'r', encoding='utf-8') as f:
        for line in f:
            triples.append(parse_generated_graph(line))

    pred_res = pd.read_csv(dataset_path + 'gpt_baseline/pred_instructions_llama2_7b_gpt.csv', header=0, sep=',')
    # A missing 'generated' column now raises KeyError instead of silently
    # marking every triple as rejected.
    res_list = [is_accepted(response) for response in tqdm(pred_res['generated'])]

    new_triples = filter_triples(triples, res_list, _MIN_KEPT_TRIPLES)

    with open(dataset_path + 'gpt_baseline/test_generated_graphs_final.txt', 'w', encoding='utf-8') as f:
        for doc in new_triples:
            f.write(str(doc).replace('\n', '') + '\n')