In [2]:
import os
print(f"Current working folder: {os.getcwd()}")

os.chdir('D:/My Document/Khóa Luận Tốt Nghiệp/Model Reposity/RAG_LLM_DA')
print(f"Current working folder: {os.getcwd()}")
import pandas as pd
from stages.stage_1_learn_rules_from_data.data_loader import DataLoader
from stages.stage_1_learn_rules_from_data.temporal_walk import TemporalWalker
from stages.stage_1_learn_rules_from_data.temporal_walk import store_edges
from stages.stage_1_learn_rules_from_data.rule_learning import RuleLearner, rules_statistics
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from openai_llm.llm_init import LLM_Model
from utils import load_json_data, save_json_data, load_vectorstore_db, load_learn_data, lookup_vector_db

Current working folder: D:\My Document\Khóa Luận Tốt Nghiệp\Model Reposity\RAG_LLM_DA
Current working folder: D:\My Document\Khóa Luận Tốt Nghiệp\Model Reposity\RAG_LLM_DA


In [3]:
llm_instance = LLM_Model()
dataset_dir = os.path.join(".", "datasets", 'icews14')
dir_path = os.path.join(".", "result", 'icews14', "stage_3")

data = DataLoader(dataset_dir)
test_data = data.test_data_text
rule_regex = load_json_data("config/rule_regex.json")['icews14']
temporal_walk_data = load_learn_data(data, 'all')
temporal_walk = TemporalWalker(temporal_walk_data, data.inverse_rel_idx, 'exp')
rl = RuleLearner(temporal_walk.edges, data.relation2id, data.id2entity, data.id2relation, data.inverse_rel_idx, 
                    'icews14', len(temporal_walk_data), dir_path)

transformed_relations = load_json_data('result/icews14/stage_1/transformed_relations.json')

Loading data from config/llm_config.json
Loading data from .\datasets\icews14\entity2id.json
Loading data from .\datasets\icews14\relation2id.json
Loading data from .\datasets\icews14\ts2id.json
Loading data from config/rule_regex.json
Loading data from result/icews14/stage_1/transformed_relations.json


In [5]:
vector_db = load_vectorstore_db(llm_instance, 'icews14')
for collection in vector_db:
    print(f"{collection}: {len(vector_db[collection]['vector_db'].get()['documents'])} documents")

Loading data from config/data_embedding.json
facts: 166718 documents
rules: 0 documents


### Convert query to natural question

In [4]:
test_query = test_data[12365]

if "inv_" in test_query[1]:
    question = f"{test_query[0]} {transformed_relations[test_query[1]]} by whom on {test_query[3]}?"
else:
    question = f"{test_query[0]} {transformed_relations[test_query[1]]} to/with whom on {test_query[3]}?"
test_query, question

(['Yemi_Osinbajo', 'inv_Make_statement', 'Kayode_Fayemi', '2014-12-19'],
 'Yemi_Osinbajo received a statement from by whom on 2014-12-19?')

### Get `top_k` most similar relations to the relation in the query

In [5]:
test_query[1]

'inv_Make_statement'

In [6]:
import numpy as np

def get_top_k_relations(similarity_matrix, relation_id, top_k=5):
    relation_similarity = similarity_matrix[relation_id]
    print(relation_similarity[3])
    top_k_relations = np.argsort(relation_similarity)[::-1][:top_k]
    return top_k_relations.tolist()

similarity_matrix = np.load('result/icews14/stage_1/relation_similarity.npy')
relation_id = data.relation2id[test_query[1]]
top_k_id = get_top_k_relations(similarity_matrix, relation_id)
top_k_id.append(relation_id)
top_k_rels = [data.id2relation[rel_id] for rel_id in top_k_id]
for rel_id, rel in zip(top_k_id, top_k_rels):
    print(f"{rel_id}: {rel}")

0.29735334900780497
0: Make_statement
232: inv_Make_an_appeal_or_request
252: inv_Make_optimistic_comment
311: inv_Express_intent_to_settle_dispute
328: inv_Apologize
230: inv_Make_statement


### Get all related rules

In [164]:
rules_df = pd.read_csv('result/icews14/stage_2/01_only_Markovian_merged_results.csv')
for _, entry in rules_df.iterrows():
    rl.create_rule_from_series_df(entry=entry, rule_regex=rule_regex)
rules_dict = rl.rules_dict
print("Rules statistics:")
rules_statistics(rules_dict)
# rules_df.columns

Rules statistics:
Number of relations with rules:  259
Total number of rules:  7609
Number of rules by length:  [(1, 1063), (2, 550), (3, 5996)]


In [165]:
def filter_related_rules(rules_dict, rel_id, top_k_id):
    related_rules = []
    rules = rules_dict[rel_id]
    print(len(rules))
    for rule in rules:
        if list(set(top_k_id) & set(rule['body_rels'])):
            related_rules.append(rule)

    return related_rules


In [166]:
related_rules = filter_related_rules(rules_dict, data.relation2id[test_query[1]], top_k_id)
len(related_rules)

67


35

In [167]:
for rule in related_rules:
    print(rule['verbalized_rule'])

Make_statement(X0,X1,T3)<-Make_statement(X0,X1,T0)&Use_conventional_military_force(X1,X2,T1)&Use_unconventional_violence(X2,X1,T2)
Make_statement(X0,X2,T3)<-Make_an_appeal_or_request(X0,X1,T0)&inv_Make_statement(X1,X0,T1)&Make_statement(X0,X2,T2)
Make_statement(X0,X1,T3)<-Make_statement(X0,X1,T0)&inv_Express_intent_to_cooperate_militarily(X1,X2,T1)&Express_intent_to_cooperate_militarily(X2,X1,T2)
Make_statement(X0,X1,T3)<-Make_statement(X0,X1,T0)&inv_Sign_formal_agreement(X1,X2,T1)&inv_Express_intent_to_engage_in_diplomatic_cooperation_(such_as_policy_support)(X2,X1,T2)
Make_statement(X0,X1,T3)<-Make_statement(X0,X1,T0)&inv_Engage_in_diplomatic_cooperation(X1,X2,T1)&inv_Engage_in_diplomatic_cooperation(X2,X1,T2)
Make_statement(X0,X1,T3)<-Make_statement(X0,X1,T0)&Arrest,_detain,_or_charge_with_legal_action(X1,X2,T1)&inv_Investigate(X2,X1,T2)
Make_statement(X0,X1,T3)<-Make_statement(X0,X1,T0)&Arrest,_detain,_or_charge_with_legal_action(X1,X2,T1)&inv_Arrest,_detain,_or_charge_with_legal_a

### Get related facts

In [None]:
search_content = ""
for rel_id in top_k_rels:
    search_content += f"{data.id2relation[rel_id]} "

In [168]:
search_content = transformed_relations[test_query[1]]

In [169]:
def lookup_facts(vector_db, subject, search_content, top_k=30):
    filter = {"subject": subject}
    docs = lookup_vector_db(search_content, filter, vector_db, llm_instance, top_k=top_k)
    return docs

In [170]:
def get_related_facts(search_content, vector_db, llm_instance, related_facts, candidates_dict, n = 0):
    if n == 3:
        return related_facts
    cands = candidates_dict[n]
    candidates_dict[n+1] = set()
    for can in cands:
        docs = lookup_facts(vector_db, can, search_content, top_k=20//(len(cands)))
        for doc in docs:
            # print(doc.page_content)
            candidates_dict[n+1].add(doc.metadata['object'])
            related_facts.append(doc.page_content)
    print(len(related_facts))
    return get_related_facts(search_content, vector_db, llm_instance, related_facts, candidates_dict, n+1)

candidates_dict = {0: {test_query[0]}}    
related_facts = get_related_facts(search_content, vector_db['facts']['vector_db'], llm_instance, [], candidates_dict)


20
34
54


In [171]:
candidates_dict

{0: {'Kashim_Shettima'},
 1: {'Boko_Haram',
  'Government_(Nigeria)',
  'Head_of_Government_(Nigeria)',
  'Indigenous_People_(Nigeria)',
  'Militant_(Nigeria)',
  'Military_Personnel_-_Special_(Nigeria)',
  'Suleiman_Abba'},
 2: {'Alhaji_Garba_Umar',
  'Education_(Nigeria)',
  'Government_(Nigeria)',
  'Kashim_Shettima',
  'Kayode_Fayemi',
  'Labaran_Maku',
  'Media_(Nigeria)',
  'Militant_(Boko_Haram)',
  'Militant_(Nigeria)',
  'Nigerian_Navy'},
 3: {'Aderenle_Shinaba',
  'Alhaji_Garba_Umar',
  'Angola',
  'Boko_Haram',
  'Government_(Nigeria)',
  'Kashim_Shettima',
  'Kayode_Fayemi',
  'Media_(Nigeria)',
  'Militant_(Nigeria)',
  'Muslim_(Nigeria)',
  'Police_(Benin)',
  'Rauf_Aregbesola',
  'Suleiman_Abba'}}

In [159]:
related_facts

['"UN_Security_Council received an appeal or request from to/with Office_of_the_United_Nations_High_Commissioner_for_Human_Rights on 2014-05-06"',
 '"UN_Security_Council received an appeal or request from to/with Citizen_(Sudan) on 2014-08-25"',
 '"UN_Security_Council received an appeal or request from to/with Iran on 2014-08-29"',
 '"UN_Security_Council received an appeal or request from to/with Citizen_(Sudan) on 2014-02-05"',
 '"UN_Security_Council received an appeal or request from to/with France on 2014-08-07"',
 '"UN_Security_Council received an appeal or request from to/with Iran on 2014-09-01"',
 '"UN_Security_Council received an appeal or request from to/with Amnesty_International on 2014-06-20"',
 '"UN_Security_Council received an appeal or request from to/with Amnesty_International on 2014-03-06"',
 '"UN_Security_Council received an appeal or request from to/with Iran on 2014-01-21"',
 '"UN_Security_Council received an appeal or request from to/with The_Hague on 2014-05-12"'

### Prompt

In [None]:
system_msg_content = f'''
You are an expert in Temporal Knowledge Graphs, utilizing data consisting of events and activities worldwide involving countries, organizations, famous individuals, etc. 
Your task is considered as Temporal Knowledge Graph Reasoning, which is to predict the missing object in a given fact which is from the test dataset. A fact is a quadruple: subject, relation, object, time.
You will be given some rules which are mined from the historical data (train and validation datasets). These rules
To tackle this task, you will need to follow these instructions carefully:
- 
'''
system_msg = SystemMessage(content=system_msg_content)

user_msg_content = f'''

'''
user_msg = HumanMessage(content=user_msg_content)
llm_instance.run_task([system_msg, user_msg])