In [1]:
import os
from pathlib import Path


def find_project_root(start: Path, notebooks_dirname: str = "notebooks") -> Path:
    """Return the directory the notebook should run from.

    Args:
        start: Directory the kernel was started in.
        notebooks_dirname: Name of the folder that holds the notebooks.

    Returns:
        ``start.parent`` when ``start`` is the notebooks folder, otherwise
        ``start`` unchanged. Returning ``start`` in the second case keeps the
        cell idempotent: re-running it never climbs above the project root.
    """
    return start.parent if start.name == notebooks_dirname else start


print(f"Current working folder: {os.getcwd()}")

# Derive the project root from the kernel's start directory instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# The relative paths used later (e.g. "config/...", "result/...") are resolved
# against this directory; the project-local imports below presumably are too.
os.chdir(find_project_root(Path.cwd()))
print(f"Current working folder: {os.getcwd()}")
import pandas as pd
from stages.stage_1_learn_rules_from_data.data_loader import DataLoader
from stages.stage_1_learn_rules_from_data.temporal_walk import TemporalWalker
from stages.stage_1_learn_rules_from_data.temporal_walk import store_edges
from stages.stage_1_learn_rules_from_data.rule_learning import RuleLearner, rules_statistics
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from openai_llm.llm_init import LLM_Model
from utils import load_json_data, save_json_data, load_vectorstore_db, load_learn_data, lookup_vector_db

Current working folder: d:\My Document\Khóa Luận Tốt Nghiệp\Model Reposity\RAG_LLM_DA\notebooks
Current working folder: D:\My Document\Khóa Luận Tốt Nghiệp\Model Reposity\RAG_LLM_DA


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Dataset under study. Every dataset-specific path and lookup in this cell is
# derived from this single constant instead of repeating the literal.
DATASET_NAME = 'icews14'

llm_instance = LLM_Model()
dataset_dir = os.path.join(".", "datasets", DATASET_NAME)
dir_path = os.path.join(".", "result", DATASET_NAME, "stage_3")

data = DataLoader(dataset_dir)
test_data = data.test_data_text
rule_regex = load_json_data("config/rule_regex.json")[DATASET_NAME]

# Build the walker and rule learner from the 'all' learning split.
temporal_walk_data = load_learn_data(data, 'all')
temporal_walk = TemporalWalker(temporal_walk_data, data.inverse_rel_idx, 'exp')
rl = RuleLearner(
    temporal_walk.edges,
    data.relation2id,
    data.id2entity,
    data.id2relation,
    data.inverse_rel_idx,
    DATASET_NAME,
    len(temporal_walk_data),
    dir_path,
)

# Maps a relation name to the phrase used when writing facts and questions as text.
transformed_relations = load_json_data(f'result/{DATASET_NAME}/stage_1/transformed_relations.json')

Loading data from config/llm_config.json
Loading data from .\datasets\icews14\entity2id.json
Loading data from .\datasets\icews14\relation2id.json
Loading data from .\datasets\icews14\ts2id.json
Loading data from config/rule_regex.json
Loading data from result/icews14/stage_1/transformed_relations.json


In [3]:
vector_db = load_vectorstore_db(llm_instance, 'icews14')

# Report how many documents each loaded collection currently holds.
for collection in vector_db:
    store = vector_db[collection]['vector_db']
    n_documents = len(store.get()['documents'])
    print(f"{collection}: {n_documents} documents")

Loading data from config/data_embedding.json
facts: 166718 documents
rules: 0 documents


### Convert query to natural question

In [32]:
# Test quadruple [subject, relation, object, timestamp] used as the running example.
test_query = test_data[4995]
query_relation = test_query[1]

# Inverse relations are marked by the "inv_" *prefix*, so test the prefix rather
# than any occurrence of the substring. Only the connective differs between the
# two question templates.
connective = "by" if query_relation.startswith("inv_") else "to/with"
question = (
    f"{test_query[0]} {transformed_relations[query_relation]} "
    f"{connective} whom on {test_query[3]}?"
)
test_query, question

(['Business_(North_Korea)',
  'Make_pessimistic_comment',
  'Citizen_(North_Korea)',
  '2014-12-19'],
 'Business_(North_Korea) Made pessimistic comment to/with whom on 2014-12-19?')

### Get `top_k` most similar relations to the relation in the query

In [33]:
test_query[1]

'Make_pessimistic_comment'

In [34]:
import numpy as np

def get_top_k_relations(similarity_matrix, relation_id, top_k=10):
    """Return the column indices with the highest scores in one matrix row.

    Args:
        similarity_matrix: 2-D array indexed as ``[relation_id][other_id]``.
        relation_id: Row to rank.
        top_k: Maximum number of indices to return.

    Returns:
        A plain list of at most ``top_k`` indices, highest score first.
    """
    scores = similarity_matrix[relation_id]
    ascending_order = np.argsort(scores)
    best_first = ascending_order[::-1]
    return best_first[:top_k].tolist()

# Relation-to-relation similarity scores saved by stage 1.
similarity_matrix = np.load('result/icews14/stage_1/relation_similarity.npy')
relation_id = data.relation2id[test_query[1]]

# Most similar relations first, with the query relation itself as the last entry.
top_k_id = get_top_k_relations(similarity_matrix, relation_id) + [relation_id]
top_k_rels = [data.id2relation[idx] for idx in top_k_id]

print("\n".join(f"{idx}: {name}" for idx, name in zip(top_k_id, top_k_rels)))

256: inv_Make_pessimistic_comment
22: Make_optimistic_comment
252: inv_Make_optimistic_comment
33: Make_empathetic_comment
343: inv_Decline_comment
263: inv_Make_empathetic_comment
113: Decline_comment
0: Make_statement
204: Reject_economic_cooperation
8: Criticize_or_denounce
26: Make_pessimistic_comment


### Get all related rules

In [35]:
# Read the merged stage-2 rule table and hand every row to the rule learner.
rules_df = pd.read_csv('result/icews14/stage_2/01_only_Markovian_merged_results.csv')
for _, entry in rules_df.iterrows():
    # Each row is passed together with the dataset's rule regex loaded earlier.
    rl.create_rule_from_series_df(entry=entry, rule_regex=rule_regex)
# NOTE(review): this loop mutates `rl`; re-running the cell presumably registers
# the same rows a second time — confirm against RuleLearner before re-executing.
rules_dict = rl.rules_dict
print("Rules statistics:")
rules_statistics(rules_dict)

Rules statistics:
Number of relations with rules:  259
Total number of rules:  7609
Number of rules by length:  [(1, 1063), (2, 550), (3, 5996)]


In [36]:
def filter_related_rules(rules_dict, rel_id, top_k_id):
    """Select the verbalized rules of one head relation that use a relation of interest.

    Args:
        rules_dict: Mapping from head-relation id to a list of rule dicts. Each
            rule dict is read for its ``'body_rels'`` (iterable of relation ids)
            and ``'verbalized_rule'`` entries.
        rel_id: Head-relation id whose rules are inspected.
        top_k_id: Relation ids of interest.

    Returns:
        The ``'verbalized_rule'`` of every rule whose body contains at least one
        id from ``top_k_id``, in the original rule order. An empty list when
        ``rel_id`` has no rules at all (instead of raising ``KeyError``).
    """
    # Not every relation has learned rules; treat a missing key as "no rules".
    rules = rules_dict.get(rel_id, [])
    # Kept from the original cell: shows how many rules were considered.
    print(len(rules))

    # Build the lookup set once rather than once per rule.
    wanted_rels = set(top_k_id)
    return [
        rule['verbalized_rule']
        for rule in rules
        if not wanted_rels.isdisjoint(rule['body_rels'])
    ]


In [37]:
# Keep only those rules of the query relation whose body shares at least one
# relation with the top-k similar relations computed above.
related_rules = filter_related_rules(rules_dict, data.relation2id[test_query[1]], top_k_id)
len(related_rules)

22


11

In [38]:
related_rules

['Make_pessimistic_comment(X0,X2,T3)<-Express_intent_to_cooperate_economically(X0,X1,T0)&inv_Express_intent_to_cooperate_economically(X1,X0,T1)&Make_pessimistic_comment(X0,X2,T2)',
 'Make_pessimistic_comment(X0,X1,T1)<-Make_pessimistic_comment(X0,X1,T0)',
 'Make_pessimistic_comment(X0,X2,T2)<-Make_statement(X0,X1,T0)&Threaten(X1,X2,T1)',
 'Make_pessimistic_comment(X0,X2,T3)<-Conduct_hunger_strike(X0,X1,T0)&inv_Praise_or_endorse(X1,X0,T1)&Criticize_or_denounce(X0,X2,T2)',
 'Make_pessimistic_comment(X0,X2,T3)<-inv_Host_a_visit(X0,X1,T0)&inv_Express_intent_to_meet_or_negotiate(X1,X0,T1)&Make_pessimistic_comment(X0,X2,T2)',
 'Make_pessimistic_comment(X0,X2,T3)<-inv_Demand_change_in_leadership(X0,X1,T0)&inv_Praise_or_endorse(X1,X0,T1)&Criticize_or_denounce(X0,X2,T2)',
 'Make_pessimistic_comment(X0,X2,T2)<-inv_Use_unconventional_violence(X0,X1,T0)&inv_Make_optimistic_comment(X1,X2,T1)',
 'Make_pessimistic_comment(X0,X3,T3)<-Express_intent_to_meet_or_negotiate(X0,X1,T0)&inv_Use_conventional_m

### Get related facts

In [39]:
# Natural-language phrase for the query relation; used as the search text for
# the fact lookups in the following cells.
search_content = transformed_relations[test_query[1]]
search_content

'Made pessimistic comment'

In [40]:
def lookup_facts(vector_db, filter, search_content, top_k=20):
    """Query a vector store for documents matching ``search_content``.

    Thin wrapper around ``lookup_vector_db`` that supplies the notebook-level
    ``llm_instance``.

    Args:
        vector_db: Vector store handle forwarded to ``lookup_vector_db``.
        filter: Metadata filter forwarded unchanged (e.g. ``{"subject": name}``).
        search_content: Text to search for.
        top_k: Number of results requested.

    Returns:
        Whatever ``lookup_vector_db`` returns for these arguments.
    """
    return lookup_vector_db(
        search_content,
        filter,
        vector_db,
        llm_instance,
        top_k=top_k,
    )

In [54]:
def get_related_facts(search_content, vector_db, llm_instance, related_facts, candidates_dict,
                      seen_entities=None, seen_facts=None, n=0,
                      max_hops=3, fact_budget=20, verbose=True):
    """Collect facts by expanding outwards from the hop-``n`` candidate entities.

    For each hop, every not-yet-visited candidate is used as a ``subject``
    filter for ``lookup_facts``. New fact texts are appended to
    ``related_facts`` and the ``object`` of each new fact becomes a candidate
    for the next hop.

    Args:
        search_content: Search text forwarded to ``lookup_facts``.
        vector_db: Vector store handle forwarded to ``lookup_facts``.
        llm_instance: Accepted for interface compatibility; not used here
            (``lookup_facts`` reads the notebook-level ``llm_instance``).
        related_facts: List that is extended in place and returned.
        candidates_dict: ``{hop: set_of_entities}``; must contain key ``n``.
            Keys for the following hops are written by this function.
        seen_entities: Entities already expanded (created when ``None``).
        seen_facts: Fact texts already collected (created when ``None``).
        n: Hop to start from.
        max_hops: Expansion stops once this hop index is reached.
        fact_budget: Results requested per hop, split evenly across that
            hop's candidates.
        verbose: When true, print each expanded entity and its documents
            (the original behaviour).

    Returns:
        ``related_facts`` with the newly found fact texts appended.
    """
    # Create the tracking sets on the first call.
    if seen_entities is None:
        seen_entities = set()
    if seen_facts is None:
        seen_facts = set()

    for hop in range(n, max_hops):
        cands = candidates_dict[hop]
        next_cands = candidates_dict[hop + 1] = set()

        # No candidates on this hop means nothing to expand, so stop.
        if not cands:
            break

        # Split the budget across this hop's candidates, but never request
        # zero results: with more candidates than budget the integer division
        # yields 0, which a vector store presumably rejects or answers with
        # nothing.
        per_entity_k = max(1, fact_budget // len(cands))

        for can in cands:
            # Skip entities that were already expanded on an earlier hop.
            if can in seen_entities:
                continue
            seen_entities.add(can)

            subject_filter = {"subject": can}
            docs = lookup_facts(vector_db, subject_filter, search_content, top_k=per_entity_k)
            if verbose:
                print(can)
                print(docs)

            for doc in docs:
                fact_content = doc.page_content
                new_entity = doc.metadata['object']

                # Only facts not collected before contribute text and candidates.
                if fact_content in seen_facts:
                    continue
                seen_facts.add(fact_content)
                related_facts.append(fact_content)

                # Only entities not yet expanded are queued for the next hop.
                if new_entity not in seen_entities:
                    next_cands.add(new_entity)

        # Stop early when this hop produced no new candidates.
        if not next_cands:
            break

    return related_facts

In [55]:

# Hop 0 starts from the query subject only; get_related_facts writes the
# candidate sets for the following hops into this dict.
candidates_dict = {0: {test_query[0]}}    
# The empty list is the accumulator that the function appends to and returns.
related_facts = get_related_facts(search_content, vector_db['facts']['vector_db'], llm_instance, [], candidates_dict)


Business_(North_Korea)
[Document(metadata={'object': 'North_Korea', 'object_id': 11, 'relation': 'Make_statement', 'relation_id': 0, 'subject': 'Business_(North_Korea)', 'subject_id': 1888, 'timestamp': '2014-07-15', 'timestamp_id': 196}, page_content='"Business_(North_Korea) Made a statement to/with North_Korea on 2014-07-15"'), Document(metadata={'object': 'UN_Security_Council', 'object_id': 25, 'relation': 'inv_Impose_embargo,_boycott,_or_sanctions', 'relation_id': 282, 'subject': 'Business_(North_Korea)', 'subject_id': 1888, 'timestamp': '2014-08-05', 'timestamp_id': 217}, page_content='"Business_(North_Korea) Had embargo, boycott, or sanctions imposed to/with UN_Security_Council on 2014-08-05"'), Document(metadata={'object': 'North_Korea', 'object_id': 11, 'relation': 'inv_Express_intent_to_yield', 'relation_id': 367, 'subject': 'Business_(North_Korea)', 'subject_id': 1888, 'timestamp': '2014-09-22', 'timestamp_id': 265}, page_content='"Business_(North_Korea) Received intent to yi

In [56]:
candidates_dict

{0: {'Business_(North_Korea)'},
 1: {'Government_(United_States)',
  'Kim_Jong-Un',
  'North_Korea',
  'South_Korea',
  'UN_Security_Council'},
 2: {'Citizen_(North_Korea)',
  'Citizen_(South_Korea)',
  'Gerhard_Schröder',
  'Japan',
  'Kim_Jong-Un',
  'Military_Personnel_-_Special_(Afghanistan)',
  'Sergey_Viktorovich_Lavrov',
  'South_Korea',
  'South_Sudan',
  'Thailand'},
 3: {'Angela_Merkel',
  'China',
  'Citizen_(Thailand)',
  'France',
  'John_Kerry',
  'North_Atlantic_Treaty_Organization',
  'Ryoo_Kihl-jae'}}

In [57]:
related_facts

['"Business_(North_Korea) Made a statement to/with North_Korea on 2014-07-15"',
 '"Business_(North_Korea) Had embargo, boycott, or sanctions imposed to/with UN_Security_Council on 2014-08-05"',
 '"Business_(North_Korea) Received intent to yield to/with North_Korea on 2014-09-22"',
 '"Business_(North_Korea) Assassinated to/with Kim_Jong-Un on 2014-07-10"',
 '"Business_(North_Korea) Defied norms, law to/with North_Korea on 2014-02-20"',
 '"Business_(North_Korea) Had embargo, boycott, or sanctions imposed to/with Government_(United_States) on 2014-07-31"',
 '"Business_(North_Korea) Expressed intent to meet or negotiate to/with South_Korea on 2014-04-29"',
 '"North_Korea Made pessimistic comment to/with Citizen_(North_Korea) on 2014-05-29"',
 '"North_Korea Made pessimistic comment to/with Citizen_(North_Korea) on 2014-05-28"',
 '"North_Korea received a pessimistic comment from to/with Kim_Jong-Un on 2014-01-07"',
 '"North_Korea Made pessimistic comment to/with Japan on 2014-05-28"',
 '"Gov

### Get the most related entities and their facts


In [58]:
import numpy as np

def get_top_k_entities(similarity_matrix, entity_id, top_k=10):
    """Return the column indices with the highest scores in one matrix row.

    Args:
        similarity_matrix: 2-D array indexed as ``[entity_id][other_id]``.
        entity_id: Row to rank.
        top_k: Maximum number of indices to return.

    Returns:
        A plain list of at most ``top_k`` indices, highest score first.
    """
    scores = similarity_matrix[entity_id]
    ranked_ids = np.argsort(scores)[::-1]
    return ranked_ids[:top_k].tolist()

# Use entity-specific names here. Reusing `similarity_matrix` / `top_k_id`
# would overwrite the relation-level values that the rule-filtering cells read,
# so re-running those cells after this one would silently use entity ids.
entity_similarity_matrix = np.load('./entity_similarity.npy')
top_k_entity_ids = get_top_k_entities(entity_similarity_matrix, data.entity2id[test_query[0]])

# Translate the ranked entity ids back to entity names.
related_entities = [data.id2entity[entity_id] for entity_id in top_k_entity_ids]

In [59]:
# Collect, for every related entity, the fact texts of up to 10 documents that
# have this entity as subject and match the query relation phrase.
facts_store = vector_db['facts']['vector_db']
related_entity_facts = []
for ent in related_entities:
    # Deliberately not called `filter`: a notebook-level assignment to that
    # name would shadow the builtin `filter` for every cell executed afterwards.
    subject_filter = {"subject": ent}
    docs = lookup_facts(facts_store, subject_filter, search_content, top_k=10)
    related_entity_facts.extend(doc.page_content for doc in docs)

related_entity_facts

['"Finance_/_Economy_/_Commerce_/_Trade_Ministry_(North_Korea) was hosted by to/with Angola on 2014-05-23"',
 '"Finance_/_Economy_/_Commerce_/_Trade_Ministry_(North_Korea) Made a visit to/with Angola on 2014-05-23"',
 '"Business_(South_Korea) was criticized or denounced by to/with Ministry_(South_Korea) on 2014-07-15"',
 '"Business_(South_Korea) was criticized or denounced by to/with Ministry_(South_Korea) on 2014-07-16"',
 '"Business_(South_Korea) Made a statement to/with Barack_Obama on 2014-03-15"',
 '"Business_(South_Korea) was hosted by to/with North_Korea on 2014-04-21"',
 '"Business_(South_Korea) was hosted by to/with North_Korea on 2014-04-28"',
 '"Business_(South_Korea) was hosted by to/with North_Korea on 2014-04-22"',
 '"Business_(South_Korea) Consulted to/with Head_of_Government_(South_Korea) on 2014-11-14"',
 '"Business_(South_Korea) received a statement from to/with Energy_Department/Ministry_(Philippines) on 2014-06-17"',
 '"Business_(South_Korea) received a statement fr

### Prompt version 1

In [60]:
# System prompt for the object-prediction task.
# Plain (non-f) string: it has no placeholders, so the JSON braces need no escaping.
# Text fixes relative to the earlier version: closed the unbalanced quote in the
# "inv_" example, replaced a dangling comma with a period, fixed "especially".
# NOTE(review): the "Secondary Information" section announces facts between the
# related entities and the subject — confirm the user message really supplies them.
system_msg_content = '''
You are an expert in Temporal Knowledge Graphs, utilizing data consisting of events and activities worldwide involving countries, organizations, famous individuals, etc.
Your task is Temporal Knowledge Graph Reasoning, which involves predicting the missing object in a given fact from the test dataset. A fact is represented as a quadruple: subject, relation, object, and time.

In this context:
- "subject" is the entity mentioned in the query
- "relation" is an action/event performed by the subject
- "object" is the entity you need to infer through reasoning
- "timestamp": The temporal aspect of the fact

To support your reasoning process in finding the missing object, you will be provided with relevant facts:
1. Primary Information:
- A sequence of events (facts), known as "Reasoning Paths", related to the query's subject and relation
- Learned rules from training and validation datasets that represent patterns which events typically follow. An important note is relations with the "inv_" prefix (e.g., "inv_make_statement") indicate passive relations. For example: inv_make_statement(B,A,T) means "B receives a statement from A at time T."
Using this source of information, your task is to infer the missing "object" through multi-hop reasoning.
2. Secondary Information:
- Most related entities to the entity subject
- Facts between these related entities with the entity subject.
This information aids your reasoning process, especially because:
- These entities can be candidates because they might have relationships with the entity subject, which are also similar to the relationship between the entity subject and the missing object.
3. Additional Information:
- The facts of these related entities.
This information is useful especially when there are no facts about the entity subject in the past. So, patterns from these similar entities might apply to the entity subject to infer the missing object.

You should follow these reasoning Process Guidelines:
1. Primary Analysis:
   - Analyze direct reasoning paths connecting to the subject
   - Match and apply relevant rules to existing facts
   - Identify temporal patterns and their significance
   - Evaluate the strength of direct evidence

2. Multi-hop Reasoning:
   - Consider indirect connections through intermediate entities
   - Evaluate path length and relevance
   - Consider temporal sequence of connected facts
   - Weight evidence based on path length and temporal proximity

3. Similar Entity Analysis:
   - Consider most related entities as potential candidates because they might have relationships with the entity subject in the past.
   - Examine patterns from semantically similar entities
   - Apply successful patterns from similar entities

Finally, for the candidate selection criteria:
1. Evidence Strength:
   - Direct path evidence (highest weight)
   - Rule application matches
   - Multi-hop reasoning paths
   - Related entity patterns

2. Temporal Relevance:
   - Recency of connections
   - Pattern consistency over time
   - Temporal proximity to query time

3. Confidence Scoring:
   - Direct evidence: High confidence
   - Rule-based inference: Medium-high confidence
   - Related entity and their patterns: Medium confidence
   - Multi-hop paths: Weighted by path length

Your answer should be in the following JSON format:
{
    "candidates": // An ordered list of up to 10 candidates, from highest to lowest likelihood of being the correct answer.
                  // Each candidate should be an entity name exactly as it appears in the given facts.
                  // The list should be ordered by decreasing probability of being the correct answer.
}
'''
# Wrap the system prompt text in a system-role message.
system_msg = SystemMessage(content=system_msg_content)

# User message carrying the retrieved evidence. The four collections are
# interpolated directly, so lists appear in the prompt as their Python repr.
# NOTE(review): the "secondary information" header announces facts between the
# related entities and the subject, but only `related_entities` is interpolated
# there — confirm whether those facts should be added.
user_msg_content = f'''
For the primary information:
- Here are facts related to the query's subject and relation:
{related_facts}
- Here are the learned rules related to query's relation:
{related_rules}

For the secondary information:
- Here are the most related entities to the entity "subject" and the facts between these related entities and the entity "subject":
   + {related_entities}

For the additional information:
- Here are the facts of these related entities:
   + {related_entity_facts}
'''
# Wrap the evidence text in a human-role message.
user_msg = HumanMessage(content=user_msg_content)

In [61]:
print(user_msg_content)


For the primary information:
- Here are facts related to the query's subject and relation:
['"Business_(North_Korea) Made a statement to/with North_Korea on 2014-07-15"', '"Business_(North_Korea) Had embargo, boycott, or sanctions imposed to/with UN_Security_Council on 2014-08-05"', '"Business_(North_Korea) Received intent to yield to/with North_Korea on 2014-09-22"', '"Business_(North_Korea) Assassinated to/with Kim_Jong-Un on 2014-07-10"', '"Business_(North_Korea) Defied norms, law to/with North_Korea on 2014-02-20"', '"Business_(North_Korea) Had embargo, boycott, or sanctions imposed to/with Government_(United_States) on 2014-07-31"', '"Business_(North_Korea) Expressed intent to meet or negotiate to/with South_Korea on 2014-04-29"', '"North_Korea Made pessimistic comment to/with Citizen_(North_Korea) on 2014-05-29"', '"North_Korea Made pessimistic comment to/with Citizen_(North_Korea) on 2014-05-28"', '"North_Korea received a pessimistic comment from to/with Kim_Jong-Un on 2014-01-

In [62]:
# Send the system and user messages to the LLM wrapper. In the recorded run the
# result is a dict with a 'candidates' list of entity names.
answer_llm = llm_instance.run_task([system_msg, user_msg])

In [63]:
answer_llm

{'candidates': ['Citizen_(North_Korea)',
  'Kim_Jong-Un',
  'North_Korea',
  'Business_(South_Korea)',
  'Military_(North_Korea)',
  'Finance_/_Economy_/_Commerce_/_Trade_Ministry_(North_Korea)',
  'Ministry_(North_Korea)',
  'Business_(Cuba)',
  'Employee_(North_Korea)',
  'Business_(Norway)']}

In [51]:
related_entities

['Finance_/_Economy_/_Commerce_/_Trade_Ministry_(North_Korea)',
 'Business_(South_Korea)',
 'Bank_(North_Korea)',
 'Employee_(North_Korea)',
 'Business_(Cuba)',
 'Military_(North_Korea)',
 'Business_(Norway)',
 'North_Korea',
 'Religion_(North_Korea)',
 'Ministry_(North_Korea)']