### Date mapping

In [None]:
import json
from datetime import datetime, timedelta  

class DateMapper:
    """Two-way lookup between the calendar dates of one year and 1-based day IDs.

    Dates are ``'YYYY-MM-DD'`` strings. January 1st maps to ID 1 and the last
    day of the year maps to 365 (366 in a leap year).
    """

    def __init__(self, year=2014):
        """Build both lookup tables for ``year``.

        Args:
            year: Calendar year to cover. Defaults to 2014.
        """
        self.year = year
        self.start_date = datetime(year, 1, 1)
        self.date_to_id_map = {}
        self.id_to_date_map = {}
        self._build_mappings()

    def _build_mappings(self):
        """Fill ``date_to_id_map`` and ``id_to_date_map`` for every day of ``self.year``."""
        current_date = self.start_date
        day_id = 1
        # Walk forward one day at a time until the date leaves the target year,
        # so leap and non-leap years both get exactly the right number of IDs.
        while current_date.year == self.year:
            date_str = current_date.strftime('%Y-%m-%d')
            self.date_to_id_map[date_str] = day_id
            self.id_to_date_map[day_id] = date_str
            current_date += timedelta(days=1)
            day_id += 1

    def date_to_id(self, date_str):
        """Return the day ID for a ``'YYYY-MM-DD'`` date string.

        Raises:
            ValueError: If ``date_str`` is not a date of ``self.year`` in the
                expected format.
        """
        try:
            return self.date_to_id_map[date_str]
        except KeyError:
            # `from None` hides the internal KeyError, which adds nothing for the caller.
            raise ValueError(
                f"Invalid date or date not in year {self.year}: {date_str}"
            ) from None

    def id_to_date(self, day_id):
        """Return the ``'YYYY-MM-DD'`` date string for a day ID.

        Raises:
            ValueError: If ``day_id`` is not a valid ID for ``self.year``.
        """
        try:
            return self.id_to_date_map[day_id]
        except KeyError:
            # Report the real upper bound: 365 for common years, 366 for leap years.
            last_id = len(self.id_to_date_map)
            raise ValueError(
                f"Invalid ID {day_id!r}. Must be between 1 and {last_id}"
            ) from None

    def convert_dates(self, dates):
        """Convert an iterable of date strings to a list of day IDs.

        Raises:
            ValueError: If any date is not part of ``self.year``.
        """
        return [self.date_to_id(date) for date in dates]

    def save_to_json(self, filename):
        """Write the date -> ID mapping to ``filename`` as indented JSON."""
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(self.date_to_id_map, json_file, indent=4)

# Usage: build the two-way date/ID mapping for 2014
mapper = DateMapper(year=2014)

# Persist the date -> ID mapping as a JSON file
mapper.save_to_json(filename='date_to_id_mapping_2014.json')

{'2014-01-01': 1, '2014-01-02': 2, '2014-01-03': 3, '2014-01-04': 4, '2014-01-05': 5, '2014-01-06': 6, '2014-01-07': 7, '2014-01-08': 8, '2014-01-09': 9, '2014-01-10': 10, '2014-01-11': 11, '2014-01-12': 12, '2014-01-13': 13, '2014-01-14': 14, '2014-01-15': 15, '2014-01-16': 16, '2014-01-17': 17, '2014-01-18': 18, '2014-01-19': 19, '2014-01-20': 20, '2014-01-21': 21, '2014-01-22': 22, '2014-01-23': 23, '2014-01-24': 24, '2014-01-25': 25, '2014-01-26': 26, '2014-01-27': 27, '2014-01-28': 28, '2014-01-29': 29, '2014-01-30': 30, '2014-01-31': 31, '2014-02-01': 32, '2014-02-02': 33, '2014-02-03': 34, '2014-02-04': 35, '2014-02-05': 36, '2014-02-06': 37, '2014-02-07': 38, '2014-02-08': 39, '2014-02-09': 40, '2014-02-10': 41, '2014-02-11': 42, '2014-02-12': 43, '2014-02-13': 44, '2014-02-14': 45, '2014-02-15': 46, '2014-02-16': 47, '2014-02-17': 48, '2014-02-18': 49, '2014-02-19': 50, '2014-02-20': 51, '2014-02-21': 52, '2014-02-22': 53, '2014-02-23': 54, '2014-02-24': 55, '2014-02-25': 56, 

### Add test_inv for each dataset

In [8]:
import pandas as pd

columns = ['subject', 'relation', 'object', 'timestamp']
df = pd.read_csv('../datasets/icews18/test.txt', sep='\t', names=columns)

# Inverse facts: subject and object swapped, relation prefixed with 'inv_'.
df_inv = df.assign(
    subject=df['object'],
    object=df['subject'],
    relation='inv_' + df['relation'],
)

# Original facts first, inverse facts appended after them.
df = pd.concat([df, df_inv], ignore_index=True)
df.to_csv('../datasets/icews18/test.txt.inv', sep='\t', header=False, index=False)


### Modify JSON output data

In [3]:
import os
# Show the working directory before it is changed.
print(f"Current working folder: {os.getcwd()}")

# NOTE(review): hard-coded, machine-specific absolute path; os.chdir raises if
# this directory does not exist on the current machine.
os.chdir('D:/My Document/Khóa Luận Tốt Nghiệp/Model Reposity/RAG_LLM_DA')
print(f"Current working folder: {os.getcwd()}")

# Imported after the chdir above — presumably so the project-local `utils`
# module is resolved from the new working directory (confirm).
from utils import load_json_data, save_json_data

Current working folder: D:\My Document\Khóa Luận Tốt Nghiệp\Model Reposity\RAG_LLM_DA
Current working folder: D:\My Document\Khóa Luận Tốt Nghiệp\Model Reposity\RAG_LLM_DA


In [6]:
x = load_json_data('result/icews14/stage_4/final_candidates.json')
# Drop the entries keyed '7371' through '14741' (inclusive); a missing key raises KeyError.
for key in map(str, range(7371, 14742)):
    del x[key]
save_json_data(x, './final_candidates.json')

Loading data from result/icews14/stage_4/final_candidates.json
Data has been converted to JSON and saved to ./final_candidates.json


### Process raw dataset

In [1]:
import os
# Show the working directory the cell starts in.
print(f"Current working folder: {os.getcwd()}")

# NOTE(review): hard-coded, machine-specific absolute path; os.chdir raises if
# this directory does not exist on the current machine.
os.chdir('C:/Users/UHY6HC/Downloads/Thesis/RAG_LLM_DA')
print(f"Current working folder: {os.getcwd()}")

# Imported after the chdir above — presumably so the project-local `utils`
# module is resolved from the new working directory (confirm).
from utils import load_json_data, write_lines_to_file, read_lines_from_file, save_json_data
import pandas as pd
# NOTE(review): duplicate of the `import os` at the top of this cell.
import os

# Dataset folders, relative to the working directory set above.
source_data = "datasets/YAGO"
# NOTE(review): not referenced by any of the cells visible in this section.
destination_data = "datasets/icews14"

Current working folder: c:\Users\UHY6HC\Downloads\Thesis\RAG_LLM_DA\notebooks
Current working folder: C:\Users\UHY6HC\Downloads\Thesis\RAG_LLM_DA


In [2]:
# Write the keys of entity2id.json to entities.txt via the project helper.
entities_path = os.path.join(source_data, 'entities.txt')
entity2id = load_json_data(os.path.join(source_data, 'entity2id.json'))
entity_list = [*entity2id.keys()]
write_lines_to_file(entities_path, entity_list)

Loading data from datasets/YAGO\entity2id.json


In [3]:
ts2id = load_json_data(os.path.join(source_data, 'ts2id.json'))
# Prefix every timestamp key with "T_". This cell overwrites the file it reads,
# so keys that already carry the prefix are kept unchanged; re-running the cell
# therefore does not produce "T_T_..." keys.
new_ts2id = {
    (k if str(k).startswith("T_") else f"T_{k}"): v
    for k, v in ts2id.items()
}
save_json_data(new_ts2id, os.path.join(source_data, 'ts2id.json'))

Loading data from datasets/YAGO\ts2id.json
Data has been converted to JSON and saved to datasets/YAGO\ts2id.json


In [4]:
columns = ['subject', 'relation', 'object', 'timestamp', 'temp']
# Rewrite each split file in place without its fifth ('temp') column.
for split_file in ('train_.txt', 'valid_.txt', 'test_.txt'):
    split_path = os.path.join(source_data, split_file)
    df = pd.read_csv(split_path, sep='\t', header=None, names=columns).drop(columns='temp')
    df.to_csv(split_path, sep='\t', header=None, index=False)

In [5]:
columns = ['subject', 'relation', 'object', 'timestamp']
df = pd.read_csv(os.path.join(source_data, 'all_facts.txt'), sep='\t', header=None, names=columns)

# Prefix every timestamp with "T_". Done as one vectorised string operation
# instead of a Python-level loop over rows; the resulting strings are the same.
df['timestamp'] = 'T_' + df['timestamp'].astype(str)
df.head()

Unnamed: 0,subject,relation,object,timestamp
0,Balleroy,Instance_of,Communes_In_France,T_0
1,Selman_Waksman,Educated_at,Rutger'S_University,T_0
2,Charles_Macintosh,Country_of_citizenship,United_Kingdom_Of_Great_Britain,T_0
3,Montigny-Le-Gannelon,Instance_of,Communes_In_France,T_0
4,France_Metropolitan,Contains_administrative_territorial_entity,Nord-Pas_De_Calais,T_0


In [7]:
# The first 220393 rows of `df` are written out as train.txt.
train_df = df.head(220393)
train_path = os.path.join(source_data, 'train.txt')
train_df.to_csv(train_path, sep='\t', header=None, index=False)

In [8]:
# Rows 220393 (inclusive) to 249341 (exclusive) of `df` are written out as valid.txt.
valid_rows = slice(220393, 249341)
valid_df = df.iloc[valid_rows]
valid_path = os.path.join(source_data, 'valid.txt')
valid_df.to_csv(valid_path, sep='\t', header=None, index=False)

In [10]:
# The first 249341 rows of `df` are written out as facts.txt.
train_valid_df = df.iloc[:249341]
train_valid_df.to_csv(os.path.join(source_data, 'facts.txt'), sep='\t', header=None, index=False)

# Inverse facts: subject and object swapped, relation prefixed with "inv_".
# Built with vectorised column operations rather than a per-row loop.
inv_only_df = train_valid_df.assign(
    subject=train_valid_df['object'],
    object=train_valid_df['subject'],
    relation='inv_' + train_valid_df['relation'].astype(str),
)

# Interleave so each original fact is immediately followed by its inverse.
# Both frames share the same index labels, and a stable sort keeps the
# original row ahead of the inverse row for every label.
inv_fact_df = pd.concat([train_valid_df, inv_only_df]).sort_index(kind='mergesort')
inv_fact_df.to_csv(os.path.join(source_data, 'facts.txt.inv'), sep='\t', header=None, index=False)

In [11]:
# Every row of `df` from position 249341 onward is written out as test.txt.
test_df = df.iloc[249341:]
test_df.to_csv(os.path.join(source_data, 'test.txt'), sep='\t', header=None, index=False)

# Inverse facts: subject and object swapped, relation prefixed with "inv_".
# Built with vectorised column operations rather than a per-row loop.
inv_only_test_df = test_df.assign(
    subject=test_df['object'],
    object=test_df['subject'],
    relation='inv_' + test_df['relation'].astype(str),
)

# Interleave so each original fact is immediately followed by its inverse.
# Both frames share the same index labels, and a stable sort keeps the
# original row ahead of the inverse row for every label.
inv_test_df = pd.concat([test_df, inv_only_test_df]).sort_index(kind='mergesort')
inv_test_df.to_csv(os.path.join(source_data, 'test.txt.inv'), sep='\t', header=None, index=False)

In [12]:
# Write the keys of relation2id.json to relations.txt via the project helper.
relations_path = os.path.join(source_data, 'relations.txt')
relation2id = load_json_data(os.path.join(source_data, 'relation2id.json'))
relation_list = [*relation2id.keys()]
write_lines_to_file(relations_path, relation_list)

Loading data from datasets/YAGO\relation2id.json
