In [None]:
"""
Copyright 2024 Muhammad Khalid Habiburahman

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

In [None]:
import numpy as np
import pandas as pd
from rdflib import Graph
from sklearn.model_selection import train_test_split

# Let displayed cells show up to 200 characters before pandas truncates them
pd.options.display.max_colwidth = 200
# Line magic: render matplotlib figures inline, below the cell that produces them
%matplotlib inline

# Mounting My Google Drive

In [None]:
# Colab-only step: attach Google Drive under /content/drive so the dataset files can be reached
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Line magic: switch the working directory to the project folder on the mounted drive,
# so the relative "Dataset/..." paths used below resolve against it
%cd "/content/drive/My Drive/Tugas Akhir/Code"

/content/drive/My Drive/Tugas Akhir/Code


# Load Data

In [None]:
# Path (relative to the working directory) of the Turtle file holding the employee knowledge graph
rdf_file_path = "Dataset/kg_tsel/kg_tsel_emp.ttl"

In [None]:
# Start with an empty rdflib graph; it is filled by the parse step in the next cell
rdf_graph = Graph()

In [None]:
# Read the file at rdf_file_path into the graph. parse() is the last expression of the cell,
# so its return value is what the notebook displays.
rdf_graph.parse(rdf_file_path, format='turtle')  # pass another `format` (e.g. 'xml', 'nt') if the file is not Turtle

<Graph identifier=Nbfdacbd6b2bd4b0cbe026feee9c9dc5c (<class 'rdflib.graph.Graph'>)>

In [None]:
# Sanity check after loading: len() of the graph is its number of triples
print(f"Number of triples in the RDF graph: {len(rdf_graph)}")

Number of triples in the RDF graph: 235171


In [None]:
# Build a three-column frame from the graph: every item yielded by iterating the graph
# becomes one row, labelled head / relation / tail.
# NOTE(review): row order follows the graph's iteration order — confirm it is stable across
# kernel restarts, because the seeded split further down depends on the row order.
kg_df = pd.DataFrame(rdf_graph, columns=['head', 'relation', 'tail'])

In [None]:
# Display the raw triples (values are still full URIs with percent-escapes at this point)
kg_df

Unnamed: 0,head,relation,tail
0,http://example.com/base/Linda_Pramesti,http://example.com/base/has%20skill,http://example.com/base/software_development
1,http://example.com/base/Dulkipli_Rizal,http://example.com/base/has%20skill,http://example.com/base/advertising
2,http://example.com/base/Randy_Galawana,http://example.com/base/work%20for,http://example.com/base/Institut_Teknologi_Telkom
3,http://example.com/base/Mohamad_Ichsan,http://example.com/base/graduated%20from,http://example.com/base/University_of_Indonesia
4,http://example.com/base/Rory_Lasweny,http://example.com/base/has%20skill,http://example.com/base/customer_relations
...,...,...,...
235166,http://example.com/base/Rocky_Raylendra,http://example.com/base/has%20skill,http://example.com/base/corporate_branding
235167,http://example.com/base/Kiki_Moch_Jaenudin%2C_S.IP,http://example.com/base/has%20skill,http://example.com/base/sales
235168,http://example.com/base/Adhi_Kurniadhi,http://example.com/base/lives%20in,http://example.com/base/Indonesia
235169,http://example.com/base/Lely_Purwa,http://example.com/base/has%20skill,http://example.com/base/Administration


# Data Cleaning

In [None]:
# Namespace shared by the URIs in the graph; it is stripped so only the local name remains.
BASE_URI = 'http://example.com/base/'

# Percent-escapes that are rewritten. Any other escape (e.g. %2C) is left untouched.
_ESCAPES = (('%20', '_'), ('%28', '('), ('%29', ')'))


def _clean_term(term):
    """Turn one cell of the triple table into a short, readable label.

    Parameters
    ----------
    term : object
        A cell value from ``kg_df``; it is converted with ``str()`` first.

    Returns
    -------
    str
        The value without the leading ``BASE_URI`` (only when it actually starts
        with it) and with ``%20`` -> ``_``, ``%28`` -> ``(``, ``%29`` -> ``)``.
    """
    text = str(term)
    # Strip the namespace only when it is present. A blind fixed-width slice would
    # silently eat the first characters of any value outside the base namespace.
    if text.startswith(BASE_URI):
        text = text[len(BASE_URI):]
    for escaped, plain in _ESCAPES:
        text = text.replace(escaped, plain)
    return text


# Column-wise Series.map replaces the deprecated DataFrame.applymap and cleans
# every cell in a single pass instead of four.
kg_df = kg_df.apply(lambda column: column.map(_clean_term))

In [None]:
# Spot-check one cleaned row by position. With the 235171 triples reported when the
# graph was loaded, position 235170 is the last row.
# NOTE(review): the hard-coded position raises IndexError if the dataset has fewer rows;
# `kg_df.iloc[-1]` would always pick the last row — confirm that is the intent.
kg_df.iloc[235170]

head        Rina_Juliana
relation        lives_in
tail           West_Java
Name: 235170, dtype: object

# Data Splitting

In [None]:
# Three-way split of the triples: 70% train, 15% validation, 15% test.
# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
train_triples, holdout_triples = train_test_split(
    kg_df,
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the held-out rows in half, one half for testing and one for validation.
test_triples, valid_triples = train_test_split(
    holdout_triples,
    test_size=0.5,
    random_state=42,
)

In [None]:
def _entity_list(triples):
    """Return the distinct values found in the head or tail column of `triples`, as a list."""
    return list(set(triples["head"]) | set(triples["tail"]))


# Distinct entities (anything appearing as head or tail) in each split
train_entities = _entity_list(train_triples)
valid_entities = _entity_list(valid_triples)
test_entities = _entity_list(test_triples)

# Distinct relation labels in each split
train_relations = train_triples["relation"].unique()
valid_relations = valid_triples["relation"].unique()
test_relations = test_triples["relation"].unique()

In [None]:
def _print_split_row(label, entities, relations, triples):
    """Print one summary row: label, entity count, relation count, triple count."""
    print(label, len(entities), len(relations), len(triples))


# Header, then one row per split (E = entities, R = relations)
print('      |   E   R Triples')
print('-----------------------')
_print_split_row('Train :', train_entities, train_relations, train_triples)
_print_split_row('Valid :', valid_entities, valid_relations, valid_triples)
_print_split_row('Test  :', test_entities, test_relations, test_triples)

      |   E   R Triples
-----------------------
Train : 53789 8 164619
Valid : 20732 8 35276
Test  : 20703 8 35276


# Export Split Employee KG Data

In [None]:
def _write_triples(triples, path):
    """Write `triples` to `path` as tab-separated text, without index column or header row."""
    triples.to_csv(path, sep="\t", index=False, header=False)


# One file per split, next to the source graph
_write_triples(train_triples, "Dataset/kg_tsel/train.txt")
_write_triples(valid_triples, "Dataset/kg_tsel/valid.txt")
_write_triples(test_triples, "Dataset/kg_tsel/test.txt")