In [6]:
import re
import random
from collections import defaultdict

def split_list(data):
    random.shuffle(data)
    split_index = int(len(data) * 0.7)
    list_30 = data[:split_index]
    list_70 = data[split_index:]
    return list_30, list_70

def extract_queries(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    queries = [q.strip() for q in content.split('---') if q.strip()]
    return queries

def extract_candidate_seed_nodes(queries, min_count=10):
    node_query_count = defaultdict(set)
    block_pattern = re.compile(r'CONSTRUCT\s*\{(.*?)\}\s*WHERE\s*\{(.*?)\}', re.DOTALL)

    for idx, query in enumerate(queries):
        match = block_pattern.search(query)
        if not match:
            continue
        blocks = match.groups()
        for block in blocks:
            triples = block.strip().split(' .')
            for triple in triples:
                tokens = triple.strip().split()
                for token in tokens:
                    if token and not token.startswith('?'):
                        node_query_count[token].add(idx)

    frequent_nodes = {node for node, qset in node_query_count.items() if len(qset) > min_count}
    return frequent_nodes

def get_filtered_queries(queries, seed_nodes):
    filtered = []
    for q in queries:
        if any(seed in q for seed in seed_nodes):
            filtered.append(q)
    return filtered

seed_node_set = extract_candidate_seed_nodes(train_data)
filtered_queries = get_filtered_queries(train_data, seed_nodes)

print("Sample seed nodes:", seed_nodes)
print("Number of filtered queries:", len(filtered_queries))


Sample seed nodes: ['<http://purl.org/ontology/mo/release>', 'sorg:text', '<http://schema.org/editor>', '<http://purl.org/goodrelations/v1#name>', 'sorg:language', '<http://purl.org/goodrelations/v1#validThrough>', 'sorg:expires', '<http://schema.org/author>', '<http://schema.org/language>', 'wsdbm:subscribes']
Number of filtered queries: 1634


In [None]:
!pip uninstall numpy scipy scikit-learn -y
!pip install numpy scipy scikit-learn --upgrade --force-reinstall

from sklearn.model_selection import train_test_split

Found existing installation: numpy 2.2.4

You can safely remove it manually.
You can safely remove it manually.
You can safely remove it manually.
You can safely remove it manually.



Uninstalling numpy-2.2.4:
  Successfully uninstalled numpy-2.2.4
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
Found existing installation: scikit-learn 1.4.2
Uninstalling scikit-learn-1.4.2:
  Successfully uninstalled scikit-learn-1.4.2
Collecting numpy
  Using cached numpy-2.2.4-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scipy
  Downloading scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 162.5 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/60.8 kB 245.8 kB/s eta 0:00:01
     -------------------------------- ----- 51.2/60.8 kB 290.5 kB/s eta 0:00:01
     -------------------------------- ----- 51.2/60.8 kB 290.5 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 20

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.4 which is incompatible.
numba 0.59.1 requires numpy<1.27,>=1.22, but you have numpy 2.2.4 which is incompatible.
pywavelets 1.5.0 requires numpy<2.0,>=1.22.4, but you have numpy 2.2.4 which is incompatible.
streamlit 1.32.0 requires numpy<2,>=1.19.3, but you have numpy 2.2.4 which is incompatible.

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recen

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON, TURTLE
import pandas as pd
import os
import threading
import time
from rdflib import Graph
import traceback

SPARQL_ENDPOINT = "http://localhost:3030/#/dataset/FREEBASE/query"
output_directory = r"C:\Users\sohel\Downloads\FOLDER"
os.makedirs(output_directory, exist_ok=True)

def read_queries_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df.iloc[:, 0].tolist()

def save_results_to_csv(df, query_index, output_dir):
    output_file = os.path.join(output_dir, f"query_result_{query_index}.csv")
    df.to_csv(output_file, index=False)
    print(f"SELECT query saved as CSV: {output_file}")

def execute_sparql_query(query, query_index, output_dir):
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    is_construct_query = query.strip().upper().startswith("CONSTRUCT")
    sparql.setReturnFormat(TURTLE if is_construct_query else JSON)

    try:
        results = sparql.query().convert()

        if is_construct_query:
            try:
                if not results or (isinstance(results, bytes) and not results.strip()):
                    raise ValueError("Empty RDF data returned from SPARQL endpoint.")
                g = Graph()
                g.parse(data=results.decode("utf-8"), format="turtle")
                rdf_file = os.path.join(output_dir, f"query_result_{query_index}.ttl")
                g.serialize(destination=rdf_file, format="turtle")
                print(f"CONSTRUCT query saved as Turtle: {rdf_file}")
            except UnboundLocalError as ule:
                print(f"RDF parsing failed: {ule}")
            except Exception as e:
                print(f"General RDF parse error: {e}")
                print(f"Preview of raw RDF:\n{results[:300]}")
            return None

        if "results" not in results or "bindings" not in results["results"]:
            return pd.DataFrame()

        variables = results["head"]["vars"]
        data = [
            {var: result[var]["value"] if var in result else None for var in variables}
            for result in results["results"]["bindings"]
        ]
        return pd.DataFrame(data)

    except Exception as e:
        print("Exception during SPARQL execution:")
        print(repr(e))
        traceback.print_exc()
        return pd.DataFrame()

class TimeoutException(Exception):
    pass

def execute_with_timeout(func, args=(), timeout=300):
    result = [None]
    exception = [None]

    def wrapper():
        try:
            result[0] = func(*args)
        except Exception as e:
            exception[0] = e

    thread = threading.Thread(target=wrapper)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        print("Query execution exceeded time limit")
        return None

    if exception[0]:
        raise exception[0]

    return result[0]

if __name__ == "__main__":
    queries = train_data
    start_index = 0

    for idx in range(start_index, len(queries)):
        query = queries[idx]
        print(f"Executing query {idx + 1}/{len(queries)}:\n{query}\n")

        start_time = time.time()
        try:
            result_df = execute_with_timeout(
                execute_sparql_query,
                args=(query, idx, output_directory),
                timeout=300
            )
            elapsed = time.time() - start_time
            print(f"Query {idx + 1} executed in {elapsed:.2f} seconds")

            if result_df is not None and not result_df.empty:
                save_results_to_csv(result_df, idx, output_directory)
            else:
                print(f"No tabular results for query {idx + 1}")

        except Exception as e:
            print(f"Error in query {idx + 1}: {e}")
            traceback.print_exc()


In [None]:
import os
import rdflib

input_folder = r"C:\Users\sohel\Downloads\FOLDER"
output_folder = r"C:\Users\sohel\Downloads\FOLDER1"
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    if filename.endswith(".ttl"):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename.replace(".ttl", ".nt"))

        if os.path.getsize(input_path) > 0:
            graph = rdflib.Graph()
            graph.parse(input_path, format="turtle")
            graph.serialize(destination=output_path, format="nt")
            print(f"Converted: {input_path} -> {output_path}")
        else:
            print(f"Skipped empty file: {input_path}")

print("Conversion complete!")




Converted: C:\Users\sohel\Downloads\Q_output5\query_result_0.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_0.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_10.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_10.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_100.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_100.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_101.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_101.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_102.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_102.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_103.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_103.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_104.ttl -> C:\Users\sohel\Downloads\Q_output_converted5\query_result_104.nt
Converted: C:\Users\sohel\Downloads\Q_output5\query_result_10

In [None]:
from rdflib import Graph
import pandas as pd
import os

NT_FILES_DIRECTORY = r"C:\Users\sohel\Downloads\FOLDER2"
triple_scores = {}

def process_nt_file(nt_file):
    print(f"Processing file: {nt_file}")
    g = Graph()
    g.parse(nt_file, format="nt")
    num_triples = len(g)

    if num_triples == 0:
        return

    score_per_triple = 1 / num_triples
    new_triple_scores = {}

    for s, p, o in g:
        triple = (str(s), str(p), str(o))
        if triple in triple_scores:
            triple_scores[triple] += score_per_triple
        else:
            new_triple_scores[triple] = score_per_triple

    triple_scores.update(new_triple_scores)

nt_files = [os.path.join(NT_FILES_DIRECTORY, f) for f in os.listdir(NT_FILES_DIRECTORY) if f.endswith(".nt")]
for nt_file in nt_files:
    process_nt_file(nt_file)

triple_df = pd.DataFrame([(s, p, o, score) for (s, p, o), score in triple_scores.items()],
                         columns=["Subject", "Predicate", "Object", "Score"])
print(triple_df.head())


✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_0.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_10.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_100.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_101.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_102.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_103.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_104.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_107.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_108.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_109.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_11.nt
✅ Processing file: C:\Users\sohel\Downloads\Q_output_converted5\query_result_111

In [None]:
triple_df['prob']=triple_df['Score']/triple_df['Score'].sum()

In [None]:
triple_df

Unnamed: 0,Subject,Predicate,Object,Score,prob
0,http://rdf.freebase.com/ns/m.07484,http://rdf.freebase.com/ns/people.person.place...,http://rdf.freebase.com/ns/m.05ql8_,1.000000,0.001949
1,http://rdf.freebase.com/ns/m.027xx3,http://rdf.freebase.com/ns/organization.organi...,http://rdf.freebase.com/ns/m.0cnn3sk,1.000000,0.001949
2,http://rdf.freebase.com/ns/m.0cnn3sk,http://rdf.freebase.com/ns/location.mailing_ad...,http://rdf.freebase.com/ns/m.0r6cx,0.500000,0.000974
3,http://rdf.freebase.com/ns/m.02t7b7x,http://rdf.freebase.com/ns/location.mailing_ad...,http://rdf.freebase.com/ns/m.01dky4,0.500000,0.000974
4,http://rdf.freebase.com/ns/m.0k50,http://rdf.freebase.com/ns/organization.organi...,http://rdf.freebase.com/ns/m.02t7b7x,0.500000,0.000974
...,...,...,...,...,...
7630,http://rdf.freebase.com/ns/m.0k6vcw9,http://rdf.freebase.com/ns/tv.regular_tv_appea...,2012-02-07,0.333333,0.000650
7631,http://rdf.freebase.com/ns/m.03m8sg,http://rdf.freebase.com/ns/tv.tv_program.regul...,http://rdf.freebase.com/ns/m.0k6vcw9,0.333333,0.000650
7632,http://rdf.freebase.com/ns/m.05kgb4v,http://rdf.freebase.com/ns/location.mailing_ad...,http://rdf.freebase.com/ns/m.04p3c,0.500000,0.000974
7633,http://rdf.freebase.com/ns/m.01bzs9,http://rdf.freebase.com/ns/organization.organi...,http://rdf.freebase.com/ns/m.05kgb4v,0.500000,0.000974


In [None]:
sampled_df = triple_df.sample(n=5, weights="prob", replace=True)
sampled_df=sampled_df[["Subject","Predicate","Object"]]

In [None]:
import re

def extract_subject_object_predicate_sets(queries):
    subject_object_set = []
    predicate_set = set()

    for query in queries:
        construct_match = re.search(r'CONSTRUCT\s*{(.*?)}\s*WHERE', query, re.DOTALL)
        if not construct_match:
            continue
        construct_block = construct_match.group(1)

        triples = re.findall(r'([^\s]+)\s+([^\s]+)\s+([^\s]+)\s*\.\s*', construct_block)

        for subject, predicate, obj in triples:
            subject = subject.strip('<>')
            predicate = predicate.strip('<>')
            obj = obj.strip('<>')

            subject_object_set.append(subject)
            subject_object_set.append(obj)
            predicate_set.add(predicate)

    return set(subject_object_set), predicate_set
import re
import pandas as pd

sampled_df = triple_df.sample(n=10, weights="prob", replace=True)
sampled_df = sampled_df[["Subject", "Predicate", "Object"]]

S_O_set = set(sampled_df["Subject"])
S_O_set.update(sampled_df["Object"])
predicate_set = set(sampled_df["Predicate"])

total_queries = len(queries)
total_score = 0
unique_predicates = set()

for query in queries:
    S_O_query, query_predicates = extract_subject_object_predicate_sets([query])
    predicates_found = len(query_predicates.intersection(predicate_set))
    S_O_found = len(S_O_query.intersection(S_O_set))

    total_predicates_found = predicates_found / len(query_predicates) if len(query_predicates) else 0
    S_O_found_ratio = S_O_found / len(S_O_query) if len(S_O_query) else 0

    total_score += 0.5 * total_predicates_found + 0.5 * S_O_found_ratio
    unique_predicates.update(query_predicates)

total_score, total_queries, total_score / total_queries, len(unique_predicates)
