In [1]:
from collections import defaultdict

def load_graph(path='../data/Graphs/graph.txt'):
    """Load the graph file into an adjacency mapping.

    Every non-blank line of the file is evaluated as a Python expression
    that must unpack into a ``(src, rel, dst)`` triple.

    Args:
        path: Location of the edge-list file.

    Returns:
        A ``defaultdict(list)`` mapping each ``src`` to the list of
        ``(rel, dst)`` tuples found for it, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a line does not unpack into exactly three items.
    """
    graph = defaultdict(list)
    with open(path, 'r') as f:
        for line in f:
            record = line.strip()
            if not record:
                # eval('') raises SyntaxError, so a blank or trailing empty
                # line would otherwise abort the whole load.
                continue
            # NOTE(security): eval() executes whatever expression the file
            # contains. Only load trusted files; if every line is a plain
            # literal tuple, consider ast.literal_eval instead.
            src, rel, dst = eval(record)
            graph[src].append((rel, dst))
    return graph


In [2]:
def load_queries(path='../data/Graphs/annotations.txt'):
    """Load the annotated questions.

    Every non-blank line of the file is evaluated as a Python expression
    that must unpack into six items:
    ``(qid, question, start_node, _, _, answers)``. The fourth and fifth
    items are ignored. ``answers`` is iterated and each element is indexed
    with the key ``'AnswerArgument'``.

    Args:
        path: Location of the annotations file.

    Returns:
        A list of dicts with the keys ``'qid'``, ``'question'``,
        ``'start_node'`` and ``'answers'`` (a set of the collected
        ``'AnswerArgument'`` values), in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a line does not unpack into exactly six items.
    """
    queries = []
    with open(path, 'r') as f:
        for line in f:
            record = line.strip()
            if not record:
                # eval('') raises SyntaxError, so a blank or trailing empty
                # line would otherwise abort the whole load.
                continue
            # NOTE(security): eval() executes whatever expression the file
            # contains. Only load trusted files; if every line is a plain
            # literal tuple, consider ast.literal_eval instead.
            qid, question, start_node, _, _, answers = eval(record)
            answer_ids = {ans['AnswerArgument'] for ans in answers}
            queries.append({
                'qid': qid,
                'question': question,
                'start_node': start_node,
                'answers': answer_ids
            })
    return queries


In [3]:
import json

def load_vertex_lookup(path='../data/Graphs/vertex_lookup.json'):
    """Read the JSON document at ``path`` and return the decoded object.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(path, 'r') as lookup_file:
        lookup = json.load(lookup_file)
    return lookup


In [5]:
from gensim import models
from nltk import word_tokenize
import numpy as np

# Notebook-wide embedding model, loaded once when this cell runs.
# 'wordvec.dat' is a relative path, so it is resolved against the kernel's
# current working directory. get_rel_score_word2vecbase reads
# word2vec_model.wv on every call.
word2vec_model = models.Word2Vec.load('wordvec.dat')

def get_rel_score_word2vecbase(rel: str, query: str) -> float:
    """Score a relation against a question using word-vector cosine similarity.

    The relation is looked up in ``word2vec_model.wv`` under its
    ``'ns:'``-prefixed name. The query is lower-cased and tokenised with
    ``word_tokenize``; tokens missing from the vocabulary are ignored. The
    score is the mean cosine similarity between the relation vector and each
    remaining token vector.

    Args:
        rel: Relation name, with or without the ``'ns:'`` prefix.
        query: Natural-language question text.

    Returns:
        The mean cosine similarity as a float, or ``0.0`` when the prefixed
        relation or every query token is missing from the vocabulary.
    """
    # Normalise to the prefixed form BEFORE the vocabulary check. Checking
    # the raw name first either rejected relations that only exist under
    # their 'ns:' key or raised KeyError at the lookup below when only the
    # bare name was in the vocabulary.
    if not rel.startswith('ns:'):
        rel = 'ns:' + rel
    embeddings = word2vec_model.wv
    if rel not in embeddings:
        return 0.0
    words = word_tokenize(query.lower())
    vectors = [embeddings[w] for w in words if w in embeddings]
    if not vectors:
        return 0.0
    rel_vec = embeddings[rel]
    # The relation norm does not change per token, so compute it once.
    rel_norm = np.linalg.norm(rel_vec)
    return float(np.mean([
        np.dot(vec, rel_vec) / (np.linalg.norm(vec) * rel_norm) for vec in vectors
    ]))


ModuleNotFoundError: No module named 'gensim'

In [6]:
import numpy as np
from collections import deque
from sklearn.metrics import f1_score

def traverse_graph(graph, query, start_node, max_depth=2, score_threshold=0.25):
    """Collect nodes reachable from ``start_node`` over query-relevant edges.

    Performs a breadth-first search. An edge is followed only when
    ``get_rel_score_word2vecbase(rel, query)`` is at least
    ``score_threshold``; every node reached that way is an answer.

    Args:
        graph: Mapping with a ``.get`` method from node to an iterable of
            ``(rel, neighbor)`` pairs.
        query: Question text passed to the relation scorer.
        start_node: Node where the search begins; never part of the result.
        max_depth: Maximum number of hops an answer may be from
            ``start_node``.
        score_threshold: Minimum relation score for an edge to be followed.

    Returns:
        The set of nodes reached within ``max_depth`` hops.
    """
    # Mark nodes when they are enqueued, not when they are dequeued, so a
    # node reachable through several edges is queued and expanded only once.
    visited = {start_node}
    answers = set()
    queue = deque([(start_node, 0)])
    # The query is fixed for the whole traversal, so a relation's score is
    # the same on every edge that carries it.
    rel_scores = {}

    while queue:
        current_node, depth = queue.popleft()
        # Neighbours found here sit at depth + 1. Expanding a node that is
        # already at max_depth would return answers max_depth + 1 hops away.
        if depth >= max_depth:
            continue
        for rel, neighbor in graph.get(current_node, []):
            if neighbor in visited:
                continue
            if rel not in rel_scores:
                rel_scores[rel] = get_rel_score_word2vecbase(rel, query)
            if rel_scores[rel] >= score_threshold:
                visited.add(neighbor)
                answers.add(neighbor)
                queue.append((neighbor, depth + 1))
    return answers


In [7]:
def evaluate_all_queries(graph, queries):
    """Run the traversal for every query and report F1 against gold answers.

    For each query the predicted set from ``traverse_graph`` is compared
    with ``q['answers']`` by building 0/1 indicator vectors over the union
    of both sets and passing them to ``f1_score``. The per-query score and
    the overall mean are printed.

    Args:
        graph: Graph passed through to ``traverse_graph``.
        queries: Iterable of dicts with the keys ``'qid'``, ``'question'``,
            ``'start_node'`` and ``'answers'`` (a set).

    Returns:
        The mean F1 over all queries as a float, or ``0.0`` when ``queries``
        is empty.
    """
    f1s = []
    for q in queries:
        predicted = traverse_graph(graph, q['question'], q['start_node'])
        true = q['answers']
        all_ids = list(predicted.union(true))
        if all_ids:
            y_true = [1 if x in true else 0 for x in all_ids]
            y_pred = [1 if x in predicted else 0 for x in all_ids]
            score = float(f1_score(y_true, y_pred))
        else:
            # Nothing predicted and nothing expected: F1 is undefined, and
            # empty indicator vectors should not be handed to f1_score.
            score = 0.0
        print(f"Q{q['qid']}: {q['question']}\nF1: {score:.2f}\n")
        f1s.append(score)
    # np.mean([]) yields nan with a RuntimeWarning, so guard the empty case.
    average_f1 = float(np.mean(f1s)) if f1s else 0.0
    print(f"\nAverage F1 across all queries: {average_f1:.3f}")
    return average_f1


In [None]:
    # Driver cell: load the graph and the annotated questions, then print the
    # per-query and average F1.
    # NOTE(review): the paths below differ from the loaders' default paths
    # under '../data/Graphs/' — confirm which set of files is intended.
    if __name__ == "__main__":
        graph = load_graph('graph.txt')
        queries = load_queries('questions.txt')
        # NOTE(review): vertex_lookup is loaded but not used in this cell.
        vertex_lookup = load_vertex_lookup('mapping.json')
        
        evaluate_all_queries(graph, queries)
