In [21]:
from collections import defaultdict

from gensim import models
from nltk import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json

In [22]:
def load_graph(path='/kaggle/input/feed-data/graph.txt'):
    """
    Load the knowledge graph from a text file into an adjacency list.

    Every non-blank line must be a Python expression that evaluates to a
    ``(src, rel, dst)`` triple.

    :param path: path of the graph file
    :return: defaultdict mapping each ``src`` to a list of ``(rel, dst)``
        tuples, in file order
    """
    graph = defaultdict(list)
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # eval('') raises SyntaxError, so skip blank lines
                # (e.g. an extra newline at the end of the file).
                continue
            # NOTE(review): eval executes arbitrary code found in the file.
            # Only use this on trusted data; if every line is a plain literal,
            # ast.literal_eval would be a safer drop-in.
            src, rel, dst = eval(line)
            graph[src].append((rel, dst))
    return graph


In [23]:
def load_queries(path='/kaggle/input/feed-data/questions.txt'):
    """
    Load the evaluation questions from a text file.

    Every non-blank line must be a Python expression that evaluates to a
    6-item sequence ``(qid, question, start_node, _, _, answers)``, where
    ``answers`` is an iterable of dicts that each have an ``'AnswerArgument'``
    key. The fourth and fifth items are ignored.

    :param path: path of the questions file
    :return: list of dicts with keys ``'qid'``, ``'question'``,
        ``'start_node'`` and ``'answers'`` (a set of the ``'AnswerArgument'``
        values), in file order
    """
    queries = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # eval('') raises SyntaxError, so skip blank lines
                # (e.g. an extra newline at the end of the file).
                continue
            # NOTE(review): eval executes arbitrary code found in the file.
            # Only use this on trusted data; if every line is a plain literal,
            # ast.literal_eval would be a safer drop-in.
            qid, question, start_node, _, _, answers = eval(line)
            answer_ids = {ans['AnswerArgument'] for ans in answers}
            queries.append({
                'qid': qid,
                'question': question,
                'start_node': start_node,
                'answers': answer_ids
            })
    return queries


In [24]:
import json

def load_vertex_lookup(path='/kaggle/input/feed-data/mapping.json'):
    """
    Read the JSON file at ``path`` and return the decoded object.

    :param path: path of the JSON mapping file
    :return: whatever the JSON document decodes to
    """
    with open(path, 'r') as handle:
        raw_text = handle.read()
    lookup = json.loads(raw_text)
    return lookup


In [25]:
from gensim import models
from nltk import word_tokenize
import numpy as np

# Load the saved Word2Vec model once when this cell runs.
# get_rel_score_word2vecbase reads `word2vec_model` as a global, so this
# statement must be executed before that function is called.
word2vec_model = models.Word2Vec.load('/kaggle/input/word-vector/word2vec_train_dev.dat')


def get_rel_score_word2vecbase(rel: str, query: str) -> float:
    """
    Get score for query and relation. Used to inform exploration of knowledge graph.

    The score is the mean cosine similarity between the relation's embedding
    and the embedding of every query token found in the model vocabulary.

    :param rel: relation, or edge in knowledge graph; the 'ns:' prefix is
        added when it is missing
    :param query: query, question to answer
    :return: float score similarity between question and relation; 0.0 when
        the relation is not in the vocabulary or when no query token is
    """
    # Relations are looked up with the 'ns:' prefix, so add it when missing.
    if not rel.startswith('ns:'):
        rel = 'ns:' + rel

    vectors = word2vec_model.wv
    # Relation not in embedding vocabulary
    if rel not in vectors:
        return 0.0

    # Keep only the query tokens the model has an embedding for.
    w_embs = [vectors[w] for w in word_tokenize(query.lower()) if w in vectors]
    if not w_embs:
        # cosine_similarity rejects an empty sample list; with nothing to
        # compare against, treat the relation as unrelated to the query.
        return 0.0
    return float(np.mean(cosine_similarity(w_embs, [vectors[rel]])))

In [26]:
import numpy as np
from collections import deque
from sklearn.metrics import f1_score

def traverse_graph(graph, query, start_node, max_depth=2, score_threshold=0.25,
                   score_fn=None):
    """
    Breadth-first search from ``start_node`` that only follows edges whose
    relation scores at least ``score_threshold`` against the query.

    Nodes at depth ``0..max_depth`` are expanded, and every neighbour reached
    over an accepted edge is collected, so answers can lie up to
    ``max_depth + 1`` hops from ``start_node``. ``start_node`` itself is never
    returned.

    :param graph: mapping from node to a list of ``(rel, neighbor)`` tuples
    :param query: question text passed to the scoring function
    :param start_node: node the search starts from
    :param max_depth: greatest depth at which a node is still expanded
    :param score_threshold: minimum relation score for an edge to be followed
    :param score_fn: optional ``callable(rel, query) -> float``; defaults to
        ``get_rel_score_word2vecbase``
    :return: set of nodes reached over accepted edges
    """
    if score_fn is None:
        score_fn = get_rel_score_word2vecbase

    answers = set()
    if max_depth < 0:
        # Even the start node is too deep to expand.
        return answers

    # The query is fixed for the whole traversal, so a relation's score only
    # has to be computed once no matter how many edges carry it.
    rel_scores = {}
    # Mark nodes when they are enqueued rather than when they are popped, so
    # each node enters the queue (and is expanded) at most once.
    seen = {start_node}
    queue = deque([(start_node, 0)])

    while queue:
        current_node, depth = queue.popleft()
        for rel, neighbor in graph.get(current_node, []):
            if neighbor in seen:
                continue
            if rel not in rel_scores:
                rel_scores[rel] = score_fn(rel, query)
            if rel_scores[rel] >= score_threshold:
                seen.add(neighbor)
                answers.add(neighbor)
                # Nodes deeper than max_depth are answers but are not expanded,
                # so there is no point in queueing them.
                if depth + 1 <= max_depth:
                    queue.append((neighbor, depth + 1))
    return answers


In [27]:
def evaluate_all_queries(graph, queries):
    """
    Run the graph traversal for every query and report per-query and mean F1.

    Each query's F1 is computed over the union of predicted and gold answer
    ids, treating membership as a binary label. One line per query and the
    overall average are printed.

    :param graph: mapping from node to a list of ``(rel, neighbor)`` tuples
    :param queries: iterable of dicts with keys ``'qid'``, ``'question'``,
        ``'start_node'`` and ``'answers'`` (a set of gold answer ids)
    :return: mean F1 across all queries, or NaN when ``queries`` is empty
    """
    f1s = []
    for q in queries:
        predicted = traverse_graph(graph, q['question'], q['start_node'])
        true = q['answers']
        all_ids = list(predicted.union(true))
        if all_ids:
            y_true = [1 if x in true else 0 for x in all_ids]
            y_pred = [1 if x in predicted else 0 for x in all_ids]
            # zero_division=0 keeps the score at 0.0 when precision or recall
            # is undefined (no predictions / no gold answers), without warnings.
            score = f1_score(y_true, y_pred, zero_division=0)
        else:
            # Nothing predicted and no gold answers: no labels to score, so
            # use the same 0.0 that zero_division=0 gives for undefined cases.
            score = 0.0
        print(f"Q{q['qid']}: {q['question']}\nF1: {score:.2f}\n")
        f1s.append(score)
    # np.mean([]) emits a RuntimeWarning; report NaN directly for no queries.
    average_f1 = float(np.mean(f1s)) if f1s else float('nan')
    print(f"\nAverage F1 across all queries: {average_f1:.3f}")
    return average_f1


In [28]:
# Driver cell: load the graph, the questions and the vertex lookup from their
# default paths, then print the F1 of the traversal for every question.
if __name__ == "__main__":
    graph = load_graph()
    queries = load_queries()
    # NOTE(review): vertex_lookup is loaded here but is not passed to
    # evaluate_all_queries below.
    vertex_lookup = load_vertex_lookup()
    
    evaluate_all_queries(graph, queries)


Q1: what time zones are there in the us
F1: 0.12

Q2: what are major exports of the usa
F1: 0.24

Q3: what time is right now in texas
F1: 0.03

Q4: what war was george washington associated with
F1: 0.45

Q5: what are the most common religions in the united states
F1: 0.44

Q6: what are the names of michael jackson children
F1: 0.10

Q7: when michael jordan got drafted
F1: 0.25

Q8: what type of government and economic system does the united states have
F1: 0.38

Q9: what country was george washington from
F1: 0.07

Q10: what are the major imports of the united states
F1: 0.12

Q11: on which continent is the usa located
F1: 0.07

Q12: what continent is the usa located in
F1: 0.01

Q13: what are the airports in chicago
F1: 0.14

Q14: what are the names of michael jackson movies
F1: 0.09

Q15: what timezone is illinois on
F1: 0.02

Q16: what year michael jordan came in the nba
F1: 0.25

Q17: what is the political system of the us
F1: 0.27

Q18: what illnesses did george washington have
F