In [14]:
# util functions
from collections import defaultdict

import gensim
from nltk import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json

# Function to get the cosine similarity between a relation and query
# Note: Update this string with the path to the file
word2vec_model = gensim.models.Word2Vec.load('word2vec_train_dev.dat')


def get_rel_score_word2vecbase(rel: str, query: str) -> float:
    """
    Get score for query and relation. Used to inform exploration of knowledge graph.

    :param rel: relation, or edge in knowledge graph
    :param query: query, question to answer
    :return: float score similarity between question and relation
    """
    # Relation not in embedding vocabulary
    if rel not in word2vec_model.wv:
        return 0.0
    # Relation must start with ns:
    rel = 'ns:' + rel if not rel[:3] == 'ns:' else rel

    words = word_tokenize(query.lower())
    w_embs = []
    for w in words:
        if w in word2vec_model.wv:
            w_embs.append(word2vec_model.wv[w])
    return np.mean(cosine_similarity(w_embs, [word2vec_model.wv[rel]]))


def load_node_label_lookup(filepath: str) -> dict:
    """

    Load the lookup dictionary for nodes from the provided json file.

    Args:
        filepath: Path to the json file containing the lookup dictionary.

    Returns: Dictionary of node ids to text description of node.

    """
    with open(filepath, 'rb') as fp:
        return json.load(fp)


def load_query_df(filepath: str) -> pd.DataFrame:
    """

    Load a simplified dataframe of queries. Generated from the original queries nested dictionary, this simplified
    version contains all necessary information for performing the graph traversal testing without all the extra
    information and difficult formatting. Simply loop through this dataframe row by row, start at the start node with
    the query for that row, and the expected answers are given in that same row.

    Args:
        filepath: Path to the provided parquet file

    Returns: Dataframe of queries to perform on the graph.

    """
    return pd.read_parquet(filepath)


# Function to load the graph from file
def load_graph() -> dict:
    """

    Load the graph from the given file.

    Returns: Graph, in form of node_id key, and nested list value. Nested list is adjacency list, with each list
    containing the relation, and destination node_id.

    """
    # Preparing the graph
    graph = defaultdict(list)
    for line in open('graph.txt'):
        line = eval(line[:-1])
        graph[line[0]].append([line[1], line[2]])
    return graph


# Function to load the queries from file
# Preparing the queries
def load_queries() -> list:
    """

    Load the original queries file. This format can be extremely confusing, for a simplified format use load_query_df.

    Returns: Nested list, with index, node_id, relation types for answers, text description of start node, and dict of
    answers.

    """
    queries = []
    for line in open('annotations.txt'):
        line = eval(line[:-1])
        queries.append(line)
    return queries

In [None]:
# Given is knowledge graph with entities and relations, questions with starting entity and answers, and their word embedding.
# Also we provide a json file for lookup of vertex IDs.
# For each question, navigate the graph from the start entity outwards until you find appropriate answer entities.
# Utils functions (similarity, load_graphs) are given, but you can choose not to use them.
# This python file contains the helper functions for this homework, the only update needed to use this file is to fill in the file paths.

# - The number of correct answers varies (could be 1, could many), use F1 to compare your answers with the given correct answers
# - Utils functions (similarity, load_graphs) are given, but you can choose not to use them.
# - Answers are given to be used for evaluation only, DO NOT USE ANSWERS IN YOUR GRAPH TRAVERSAL. 
# Your strategy should be a graph traversal augmented with scoring of paths; you might discard paths not promising along the way.
# This is similar to a focused crawl strategy. You will take a query (question) that you are trying to answer, it will have a starting entity. 
# Begin your traversal at that starting entity, and look at all adjacent edges. 
# Use get_rel_score_word2vec base to get a similarity score for each edge, and traverse the edges that are promising. 
# This part is up to you, you can cut off scores below a certain threshold, or take only the top percentage, or weight it based on the average.

# There are many valid strategies. You will continue to traverse a path, until the score starts to decrease, or you notice the similarity score drops significantly (compared to the previous edges). 
# Overall try a few different approaches, and choose one that gives you the best overall F1 score.

In [16]:
#query_df = load_query_df('annotations.txt')
graph = load_graph()
node_lookup = load_node_label_lookup('node_lookup.json')
get_rel_score_word2vecbase

# Relation must start with ns:
word2vec_model.wv.index_to_key

# if rel not in word2vec_model.wv:
#         return 0.0
# Instead of comparing embeddings we are comparing strings with keys after this change

SyntaxError: '[' was never closed (<string>, line 1)

In [12]:
%pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.10.0-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Collecting fsspec (from fastparquet)
  Using cached fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-win_amd64.whl (673 kB)
   ---------------------------------------- 0.0/673.3 kB ? eta -:--:--
   ---------------------------------------- 673.3/673.3 kB 2.4 MB/s eta 0:00:00
Downloading cramjam-2.10.0-cp312-cp312-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------------------------- -- 1.6/1.7 MB 7.6 MB/s eta 0:00:01
   ------------------------------------- -- 1.6/1.7 MB 7.6 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 2.5 MB/s eta 0:00:00
Using cached fsspec-2025.3.2-py3-none-any.whl (194 kB)
Installing collected packages: fsspec, cramja