# Generic Evaluation of Any Text File

**Prerequisites**
- obtain training files in NT format (see notebook 'RDF2Vec KBC Steps.ipynb')
- train your embeddings so that you have a txt file
- install kbc_rdf2vec ([https://github.com/janothan/kbc_rdf2vec](https://github.com/janothan/kbc_rdf2vec))
- install kbc_evaluation ([https://github.com/janothan/kbc_evaluation/](https://github.com/janothan/kbc_evaluation/))

In [7]:
#work_dir = "/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove"

# Text vector files for FB15k and WN18. remove_tags() treats the first
# space-separated token of every line as the concept.
fb_vector_txt_file = "./kglove_fb/vectors.txt"
wn_vector_txt_file = "./kglove_wn/vectors.txt"

# When True, generate_prediction_files() additionally runs the ANN prediction function.
is_vectors_contain_predicates = False

# directory where "WN18.nt" and "FB15k.nt" can be found
nt_dir = "/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_2/nt_files"

# Now let's decide on your directory where everything shall be written to (requires > 5Gb of disk space)
# The converted vector files and the "predictions" sub-directory are written below this path.
working_directory = "/work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove"

We first need to transform the file into the gensim kv format.

In [9]:
from gensim.scripts.glove2word2vec import glove2word2vec
from pathlib import Path
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Word2VecKeyedVectors



def remove_tags(txt_file: str, file_to_write_path: Path) -> str:
    """Copy a vector text file, removing the tags (<...>) around each concept.

    The concept is the first space-separated token of every line. If it is
    enclosed in angle brackets, the brackets are dropped; everything else on
    the line is copied unchanged.

    Parameters
    ----------
    txt_file : str
        Path to the vector text file to read.
    file_to_write_path : Path
        Path of the file to write (an existing file is overwritten).

    Returns
    -------
    str
        The resolved path of the newly written file.
    """
    # The input is opened first so that a missing input file does not leave an
    # empty output file behind. UTF-8 is set explicitly so that the result
    # does not depend on the platform's default encoding.
    with Path(txt_file).open(mode="r", encoding="utf-8") as file_to_read:
        with file_to_write_path.open(mode="w", encoding="utf-8") as file_to_write:
            for line in file_to_read:
                concept, separator, remainder = line.partition(" ")
                if not separator:
                    # Line consists of a single token: keep the line ending out
                    # of the tag check, otherwise "<x>\n" would never match.
                    bare_concept = concept.rstrip("\r\n")
                    remainder = concept[len(bare_concept):]
                    concept = bare_concept
                if concept.startswith("<") and concept.endswith(">"):
                    concept = concept[1:-1]
                file_to_write.write(concept + separator + remainder)

    return str(file_to_write_path.resolve())
                    
        

def convert_to_kv(txt_file: str, new_file: str) -> Word2VecKeyedVectors:
    """Convert a GloVe-style text vector file into a gensim kv file.

    If ``new_file`` already exists, it is loaded and returned as is. Otherwise
    ``txt_file`` is converted to the word2vec text format (written next to
    ``new_file`` with the suffix ``.w2v``), loaded, and saved to ``new_file``.

    Parameters
    ----------
    txt_file : str
        Path to the text vector file to convert.
    new_file : str
        Path of the kv file to be written (or loaded if it already exists).

    Returns
    -------
    Word2VecKeyedVectors
        The loaded vectors.
    """
    if Path(new_file).is_file():
        print(f"WARNING: File {new_file} exists already! Just loading the file...")
        # Load the file whose existence was just checked. The intermediate
        # .w2v file may no longer exist and is slower to parse.
        return KeyedVectors.load(new_file)

    # Swap the suffix rather than cutting a fixed number of characters so that
    # the intermediate file name is also sensible for suffixes other than ".kv".
    w2v_file = str(Path(new_file).with_suffix(".w2v"))
    glove2word2vec(txt_file, w2v_file)
    result = KeyedVectors.load_word2vec_format(w2v_file)
    result.save(new_file)
    return result
    


# Target locations of the gensim kv files.
fb_kv_file = str((Path(working_directory) / "fb_kv_format.kv").resolve())
wn_kv_file = str((Path(working_directory) / "wn_kv_format.kv").resolve())

# Strip the tags from the concepts first, then convert the tag-free files to kv.
fb_no_tags = Path(working_directory) / "fb_txt_no_tags.txt"
wn_no_tags = Path(working_directory) / "wn_txt_no_tags.txt"
fb_kv = convert_to_kv(
    remove_tags(txt_file=fb_vector_txt_file, file_to_write_path=fb_no_tags),
    fb_kv_file,
)
wn_kv = convert_to_kv(
    remove_tags(txt_file=wn_vector_txt_file, file_to_write_path=wn_no_tags),
    wn_kv_file,
)

# Training files in NT format (needed when predicting).
wn_nt_path = str((Path(nt_dir) / "WN18.nt").resolve())
fb15k_nt_path = str((Path(nt_dir) / "FB15k.nt").resolve())

2021-08-24 22:26:20,100 - gensim.scripts.glove2word2vec - INFO - converting 16297 vectors from /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/fb_txt_no_tags.txt to /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/fb_kv_format.w2v
2021-08-24 22:26:20,436 - gensim.models.utils_any2vec - INFO - loading projection weights from /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/fb_kv_format.w2v
2021-08-24 22:26:25,396 - gensim.models.utils_any2vec - INFO - loaded (16297, 200) matrix from /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/fb_kv_format.w2v
2021-08-24 22:26:25,398 - gensim.utils - INFO - saving Word2VecKeyedVectors object under /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/fb_kv_format.kv, separately None
2021-08-24 22:26:25,399 - gensim.utils - INFO - not storing attribute vectors_norm
2021-08-24 22:26:25,875 - gensim.utils - INFO - saved /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/fb_kv

gensim.models.keyedvectors.Word2VecKeyedVectors

## Let's predict!
We start by generating the files containing the predictions.

In [10]:
from kbc_rdf2vec.dataset import DataSet
from kbc_rdf2vec.prediction import PredictionFunctionEnum, PredictionFunction
from kbc_rdf2vec.rdf2vec_kbc import Rdf2vecKbc

import os

def _write_predictions(
    model_path: str,
    data_set,
    nt_file: str,
    prediction_function,
    output_file: str,
    is_reflexive_match_allowed: bool = False,
) -> None:
    """Run one prediction configuration and write its predictions to ``output_file``."""
    kbc = Rdf2vecKbc(
        model_path=model_path,
        data_set=data_set,
        n=None,
        prediction_function=prediction_function,
        file_for_predicate_exclusion=nt_file,
        is_reflexive_match_allowed=is_reflexive_match_allowed,
    )
    kbc.predict(output_file)


def generate_prediction_files() -> None:
    """Write the prediction files for WN18 and FB15k.

    The files are written to the sub-directory "predictions" of
    ``working_directory`` and are named "<benchmark>_<strategy>.txt".
    For every enabled strategy, WN18 is predicted first and FB15k second.
    The ANN strategy is only run if ``is_vectors_contain_predicates`` is set.
    """
    # let's make a directory if it does not exist yet
    prediction_path = os.path.join(working_directory, "predictions")
    os.makedirs(prediction_path, exist_ok=True)

    # (file name prefix, vector file, data set, training file in NT format)
    benchmarks = (
        ("wn", wn_kv_file, DataSet.WN18, wn_nt_path),
        ("fb15k", fb_kv_file, DataSet.FB15K, fb15k_nt_path),
    )

    # (file name suffix, prediction function) in the order of execution
    strategies = []
    if is_vectors_contain_predicates:
        strategies.append(("ann", PredictionFunctionEnum.ANN))
    # Currently disabled strategies (add them to the list to re-enable them):
    #   ("most_similar", PredictionFunctionEnum.MOST_SIMILAR)
    #   ("averaged_most_similar", PredictionFunctionEnum.PREDICATE_AVERAGING_MOST_SIMILAR)
    #   ("addition", PredictionFunctionEnum.ADDITION)
    # Currently disabled FB15k-only runs with is_reflexive_match_allowed=True:
    #   "fb15k_reflexive_addition.txt" (PredictionFunctionEnum.ADDITION)
    #   "fb15k_reflexive_averaged_addition.txt" (PredictionFunctionEnum.PREDICATE_AVERAGING_ADDITION)
    strategies.append(
        ("averaged_addition", PredictionFunctionEnum.PREDICATE_AVERAGING_ADDITION)
    )

    for suffix, prediction_function in strategies:
        for prefix, vector_file, data_set, nt_file in benchmarks:
            _write_predictions(
                model_path=vector_file,
                data_set=data_set,
                nt_file=nt_file,
                prediction_function=prediction_function,
                output_file=os.path.join(prediction_path, f"{prefix}_{suffix}.txt"),
                is_reflexive_match_allowed=False,
            )

# Run the enabled predictions. In the recorded output below, the first (WN18)
# run took about 17 minutes for 5000 items.
generate_prediction_files()

2021-08-24 22:27:28,110 - kbc_rdf2vec.rdf2vec_kbc - INFO - Gensim vector file detected.
2021-08-24 22:27:28,111 - gensim.utils - INFO - loading Word2VecKeyedVectors object from /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/wn_kv_format.kv
2021-08-24 22:27:28,617 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2021-08-24 22:27:28,619 - gensim.utils - INFO - loaded /work/jportisc/kbc_rdf2vec/strategy_grid_2/evaluation_3_kglove/wn_kv_format.kv
2021-08-24 22:27:29,192 - kbc_rdf2vec.prediction - INFO - Initializing AveragePredicatePredictionFunction
Predicting Tails and Heads
  0%|          | 0/5000 [00:00<?, ?it/s]2021-08-24 22:27:31,769 - gensim.models.keyedvectors - INFO - precomputing L2-norms of word weight vectors
100%|██████████| 5000/5000 [16:47<00:00,  4.96it/s]
2021-08-24 22:44:19,535 - kbc_rdf2vec.rdf2vec_kbc - INFO - Erroneous Triples: 0
2021-08-24 22:44:19,655 - kbc_rdf2vec.rdf2vec_kbc - INFO - Gensim vector file detected.
2021-08-24 22:4