### Description
Depending on how they were trained and saved, different gensim Word2Vec models need to be loaded with different functions, and which functions are available may also depend on the version of the package being used. This notebook simply checks that every model in the project repository can be loaded successfully, using the expected loading function, with the installed version of the gensim package. Add the paths of additional models here and check them as needed.

In [None]:
import gensim

In [2]:
# The models available from the references below are loaded through gensim with
# KeyedVectors.load_word2vec_format(), see those references.
# http://bio.nlplab.org/#doc-tools
# http://evexdb.org/pmresources/vec-space-models/

paths = [
    "../models/bio_nlp_lab/PMC-w2v.bin",
    "../models/bio_nlp_lab/PubMed-w2v.bin",
    "../models/bio_nlp_lab/PubMed-and-PMC-w2v.bin",
    "../models/bio_nlp_lab/wikipedia-pubmed-and-PMC-w2v.bin"
]

# This just checks to make sure that the model can be loaded using the expected load function,
# and can be used to look up the supported vocabulary and get a vector from it. It does not look
# at anything that has to do with model quality, it only checks that the functions work.
for path in paths:
    # load_word2vec_format() already returns the KeyedVectors object, so the vocabulary and the
    # vectors are read from it directly instead of going through the deprecated `.wv` self-alias.
    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

    # gensim 4.x exposes the vocabulary as `key_to_index`, gensim 3.x as `vocab`; both are
    # mappings keyed by word, so either one works for picking a word to look up.
    vocab = getattr(model, "key_to_index", None)
    if vocab is None:
        vocab = model.vocab

    # Take the first key without copying the whole (multi-million entry) vocabulary into a list.
    a_word_in_vocab = next(iter(vocab))
    vector = model[a_word_in_vocab]
    print(len(vector))
    print(vector)
    print("done testing the model at {}".format(path))

200
[ 1.7009387e-03  1.6617328e-04  1.7475006e-03 -7.3296158e-04
  1.6685453e-03  7.7628880e-04  1.3029224e-03 -9.4431522e-04
 -1.7969294e-03 -1.3431505e-04 -1.1006570e-03  2.4136316e-03
  1.3699713e-03  4.7293276e-04  2.0290185e-03  1.5452197e-03
  1.7989987e-03 -6.8767340e-04  1.8027565e-03  1.7412505e-03
  1.2331307e-03  2.4236381e-04 -6.4087095e-04 -2.4768531e-03
  1.2024977e-03 -7.8178733e-04 -2.4543409e-03  2.3431657e-03
  2.2981528e-03  6.0450792e-04  1.5653047e-03 -1.3344465e-03
  2.2928673e-03  1.4964816e-03  1.6237933e-03 -1.6035325e-03
  7.9613534e-04  1.2147952e-03  9.1233966e-04 -1.1959630e-03
  8.2323939e-04 -1.7087120e-03  8.6356374e-04  2.8843849e-04
  8.8678626e-04  1.0013651e-03 -2.2729065e-03  1.1015240e-03
 -1.2239610e-03 -2.0716675e-03  1.6513589e-03 -1.9234723e-03
  1.6135767e-03 -9.6441910e-04 -4.4311865e-04  5.3678604e-04
 -1.9993277e-03 -2.1131360e-03  2.5251627e-04 -4.1473226e-04
 -1.7030467e-03 -7.2272896e-04 -1.8731330e-03  2.3439024e-03
 -2.1132566e-03 -1.3



In [4]:
# For the other models, which were saved with gensim's own save(), use Word2Vec.load() instead.
# TODO: add a reference for where these models come from here.
# NOTE: Word2Vec.load() unpickles the file, so only point this at model files you trust.

paths = [
    "../models/wiki_sg/word2vec.bin"
]

# Same smoke test as for the other models: load the model, pick a word from its vocabulary and
# fetch the vector for that word. Nothing here says anything about model quality.
for path in paths:
    model = gensim.models.Word2Vec.load(path)

    # A full Word2Vec model keeps its vectors on `model.wv`. gensim 4.x exposes the vocabulary
    # there as `key_to_index`, gensim 3.x as `vocab`; both are mappings keyed by word.
    vocab = getattr(model.wv, "key_to_index", None)
    if vocab is None:
        vocab = model.wv.vocab

    # Take the first key without copying the whole vocabulary into a list.
    a_word_in_vocab = next(iter(vocab))

    # Index through `model.wv`: indexing the Word2Vec object itself is deprecated in gensim 3.x
    # and no longer supported in gensim 4.x.
    vector = model.wv[a_word_in_vocab]
    print(len(vector))
    print(vector)
    print("done testing the model at {}".format(path))

300
[ 3.58987927e-01  3.41845639e-02 -1.95871413e-01  1.10056594e-01
 -6.09722376e-01 -7.92895332e-02  4.55843300e-01  1.33193821e-01
  2.91736692e-01 -2.05134407e-01 -1.17449366e-01  5.99229753e-01
 -1.39417619e-01 -1.58866480e-01 -2.50067830e-01 -8.68160725e-02
 -3.03834170e-01 -1.74561828e-01 -3.16625714e-01  1.84902757e-01
 -2.59308666e-01  2.29947105e-01 -1.53124526e-01  4.46302667e-02
  6.53399169e-01  4.15386707e-01  4.24897552e-01 -5.74072227e-02
 -4.05244589e-01 -2.94395536e-01 -5.59289694e-01 -1.69368520e-01
 -2.72241831e-01 -2.36547425e-01 -2.54874289e-01  2.92852521e-01
 -2.45971918e-01  7.02370584e-01  7.31448948e-01 -4.45907146e-01
 -7.46426880e-01 -1.39110982e-01 -3.16632539e-01  5.19702375e-01
  1.15833414e+00 -1.17041968e-01  4.48320478e-01 -7.67863810e-01
 -4.61077005e-01 -4.30266768e-01  4.85435456e-01 -7.80618250e-01
  2.35865340e-02  6.19204640e-02 -4.10979480e-01  2.37210527e-01
  4.38621305e-02  4.91246194e-01  2.18698099e-01  1.10477328e-01
 -1.20045438e-01 -2.5

  # Remove the CWD from sys.path while we load stuff.
