# Import Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import Constant
from gensim.models import Word2Vec
import functools
import numpy as np
import sys
import os
import pprint
from keras.preprocessing.text import Tokenizer
pp = pprint.PrettyPrinter(indent=4)
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# Constant declarations

In [2]:
# Location of the pretrained Word2Vec model:
# ./../model/<type_of_Word2Vec_model>/<vector_file_name>
# NOTE(review): 'SG' presumably stands for skip-gram — confirm.
type_of_Word2Vec_model = 'SG'
vector_file_name = 'wiki-db_more50_200'
vector_file_name_path = './../model/' + type_of_Word2Vec_model + '/' + vector_file_name
# Length every input sequence is padded/truncated to (used by load_data).
MAX_SEQUENCE_LENGTH = 21
# Training settings; not referenced by the cells visible in this part of the notebook.
num_of_epochs = 5
batch_size = 1024 

# Training data file name and its directory.
train_file_name = 'uni_pair_combine'
train_file_path = './../dataset/train_data/'

# Hyperparameter Setup

In [3]:
# Width of one word vector; passed to Word2VecTOEmbeddingMatrix / init_rnn_model.
# NOTE(review): must match the dimensionality of the loaded Word2Vec vectors — confirm.
embedding_dim = 200
# NOTE(review): not referenced by any cell visible here (the RNN is sized with
# embedding_dim) — confirm whether it is still needed.
num_hidden = 128

# Function Implementations

In [4]:
def load_data(input_file_name, wordvec):
    '''
    Create training data for the network.

    Each line of the input file is expected to contain a target word and a
    space-separated context, separated by a TAB character. A line is skipped
    when it has no TAB, when its context has fewer than two tokens, or when
    the target word is not in the Word2Vec vocabulary.

    Input:
        input_file_name: path of the UTF-8 encoded training file
        wordvec: Gensim Word2Vec model, accessed through ``wordvec.wv``
    Output: x_train , y_train
        x_train: result of pad_sequences over the per-line lists of
                 vocabulary indices, with maxlen=MAX_SEQUENCE_LENGTH
        y_train: numpy array holding the target word vector of each kept line
    '''
    keyed_vectors = wordvec.wv

    # Initiate the return values
    x_train = []
    y_train = []

    # Stream the file once; the context manager closes the handle.
    with open(input_file_name, 'r', encoding='utf-8') as fin:
        for line in fin:
            fields = line.split('\t')
            if len(fields) < 2:
                # Blank or malformed line: there is no context column to read.
                continue

            y_string = fields[0]
            x_string = fields[1].lower().strip('\n').split(' ')
            if len(x_string) < 2:
                continue
            if y_string not in keyed_vectors:
                # A target without a vector cannot serve as a training label.
                continue
            y_train.append(keyed_vectors[y_string])

            # Change text into integers: each context token becomes its
            # vocabulary index; out-of-vocabulary tokens use the index of
            # the 'unknown' entry.
            x_train.append([
                keyed_vectors.vocab[token].index
                if token in keyed_vectors
                else keyed_vectors.vocab['unknown'].index
                for token in x_string
            ])

    # Padding
    x_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH)
    y_train = np.array(y_train)

    return x_train, y_train
    

In [5]:
def Word2VecTOEmbeddingMatrix(wordvec, embedding_dim):
    '''
    Copy the vectors of a Gensim Word2Vec model into a dense matrix.

    Input:
        wordvec: Gensim Word2Vec model, accessed through ``wordvec.wv``
        embedding_dim: number of columns of the returned matrix
    Output:
        numpy array of shape (len(wordvec.wv.vocab), embedding_dim) whose
        row i is the vector of the word ``wordvec.wv.index2word[i]``
    '''
    keyed_vectors = wordvec.wv
    n_words = len(keyed_vectors.vocab)
    matrix = np.zeros((n_words, embedding_dim))

    for row in range(n_words):
        word = keyed_vectors.index2word[row]
        vector = keyed_vectors[word]
        if vector is None:
            # Leave the row as zeros when no vector is available.
            continue
        matrix[row] = vector

    return matrix

# Model Definitions

In [6]:
# Baseline: Simple RNN network without attention
def init_rnn_model(vocab_size, embedding_dim, embedding_matrix, MAX_SEQUENCE_LENGTH ):
    '''
    Build and compile the baseline model: a frozen Embedding layer followed by
    a single SimpleRNN layer.

    Input:
        vocab_size: number of rows of the embedding table
        embedding_dim: width of one embedding vector; also used as the number
                       of SimpleRNN units
        embedding_matrix: initial (constant) weights of the Embedding layer
        MAX_SEQUENCE_LENGTH: length of every input sequence
    Output:
        the compiled Sequential model (mean squared error loss, rmsprop optimizer)
    '''
    model = Sequential()  # Define Sequential Model

    # The embedding weights come from embedding_matrix and are not trained.
    embedding_layer = Embedding(vocab_size,
                                embedding_dim,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    model.add(embedding_layer)  # Add the Embedding layer to the model

    # The RNN has embedding_dim units and return_sequences=False, so it emits
    # a single embedding_dim-sized vector per input sequence.
    model.add(SimpleRNN(embedding_dim, return_sequences=False))

    # summary() prints the table itself and returns None, so it is called
    # directly instead of being wrapped in print() (which printed "None").
    model.summary()

    # NOTE(review): 'acc' is kept for compatibility with existing training
    # logs; confirm it is meaningful alongside a mean-squared-error loss.
    model.compile(loss='mean_squared_error',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

# Main

In [7]:
# Load the pretrained Gensim Word2Vec model from vector_file_name_path
wordvec = Word2Vec.load(vector_file_name_path) # Load the model from the vector_file_name
# NOTE(review): init_sims(replace=True) presumably L2-normalizes the vectors in
# place and discards the raw ones (gensim 3.x), so the model can no longer be
# trained further — confirm against the installed gensim version.
wordvec.wv.init_sims(replace=True)
print('Loaded Word2Vec model')

Loaded Word2Vec model


In [8]:
# Get the vocabulary size: the number of entries in the loaded model's vocabulary.
# It is the vocab_size argument expected by init_rnn_model.
vocab_size = len(wordvec.wv.vocab)
print('Vocab size: ', vocab_size)

Vocab size:  968009


In [11]:
# Sanity check of the loaded vectors: show the 10 nearest neighbours of 'king'.
wordvec.wv.similar_by_word('king',topn=10)

[('throne', 0.704645037651062),
 ('prince', 0.679724931716919),
 ('ruler', 0.6674383282661438),
 ('kings', 0.6557703018188477),
 ('DBPEDIA_ID/Hattusili_III', 0.6556156873703003),
 ('DBPEDIA_ID/Mursili_II', 0.6515605449676514),
 ('DBPEDIA_ID/Swasawke', 0.6443279981613159),
 ('DBPEDIA_ID/Vikramaditya', 0.6402791142463684),
 ('reign', 0.6358672380447388),
 ('DBPEDIA_ID/Arshak_II', 0.635525107383728)]

In [18]:
def getRanking(wordvec, compound_word, vec, topn=10):
    '''
    Rank a word among the nearest neighbours of a vector.

    The neighbours are obtained with Gensim's
    ``wordvec.wv.similar_by_vector(vec, topn=topn)``, which returns a sequence
    of (word, similarity) pairs.

    Input:
        wordvec: Gensim word2vec model
        compound_word: the compound word to look for among the neighbours
        vec: the estimated vector of that compound word
        topn: how many nearest neighbours to inspect (default 10)
    Output:
        rank: 1-based position of compound_word in the neighbour list;
              topn + 1 (11 with the default) when it is not among them
    '''
    neighbours = wordvec.wv.similar_by_vector(vec, topn=topn)
    for rank, (word, _similarity) in enumerate(neighbours, start=1):
        if word == compound_word:
            return rank

    # Not found within the inspected neighbours.
    return topn + 1

In [21]:
# Demo of getRanking: look up the position of 'throne' among the nearest
# neighbours of the vector stored for 'king'.
vec = wordvec.wv['king']
getRanking(wordvec,'throne',vec)

2