In [61]:
##Load modules and libraries
import numpy as np
from scipy import spatial

# Function to generate "index" vectors

In [62]:
def get_random_word_vector(dimension, k):
    """Build a sparse ternary "index" vector as used by Random Indexing.

    Args:
        dimension: Length of the vector to generate.
        k: Number of nonzero components. Each of them is independently set
            to +1 or -1 with equal probability, so the numbers of +1s and
            -1s are not necessarily balanced. (Not relevant for BEAGLE.)

    Returns:
        An ``np.int8`` array of length ``dimension`` that is zero everywhere
        except at ``k`` randomly chosen positions, which hold +1 or -1.
    """
    # Choose k distinct positions: shuffle every index and keep the first k.
    shuffled_positions = np.random.permutation(np.arange(0, dimension, 1))
    nonzero_positions = shuffled_positions[:k]

    # Fair coin flips in {0, 1} are mapped onto signs in {-1, +1}.
    coin_flips = np.random.binomial(1, 0.5, k)
    signs = coin_flips * 2 - 1

    index_vector = np.zeros(dimension, dtype=np.int8)
    index_vector[nonzero_positions] = signs
    return index_vector

# Parameters 

In [63]:
# Notebook-level experiment parameters.
# NOTE: run_test() uses its own values for the file names, threshold and
# ones_number rather than reading these globals.
test_name = "TOEFL_synonyms.txt" # path of the file with the TOEFL synonym tests
data_file_name = "Glemmatized.txt" # path of the file with the text corpus

dimension = 2000 # dimensionality of the high-dimensional vectors

threshold = 100000 # corpus frequency at or above which a word is given an all-zero "index" vector

ones_number = 5 # number of nonzero (+1/-1) elements in each randomly generated "index" vector
window_size = 3 # number of neighboring words considered on each side of the focus word

# Initialize "index" vectors and embeddings 

In [64]:
def populate_dictionaries(data_file_name, dimension, threshold, ones_number):
    """Count corpus word frequencies and assign an "index" vector to every word.

    Args:
        data_file_name: Path of the whitespace-tokenized corpus file.
        dimension: Length of every index vector.
        threshold: Words occurring at least this many times receive an
            all-zero vector, so they do not contribute to any embedding.
        ones_number: Number of nonzero elements in each random index vector.

    Returns:
        A tuple ``(dictionary, amount_dictionary, word_space)`` where
        ``dictionary`` maps each word to its index vector,
        ``amount_dictionary`` maps each word to its corpus frequency, and
        ``word_space`` is an empty dict meant to hold the embeddings.
    """
    amount_dictionary = {}  # word -> number of occurrences in the corpus
    word_space = {}  # embeddings; filled in by later steps

    # One pass over the corpus is enough. The context manager guarantees the
    # file is closed even if reading fails part-way through.
    with open(data_file_name, "r") as text_file:
        for line in text_file:
            for word in line.split():
                amount_dictionary[word] = amount_dictionary.get(word, 0) + 1

    # dicts preserve insertion order, so words are visited in order of first
    # appearance in the corpus -- the same order a second scan of the file
    # would produce, which keeps the sequence of random draws unchanged.
    dictionary = {}
    for word, count in amount_dictionary.items():
        if count < threshold:
            dictionary[word] = get_random_word_vector(dimension, ones_number)
        else:
            # Frequent words get an empty vector and thus add nothing to embeddings.
            dictionary[word] = np.zeros(dimension)
    return dictionary, amount_dictionary, word_space

# Choose embeddings to construct

In [65]:
def process_toefl(test_name, dimension, word_space):
    """Register a zero embedding for every word used in the TOEFL test file.

    To save time, embeddings are only built for words that are keys of
    ``word_space``, so this limits the work to the words the TOEFL task needs.

    Args:
        test_name: Path of the TOEFL file. Each test line must contain at
            least five whitespace-separated words; only the first five are used.
        dimension: Length of the zero vectors to create.
        word_space: Dict that receives the new entries (modified in place).

    Returns:
        A tuple ``(word_space, number_of_tests)`` where ``number_of_tests`` is
        the number of non-blank test lines read.

    Raises:
        ValueError: If a non-blank line contains fewer than five words.
    """
    number_of_tests = 0
    with open(test_name, "r") as toefl_file:
        for line_number, line in enumerate(toefl_file, start=1):
            words = line.split()
            if not words:
                # Blank lines (e.g. a trailing newline) are not test cases.
                continue
            if len(words) < 5:
                raise ValueError(
                    "TOEFL line " + str(line_number)
                    + " has fewer than 5 words: " + str(words)
                )
            for word in words[:5]:
                word_space[word] = np.zeros(dimension)
            number_of_tests += 1
    return word_space, number_of_tests

# Construct embeddings 

In [66]:
import copy
def create_embeddings(data_file_name, dimension, window_size, dictionary, word_space, mode=0):
    """Accumulate context and/or order vectors into the embeddings in ``word_space``.

    Each corpus line is treated as one sentence, so a window never crosses a
    line boundary. Only words that are already keys of ``word_space`` are
    updated; all other focus words are ignored.

    Args:
        data_file_name: Path of the whitespace-tokenized corpus file.
        dimension: Length of the index vectors and embeddings.
        window_size: Number of neighboring words considered on each side of
            the focus word.
        dictionary: Maps every corpus word to its "index" vector.
        word_space: Maps the words of interest to their embeddings; the
            arrays are updated in place.
        mode: 0 adds both context and order vectors, 1 adds only context
            vectors, 2 adds only order vectors.

    Returns:
        The same ``word_space`` mapping that was passed in.

    Raises:
        KeyError: If a neighboring word has no entry in ``dictionary``.
    """
    use_context = mode == 0 or mode == 1
    use_order = mode == 0 or mode == 2

    # The context manager closes the corpus file once it has been consumed.
    with open(data_file_name, "r") as text_file:
        for line in text_file:
            words = line.split()
            for i, focus_word in enumerate(words):
                if word_space.get(focus_word) is None:
                    # Embeddings are only built for words registered beforehand.
                    continue

                # Clip the window to the sentence and leave out the focus word itself.
                window_start = max(i - window_size, 0)
                window_end = min(i + window_size + 1, len(words))
                neighbours = [j for j in range(window_start, window_end) if j != i]

                if use_context:
                    # Context vector: plain sum of the neighbors' index vectors.
                    context = np.zeros(dimension)
                    for j in neighbours:
                        context += dictionary[words[j]]
                    word_space[focus_word] += context

                if use_order:
                    # Order vector: each neighbor's index vector is cyclically
                    # shifted by its signed offset from the focus word, so
                    # words before and after are rolled in opposite directions.
                    order = np.zeros(dimension)
                    for j in neighbours:
                        order += np.roll(dictionary[words[j]], j - i)
                    word_space[focus_word] += order

    return word_space

# Testing of the embeddings on TOEFL

In [67]:
#Used to check if the answer for TOEFL synonyms task is correct
def get_answer_mod(words):
    """Score one TOEFL item: 1 if ``words[1]`` is the closest candidate to ``words[0]``.

    Args:
        words: Sequence of five vectors -- the query embedding at index 0
            followed by four candidate embeddings.

    Returns:
        1 when the cosine distance between the query and ``words[1]`` equals
        the smallest of the four query-to-candidate distances, otherwise 0.
    """
    query = words[0]
    distances = [spatial.distance.cosine(query, words[idx]) for idx in (1, 2, 3, 4)]
    return 1 if min(distances) == distances[0] else 0

In [68]:
def compute_accuracies(test_name, dimension, number_of_tests, amount_dictionary, word_space):
    """Return the percentage of TOEFL items the embeddings answer correctly.

    At most ``number_of_tests`` lines are read from the TOEFL file. A test is
    only scored when its query word (first word of the line) occurs in
    ``amount_dictionary``; unscored and skipped tests still count in the
    denominator.

    Args:
        test_name: Path of the TOEFL file.
        dimension: Length of the embeddings.
        number_of_tests: Number of test lines to evaluate; also the
            denominator of the accuracy.
        amount_dictionary: Corpus word frequencies, used to check whether the
            query word was seen in the corpus.
        word_space: Maps words to embeddings. Candidate embeddings that are
            still all-zero are replaced in place by random vectors.

    Returns:
        The accuracy as a percentage, or 0.0 when ``number_of_tests`` is not
        positive (instead of dividing by zero).
    """
    if number_of_tests <= 0:
        return 0.0

    zero_vector = np.zeros(dimension)  # used to detect embeddings that were never updated
    right_answers = 0.0
    # The context manager closes the file even if scoring raises unexpectedly.
    with open(test_name, "r") as toefl_file:
        for test_index in range(number_of_tests):
            words = toefl_file.readline().split()
            try:
                if amount_dictionary.get(words[0]) is not None:
                    for k in range(1, 5):
                        # If no representation was learnt, fall back to a random vector.
                        if np.array_equal(word_space[words[k]], zero_vector):
                            word_space[words[k]] = np.random.randn(dimension)
                    right_answers += get_answer_mod([word_space[words[k]] for k in range(5)])
            except KeyError:
                # No embedding exists for one of the words: report and move on.
                print("skipped test: " + str(test_index) + "; Line: " + str(words))
            except IndexError:
                # Blank/short line or end of file: stop reading tests.
                break
    return 100 * right_answers / number_of_tests

In [69]:
def run_test(dimensionality, window_sizes, mode, test_name="TOEFL_synonyms.txt",
             data_file_name="Glemmatized.txt", threshold=100000, ones_number=5):
    """Run the TOEFL evaluation for every (dimension, window size) combination.

    Index vectors are generated once per dimension and shared by all window
    sizes tested for that dimension.

    Args:
        dimensionality: Iterable of embedding dimensions to evaluate.
        window_sizes: Iterable of window sizes (number of words before/after
            the focus word) to evaluate for each dimension.
        mode: Passed to ``create_embeddings`` (0: context and order,
            1: context only, 2: order only).
        test_name: Path of the TOEFL file.
        data_file_name: Path of the text corpus.
        threshold: Corpus frequency threshold passed to ``populate_dictionaries``.
        ones_number: Number of nonzero elements in each random index vector.

    Returns:
        A list of ``(dimension, window_size, accuracy)`` tuples, in the order
        the combinations were evaluated.
    """
    results = []
    for d in dimensionality:
        dictionary, amount_dictionary, d_word_space = populate_dictionaries(data_file_name, d, threshold, ones_number)
        d_word_space, number_of_tests = process_toefl(test_name, d, d_word_space)
        for ws in window_sizes:
            # Every window size starts from a fresh copy of the zero embeddings.
            ws_word_space = copy.deepcopy(d_word_space)
            print(d, ws, mode)

            ws_word_space = create_embeddings(data_file_name, d, ws, dictionary, ws_word_space, mode)
            acc = compute_accuracies(test_name, d, number_of_tests, amount_dictionary, ws_word_space)
            results.append((d, ws, acc))

            print("Dimensionality of embeddings: " +str(d) +  "; Window Size: " +str(ws) + "; Percentage of correct answers in TOEFL: " + str(acc) + "%")
    return results

In [73]:
def print_results(results):
    """Print one summary line per experiment result.

    Args:
        results: Iterable of indexable records whose first three items are
            the dimension, the window size and the accuracy.
    """
    for entry in results:
        dim, window, accuracy = entry[0], entry[1], entry[2]
        print(f"dim: {dim}, window_size: {window}, accuracy: {accuracy}")

In [74]:
# dimensionality_test
# Sweep the embedding dimensionality at a fixed window size of 3.
dimensionality = [256, 512, 768]
window_sizes = [3]
mode=0 # 0: context and order vectors are both added to the embeddings
print("Accuracy by dimensions")
d_results = run_test(dimensionality, window_sizes, mode)
# No iteration counter is printed here: no notebook-level `i` is ever defined,
# so reading it would raise NameError on a fresh kernel.
print_results(d_results)

Accuracy by dimensions
256 3 0
Dimensionality of embeddings: 256; Window Size: 3; Percentage of correct answers in TOEFL: 55.0%
512 3 0
Dimensionality of embeddings: 512; Window Size: 3; Percentage of correct answers in TOEFL: 53.75%
768 3 0
Dimensionality of embeddings: 768; Window Size: 3; Percentage of correct answers in TOEFL: 57.5%
iter:0
dim: 256, window_size: 3, accuracy: 55.0
dim: 512, window_size: 3, accuracy: 53.75
dim: 768, window_size: 3, accuracy: 57.5


In [75]:
# window_size test
# Sweep the window size at a fixed dimensionality of 256.
dimensionality = [256]
window_sizes = [1, 2, 3, 4]
mode=0 # 0: context and order vectors are both added to the embeddings
ws_results = run_test(dimensionality, window_sizes, mode)
print("Accuracy by window size")
print_results(ws_results)

256 1 0
Dimensionality of embeddings: 256; Window Size: 1; Percentage of correct answers in TOEFL: 48.75%
256 2 0
Dimensionality of embeddings: 256; Window Size: 2; Percentage of correct answers in TOEFL: 52.5%
256 3 0
Dimensionality of embeddings: 256; Window Size: 3; Percentage of correct answers in TOEFL: 52.5%
256 4 0
Dimensionality of embeddings: 256; Window Size: 4; Percentage of correct answers in TOEFL: 53.75%
Accuracy by window size
dim: 256, window_size: 1, accuracy: 48.75
dim: 256, window_size: 2, accuracy: 52.5
dim: 256, window_size: 3, accuracy: 52.5
dim: 256, window_size: 4, accuracy: 53.75


In [76]:
# context_only test
# Same dimensionality sweep as before, but only context vectors are accumulated.
dimensionality = [256, 512, 768]
window_sizes = [3]
mode=1 # 1: only context vectors are added to the embeddings
context_results = run_test(dimensionality, window_sizes, mode)
print("Accuracy with only context embeddings")
print_results(context_results)

256 3 1
Dimensionality of embeddings: 256; Window Size: 3; Percentage of correct answers in TOEFL: 52.5%
512 3 1
Dimensionality of embeddings: 512; Window Size: 3; Percentage of correct answers in TOEFL: 52.5%
768 3 1
Dimensionality of embeddings: 768; Window Size: 3; Percentage of correct answers in TOEFL: 57.5%
Accuracy with only context embeddings
dim: 256, window_size: 3, accuracy: 52.5
dim: 512, window_size: 3, accuracy: 52.5
dim: 768, window_size: 3, accuracy: 57.5


In [77]:
# order_only test
# Same dimensionality sweep as before, but only order vectors are accumulated.
dimensionality = [256, 512, 768]
window_sizes = [3]
mode=2 # 2: only order vectors are added to the embeddings
order_results = run_test(dimensionality, window_sizes, mode)
print("Accuracy with order embeddings")
print_results(order_results)

256 3 2
Dimensionality of embeddings: 256; Window Size: 3; Percentage of correct answers in TOEFL: 50.0%
512 3 2
Dimensionality of embeddings: 512; Window Size: 3; Percentage of correct answers in TOEFL: 53.75%
768 3 2
Dimensionality of embeddings: 768; Window Size: 3; Percentage of correct answers in TOEFL: 58.75%
Accuracy with order embeddings
dim: 256, window_size: 3, accuracy: 50.0
dim: 512, window_size: 3, accuracy: 53.75
dim: 768, window_size: 3, accuracy: 58.75


# Questions

The semantic vectors were embedded using the Random Indexing strategy. 

Only one experiment was run per configuration due to limitations in compute resources.

## Question 1

results:

`dim: 256, window_size: 3, accuracy: 55.0
dim: 512, window_size: 3, accuracy: 53.75
dim: 768, window_size: 3, accuracy: 57.5`

## Question 2a

results:

`dim: 256, 
window_size: 1, accuracy: 48.75
window_size: 2, accuracy: 52.5
window_size: 3, accuracy: 52.5
window_size: 4, accuracy: 53.75`

As the window size increased, there was a general trend for accuracy to also increase. However, there was no increase in accuracy from window size 2 to window size 3. This may be because the words that are at distance 3 from the probed word are often 'frequent' words, which are assigned empty index vectors, and thus no additional information was embedded.

## Question 3

results:

`window_size: 3,
dim: 256, accuracy: 55.0
dim: 512, accuracy: 53.75
dim: 768, accuracy: 57.5`

The number of dimensions does not seem to have a clear effect on the accuracy of the TOEFL synonymy assessment. The accuracy fell from 256 to 512 dimensions and rose again from 512 to 768 dimensions, which can be interpreted as the accuracy staying within an interval despite the changes in dimensionality. This seems to show that, because we are using a sparse encoding of vectors, even a relatively small dimension is sufficient to create accurate word embeddings. However, this may be a flawed result because only one experiment was done for each dimensionality. In that case, it could instead be inferred that increasing the number of dimensions actually increases accuracy and that the decrease in accuracy from 256 to 512 dimensions was a chance fluctuation. This notion is reinforced when looking at the context-only and order-only embedding accuracies, which never decrease, and overall increase, as the number of dimensions increases.

## Question 4

Accuracy by embedding type (normal = context and order combined)

`dim: 256, normal: 55.0, context-only: 52.5, order-only: 50.0
dim: 512, normal: 53.75, context-only: 52.5, order-only: 53.75
dim: 768, normal: 57.5, context-only: 57.5, order-only: 58.75`

Context-only embeddings performed better at 256 dimensions, while order-only embeddings performed better at the higher 512 and 768 dimensions. This may indicate that context embeddings are effective even at smaller dimensionalities, while the effectiveness of order embeddings grows faster than that of context embeddings as the number of dimensions increases.