In [None]:
import numpy as np
import codecs
import math
import string

"""
Using "Dependency Based" dataset from
url: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/
"""

"""
Code for loading bin file is from a blog post
url: https://blog.ekbana.com/loading-glove-pre-trained-word-embedding-model-from-text-file-faster-5d3e8f2b8455
"""
def convert_to_binary(embedding_path):
    """
    Convert a text embedding file into a vocabulary file plus a NumPy array file.

    Reads ``embedding_path + ".txt"``, skips its first line, and for every
    following non-blank line writes the first whitespace-separated token to
    ``embedding_path + ".vocab"`` (one word per line) and collects the remaining
    tokens as floats. The collected vectors are saved to
    ``embedding_path + ".npy"``.

    :param embedding_path: path of the embedding file without its extension.
    :return: None; the ``.vocab`` and ``.npy`` files are written as a side effect.
    :raises FileNotFoundError: if ``embedding_path + ".txt"`` does not exist.
    :raises ValueError: if a vector component cannot be parsed as a float.
    """
    wv = []
    # Both handles are managed by the same ``with`` so the input file is closed
    # too, even when a line fails to parse. The text file is opened first, so a
    # missing input does not leave an empty .vocab file behind.
    with codecs.open(embedding_path + ".txt", 'r', encoding='utf-8') as f, \
            codecs.open(embedding_path + ".vocab", "w", encoding='utf-8') as vocab_write:
        for count, line in enumerate(f):
            if count == 0:
                # the first line is not an embedding row
                continue
            splitlines = line.split()
            if not splitlines:
                # blank line (e.g. trailing newline at end of file): nothing to store
                continue
            vocab_write.write(splitlines[0].strip())
            vocab_write.write("\n")
            wv.append([float(val) for val in splitlines[1:]])
    np.save(embedding_path + ".npy", np.array(wv))
    
def load_embeddings_binary(embeddings_path):
    """
    Load the vocabulary/array pair stored under ``embeddings_path``.

    Reads one word per line from ``embeddings_path + '.vocab'`` and the array
    saved in ``embeddings_path + '.npy'``, then pairs the i-th word with the
    i-th row of that array.
    :param embeddings_path: path of the embedding files without their extension.
    :return: dict mapping each word to its row of the loaded array.
    """
    with codecs.open(embeddings_path + '.vocab', 'r', 'utf-8') as vocab_file:
        # strip the trailing newline (and any surrounding whitespace) of each entry
        words = list(map(str.strip, vocab_file))
    vectors = np.load(embeddings_path + '.npy')
    return {word: vectors[row] for row, word in enumerate(words)}

In [None]:
"""
Function to find the top n most similar words, as measured by cosine similarity
(the dot product divided by the lengths of both vectors)
Written by us :)
"""
def n_similar(inputVec, n, keySpace, model):
    """
    Rank the words in ``keySpace`` by cosine similarity to ``inputVec``.

    :param inputVec: query vector; it is flattened, and must have as many
        elements as the vectors stored in ``model``.
    :param n: number of results to return.
    :param keySpace: iterable of candidate words; each must be a key of ``model``.
    :param model: mapping from word to embedding vector.
    :return: ``(topWord, topDot)`` -- two lists of length ``n`` ordered from the
        most to the least similar candidate. ``topDot`` holds the cosine
        similarities as plain floats. Candidates with a zero-length or NaN
        vector are skipped. When fewer than ``n`` candidates can be scored, the
        remaining slots hold the placeholders ``''`` and ``0``.
    """
    query = np.asarray(inputVec, dtype=float).reshape(-1)
    # The query norm does not depend on the candidate, so it is computed once.
    queryNorm = float(np.sqrt(query @ query))

    scored = []
    # A zero (or NaN) query has no direction: nothing can be ranked against it.
    if queryNorm > 0:
        for key in keySpace:
            candidate = np.asarray(model[key], dtype=float).reshape(-1)
            candidateNorm = float(np.sqrt(candidate @ candidate))
            if candidateNorm == 0:
                continue
            similarity = float(query @ candidate) / (candidateNorm * queryNorm)
            if np.isnan(similarity):
                continue
            scored.append((similarity, key))

    # list.sort is stable, so equally similar words keep their keySpace order.
    # Sorting the real scores (instead of comparing against a starting score of
    # 0) lets candidates with a similarity <= 0 fill the ranking as well.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    best = scored[:n] if n > 0 else []

    padding = max(n - len(best), 0)
    topWord = [key for _, key in best] + [''] * padding
    topDot = [score for score, _ in best] + [0] * padding
    return topWord, topDot

In [None]:
 # Base path (without extension) of the dependency-based word embeddings.
 path = "deps"
 try:
     x = load_embeddings_binary(path)
 except FileNotFoundError:
     # load_embeddings_binary reads path + ".vocab" and path + ".npy", which only
     # exist after convert_to_binary has processed path + ".txt". Build them on
     # first use so that downloading the text file is all the user has to do.
     try:
         convert_to_binary(path)
     except FileNotFoundError as missing:
         raise FileNotFoundError(f'FILE {path}.txt NOT FOUND PLEASE DOWNLOAD DEPENDENCY-BASED [WORDS] DATASET AND EXTRACT IN DIRECTORY FROM https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings') from missing
     x = load_embeddings_binary(path)
 # view of every word that has an embedding
 keys = x.keys()

In [None]:
# Difference between the 'apple' and 'fruit' embeddings. It is deliberately not
# stored under the name `math`: that would rebind the imported math module,
# which other cells use as `math.inf`.
analogyVec = x['apple'] - x['fruit']
vals = keys
# n_similar returns the best-matching words and, despite the name `length`,
# their similarity scores
result, length = n_similar(analogyVec, 5, vals, x)
print(result)
print(length)

In [None]:
# Read the spam training data: skip the header row and keep only the first two
# comma-separated columns, stored as fixed-width 100-byte strings.
csvImport = np.loadtxt(
    'spam.csv',
    dtype='S100',
    delimiter=',',
    usecols=(0, 1),
    skiprows=1,
    encoding='latin-1',
)

In [None]:
maxRow = np.shape(csvImport)[0]
yTrain = np.zeros((maxRow,1))
xTrain = np.zeros((maxRow,300))

# Translation table that deletes every ASCII punctuation character; built once
# and reused for all words.
punctTable = str.maketrans('', '', string.punctuation)


def _rowText(value):
    """Return a csvImport cell as str, decoding byte strings as latin-1."""
    if isinstance(value, bytes):
        # decode the bytes instead of slicing their repr, which would escape
        # non-ASCII bytes and backslashes
        return value.decode('latin-1')
    return str(value)


def _rowWords(text):
    """Split a text on single spaces and strip punctuation from every piece."""
    # str.translate returns a new string, so its result has to be kept
    return [word.translate(punctTable) for word in text.split(' ')]


# Tokenise every text once; the three passes below share the result.
rowWords = [_rowWords(_rowText(csvImport[i,1])) for i in range(0,maxRow)]

# get the label and the average word vector for each text
for i in range(0,maxRow):
    if (_rowText(csvImport[i,0]) == "spam"):
        yTrain[i] = -1
    else:
        yTrain[i] = 1

    vecSum = np.zeros((300,))
    wordCount = 0
    for word in rowWords[i]:
        # only words that are in the embedding vocabulary contribute
        if word in x:
            vecSum = vecSum + x[word]
            wordCount = wordCount + 1
    if (wordCount > 0):
        # average out the words
        xTrain[i,:] = vecSum / wordCount
    # a text without any known word keeps its all-zero row
# xTrain has row vectors of the average word value

# create a new vocab from the texts so the bag isn't too big
bagVocab = []
# word -> column of that word in the bag matrix (constant-time lookup)
bagIndex = {}
for words in rowWords:
    for word in words:
        # if the word is in the embedding vocabulary and not already in our new vocab, put it in the new vocab
        if word in x and word not in bagIndex:
            bagIndex[word] = len(bagVocab)
            bagVocab.append(word)

# create a new matrix for info from bag of words
xTrainBag = np.zeros((maxRow,len(bagVocab)))

# get bag of word count vector
for i, words in enumerate(rowWords):
    for word in words:
        index = bagIndex.get(word)
        if index is not None:
            # update bag count when a word is found
            xTrainBag[i,index] = xTrainBag[i,index] + 1


In [None]:
# Truncated-SVD classifier on the average word vector features (xTrain).
# math is imported in this cell so that `math.inf` below resolves to the module
# even if the name was rebound earlier in the session.
import math

# number of chunks the rows are divided into
parts = 8

# Ranks to try. You can update these parameters if you have extra computation time
svd_parameters = range(9,10)

# one error rate is appended for every evaluated pair of held-out chunks
svd_errors = []

split_X, split_y = np.array_split(xTrain, parts), np.array_split(yTrain, parts)

def calcErrorCount(expected, actual):
    """Count the positions at which ``expected`` and ``actual`` differ."""
    mismatches = expected != actual
    return mismatches.sum()

def calcErrorRate(expected, actual):
    """Return the number of differing positions divided by ``len(actual)``."""
    wrong = calcErrorCount(expected, actual)
    return wrong / len(actual)

# hard coded so the other test is consistent
for errorEstimateIndex in range(0, 1):
    for predictIndex in range(1, 2):
        if errorEstimateIndex == predictIndex:
            continue

        # Stack every chunk except the two held-out ones. np.array_split may
        # return chunks of different lengths, so they are filtered as a plain
        # list instead of going through np.delete, which needs a regular array.
        heldOut = (errorEstimateIndex, predictIndex)
        training_X = np.vstack([chunk for k, chunk in enumerate(split_X) if k not in heldOut])
        training_y = np.vstack([chunk for k, chunk in enumerate(split_y) if k not in heldOut])

        print(np.shape(training_X))

        error_estimation_X = split_X[errorEstimateIndex]
        error_estimation_y = split_y[errorEstimateIndex]

        predict_X = split_X[predictIndex]
        predict_y = split_y[predictIndex]

        # ---- Truncated SVD ----

        # The decomposition does not depend on the rank, so it is computed once
        # and only truncated inside the loop.
        U,s,VT = np.linalg.svd(training_X,full_matrices=False)
        projected_y = U.T @ training_y

        # Estimate w for each choice of the regularization parameter
        svd_w = []
        for param in svd_parameters:
            # w = V_k diag(1/s_k) U_k^T y, using the first `param` singular triplets
            w = VT[0:param,:].T @ (projected_y[0:param,:] / s[0:param,None])
            svd_w.append(w)

        # Select the best value for the regularization parameter
        # by estimating the error on the first holdout set
        leastErrorRateSvd = np.inf
        leastErrorParamSvd = None
        best_w = None

        for param, w in zip(svd_parameters, svd_w):
            estimated_y = np.sign(error_estimation_X @ w)
            errorRate = calcErrorRate(error_estimation_y, estimated_y)

            if errorRate < leastErrorRateSvd:
                leastErrorRateSvd = errorRate
                leastErrorParamSvd = param
                # remember the winning weights; `w` itself is overwritten by
                # every later candidate
                best_w = w

        if best_w is None:
            # no candidate rank was evaluated, so there is nothing to predict with
            continue

        # Use the w corresponding to the best value of the regularization
        # parameter to predict the labels of the remaining holdout set
        predicted_y = np.sign(predict_X @ best_w)
        errorRate = calcErrorRate(predict_y, predicted_y)
        svd_errors.append(errorRate)

print(f'Average SVD Error Rate: {(np.mean(svd_errors) * 100).round(3)}')

In [None]:
# Truncated-SVD classifier on the bag-of-words count features (xTrainBag).

# number of chunks the rows are divided into
parts = 8

# Ranks to try. You can update these parameters if you have extra computation time
# this matrix is a bit large to compute
svd_parameters = range(9,10)

# one error rate is appended for every evaluated pair of held-out chunks
svd_errors = []

split_X, split_y = np.array_split(xTrainBag, parts), np.array_split(yTrain, parts)

def calcErrorCount(expected, actual):
    """Count the positions at which ``expected`` and ``actual`` differ."""
    mismatches = expected != actual
    return mismatches.sum()

def calcErrorRate(expected, actual):
    """Return the number of differing positions divided by ``len(actual)``."""
    wrong = calcErrorCount(expected, actual)
    return wrong / len(actual)

# hardcoded because it takes a while to run
for errorEstimateIndex in range(0, 1):
    for predictIndex in range(1, 2):
        if errorEstimateIndex == predictIndex:
            continue

        # Stack every chunk except the two held-out ones. np.array_split may
        # return chunks of different lengths, so they are filtered as a plain
        # list instead of going through np.delete, which needs a regular array.
        heldOut = (errorEstimateIndex, predictIndex)
        training_X = np.vstack([chunk for k, chunk in enumerate(split_X) if k not in heldOut])
        training_y = np.vstack([chunk for k, chunk in enumerate(split_y) if k not in heldOut])

        print(np.shape(training_X))

        error_estimation_X = split_X[errorEstimateIndex]
        error_estimation_y = split_y[errorEstimateIndex]

        predict_X = split_X[predictIndex]
        predict_y = split_y[predictIndex]

        # ---- Truncated SVD ----

        # The decomposition does not depend on the rank, so it is computed once
        # and only truncated inside the loop.
        U,s,VT = np.linalg.svd(training_X,full_matrices=False)
        projected_y = U.T @ training_y

        # Estimate w for each choice of the regularization parameter
        svd_w = []
        for param in svd_parameters:
            # w = V_k diag(1/s_k) U_k^T y, using the first `param` singular triplets
            w = VT[0:param,:].T @ (projected_y[0:param,:] / s[0:param,None])
            svd_w.append(w)

        # Select the best value for the regularization parameter
        # by estimating the error on the first holdout set
        # (np.inf rather than math.inf: it does not depend on the name `math`
        # still referring to the module when this cell runs)
        leastErrorRateSvd = np.inf
        leastErrorParamSvd = None
        best_w = None

        for param, w in zip(svd_parameters, svd_w):
            estimated_y = np.sign(error_estimation_X @ w)
            errorRate = calcErrorRate(error_estimation_y, estimated_y)

            if errorRate < leastErrorRateSvd:
                leastErrorRateSvd = errorRate
                leastErrorParamSvd = param
                # remember the winning weights; `w` itself is overwritten by
                # every later candidate
                best_w = w

        if best_w is None:
            # no candidate rank was evaluated, so there is nothing to predict with
            continue

        # Use the w corresponding to the best value of the regularization
        # parameter to predict the labels of the remaining holdout set
        predicted_y = np.sign(predict_X @ best_w)
        errorRate = calcErrorRate(predict_y, predicted_y)
        svd_errors.append(errorRate)

print(f'Average SVD Error Rate: {(np.mean(svd_errors) * 100).round(3)}')