In [122]:
import numpy as np
import codecs
import math
import string

"""
Using "Dependency Based" dataset from
url: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/
"""

"""
Code for loading bin file is from a blog post
url: https://blog.ekbana.com/loading-glove-pre-trained-word-embedding-model-from-text-file-faster-5d3e8f2b8455
"""
def convert_to_binary(embedding_path):
    """
    Here, it takes path to embedding text file provided by glove.
    :param embedding_path: takes path of the embedding which is in text format or any format other than binary.
    :return: a binary file of the given embeddings which takes a lot less time to load.
    """
    f = codecs.open(embedding_path + ".txt", 'r', encoding='utf-8')
    wv = []
    with codecs.open(embedding_path + ".vocab", "w", encoding='utf-8') as vocab_write:
        count = 0
        for line in f:
            if count == 0:
                pass
            else:
                splitlines = line.split()
                vocab_write.write(splitlines[0].strip())
                vocab_write.write("\n")
                wv.append([float(val) for val in splitlines[1:]])
            count += 1
    np.save(embedding_path + ".npy", np.array(wv))
    
def load_embeddings_binary(embeddings_path):
    """
    It loads embedding provided by glove which is saved as binary file. Loading of this model is
    about  second faster than that of loading of txt glove file as model.
    :param embeddings_path: path of glove file.
    :return: glove model
    """
    with codecs.open(embeddings_path + '.vocab', 'r', 'utf-8') as f_in:
        index2word = [line.strip() for line in f_in]
    wv = np.load(embeddings_path + '.npy')
    model = {}
    for i, w in enumerate(index2word):
        model[w] = wv[i]
    return model

In [2]:
"""
Function to find top similar words as defined by dot product
Written by us :)
"""
def n_similar(inputVec, n, keySpace, model):
    topWord = []
    topDot = []
    for i in range(0,n):
        topWord.append('')
        topDot.append(0)
    length = inputVec.shape[0]
    for key in keySpace:
        lenKey = (np.reshape(model[key],(1,length)) @ np.reshape(model[key],(length,1))) ** .5
        lenInput = (np.reshape(inputVec,(1,length)) @ np.reshape(inputVec,(length,1))) ** .5
        dot = np.reshape(inputVec,(1,length)) @ np.reshape(model[key],(length,1)) / lenKey / lenInput
        for j in range(0,n):
            if (dot > topDot[n - j - 1]):
                if (j != 0):
                    topWord[n - j] = topWord[n - j - 1]
                    topDot[n - j] = topDot[n - j - 1]
                topWord[n - j - 1] = key
                topDot[n - j - 1] = dot
    return topWord, topDot

In [26]:
 try:
     path = "deps"
     convert_to_binary(path)
     print('converted')
     x = load_embeddings_binary(path)
     keys = x.keys()
 except FileNotFoundError:
     raise Exception(f'FILE {path}.txt NOT FOUND PLEASE DOWNLOAD DEPENDENCY-BASED [WORDS] DATASET AND EXTRACT IN DIRECTORY FROM https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings');

KeyboardInterrupt: 

In [3]:
# note: the path refers to a <path>.txt file, you will need to update your extension
path = "deps"
convert_to_binary(path)
print('converted')
x = load_embeddings_binary(path)
keys = x.keys()

converted


In [25]:
math = x['blackberry'] + x['berry']
vals = keys
result, length = n_similar(math, 5, vals, x)
print(result)
print(length)

['berry', 'blackberry', 'cantaloupe', 'mangosteen', 'apricot']
[array([[0.8527363]]), array([[0.8527363]]), array([[0.77408645]]), array([[0.76526394]]), array([[0.76039084]])]


In [98]:
csvImport = np.loadtxt('spam.csv',delimiter=',', dtype = 'S100',encoding = 'latin-1', usecols = (0,1), skiprows = 1)

In [123]:
maxRow = np.shape(csvImport)[0]
yTrain = np.zeros((maxRow,1))
xTrain = np.zeros((maxRow,300))

# get the average word vector
for i in range(0,maxRow):
    if (str(csvImport[i,0])[2:-1] == "spam"):
        yTrain[i] = -1
    else:
        yTrain[i] = 1
    
    vecSum = np.zeros((300,))
    wordCount = 0

    words = str(csvImport[i,1])[2:-1].split(' ')
    for word in words:
        try:
            word.translate(str.maketrans('', '', string.punctuation))
            wordVec = x[word]
            vecSum = wordVec + vecSum
            wordCount = wordCount + 1
        except:
            pass
    if (wordCount > 0):
        avgWord = vecSum / wordCount
    else:
        avgWord - vecSum
    xTrain[i,:] = np.reshape(avgWord,(1,300))
# xTrain has row vectors of the average word value

bagVocab = []

# get vocab for the bag so it isn't too big
for i in range(0,maxRow):
    words = str(csvImport[i,1])[2:-1].split(' ')
    for word in words:
        word.translate(str.maketrans('', '', string.punctuation))
        if (not word in bagVocab):
            bagVocab.append(word)
            
xTrainBag = np.zeros((maxRow,len(bagVocab)))

# get bag of word count vector
for i in range(0,maxRow):
    words = str(csvImport[i,1])[2:-1].split(' ')
    for word in words:
        word.translate(str.maketrans('', '', string.punctuation))
        try:
            index = bagVocab.index(word)
            xTrainBag[i,index] = xTrainBag[i,index] + 1
        except:
            pass


In [108]:
parts = 8

split_X = np.array_split(xTrain, parts)
split_y = np.array_split(yTrain, parts)

ridge_parameters = [0, 2**-1, 2**0, 2**1, 2**2, 2**3, 2**4]

ridge_errors = []

def calcErrorCount(expected, actual):
    return (expected != actual).sum()

def calcErrorRate(expected, actual):
    return calcErrorCount(expected, actual) / len(actual)

for errorEstimateIndex in range(0, parts):
    for predictIndex in range(0, parts):
        if errorEstimateIndex == predictIndex:
            continue
        
#         print(f'Estimate: {errorEstimateIndex}  Predict: {predictIndex}')
        
        training_X = np.vstack(np.delete(split_X, [errorEstimateIndex, predictIndex], axis=0))
        training_y = np.vstack(np.delete(split_y, [errorEstimateIndex, predictIndex], axis=0))
        
        print(np.shape(training_X))
        
        error_estimation_X = split_X[errorEstimateIndex]
        error_estimation_y = split_y[errorEstimateIndex]
        
        predict_X = split_X[predictIndex]
        predict_y = split_y[predictIndex]
        
        """
        Ridge Regression
        """
        
        # Estimate w for each choice of the regularization parameter
        ridge_w = []
        for param in ridge_parameters:
            U,s,VT = np.linalg.svd(training_X,full_matrices=False)
            s_matrix = np.identity(len(s)) * s;
            V = VT.T

            pseudoInv = np.linalg.inv((V @ s_matrix @ VT) + (np.identity(len(s)) * param))
            
            w = pseudoInv @ V @ s_matrix @ U.T @ training_y
            
            ridge_w.append(w)
        
        # Select the best value for the regularization parameter
        # by estimating the error on the first holdout set
        
        leastErrorRateRidge = math.inf
        leastErrorParamRidge = None
        
        for i in range(len(ridge_w)):
            w = ridge_w[i]
            estimated_y = np.sign(error_estimation_X @ w)
            errorRate = calcErrorRate(error_estimation_y, estimated_y)
            
#             print(f'error for {ridge_parameters[i]} is {errorRate}')
            
            if errorRate < leastErrorRateRidge:
                leastErrorRateRidge = errorRate
                leastErrorParamRidge = ridge_parameters[i]
        
        print(f'[RIDGE] Best param: {leastErrorParamRidge}  error: {leastErrorRateRidge}')
        
        # Use the w corresponding to the best value of the regularization
        # parameter to predict the labels of the remaining holdout set
        
        predicted_y = np.sign(predict_X @ w)
        errorRate = calcErrorRate(predict_y, predicted_y)
        ridge_errors.append(errorRate)

print(f'Average Ridge Error Rate: {(np.mean(ridge_errors) * 100).round(3)}')

(4180, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4180, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4180, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4180, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4180, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4181, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4181, 300)
[RIDGE] Best param: 0  error: 0.14347202295552366
(4180, 300)
[RIDGE] Best param: 0  error: 0.14634146341463414
(4180, 300)
[RIDGE] Best param: 0  error: 0.14634146341463414
(4180, 300)
[RIDGE] Best param: 0  error: 0.14634146341463414
(4180, 300)
[RIDGE] Best param: 0  error: 0.14634146341463414
(4180, 300)
[RIDGE] Best param: 0  error: 0.14634146341463414
(4181, 300)


KeyboardInterrupt: 

In [124]:
parts = 8

split_X = np.array_split(xTrainBag, parts)
split_y = np.array_split(yTrain, parts)

ridge_parameters = [0, 2**-1, 2**0, 2**1, 2**2, 2**3, 2**4]

ridge_errors = []

def calcErrorCount(expected, actual):
    return (expected != actual).sum()

def calcErrorRate(expected, actual):
    return calcErrorCount(expected, actual) / len(actual)

for errorEstimateIndex in range(0, parts):
    for predictIndex in range(0, parts):
        if errorEstimateIndex == predictIndex:
            continue
        
#         print(f'Estimate: {errorEstimateIndex}  Predict: {predictIndex}')
        
        training_X = np.vstack(np.delete(split_X, [errorEstimateIndex, predictIndex], axis=0))
        training_y = np.vstack(np.delete(split_y, [errorEstimateIndex, predictIndex], axis=0))
        
        print(np.shape(training_X))
        
        error_estimation_X = split_X[errorEstimateIndex]
        error_estimation_y = split_y[errorEstimateIndex]
        
        predict_X = split_X[predictIndex]
        predict_y = split_y[predictIndex]
        
        """
        Ridge Regression
        """
        
        # Estimate w for each choice of the regularization parameter
        ridge_w = []
        for param in ridge_parameters:
            U,s,VT = np.linalg.svd(training_X,full_matrices=False)
            s_matrix = np.identity(len(s)) * s;
            V = VT.T

            pseudoInv = np.linalg.inv((V @ s_matrix @ VT) + (np.identity(len(s)) * param))
            
            w = pseudoInv @ V @ s_matrix @ U.T @ training_y
            
            ridge_w.append(w)
        
        # Select the best value for the regularization parameter
        # by estimating the error on the first holdout set
        
        leastErrorRateRidge = math.inf
        leastErrorParamRidge = None
        
        for i in range(len(ridge_w)):
            w = ridge_w[i]
            estimated_y = np.sign(error_estimation_X @ w)
            errorRate = calcErrorRate(error_estimation_y, estimated_y)
            
#             print(f'error for {ridge_parameters[i]} is {errorRate}')
            
            if errorRate < leastErrorRateRidge:
                leastErrorRateRidge = errorRate
                leastErrorParamRidge = ridge_parameters[i]
        
        print(f'[RIDGE] Best param: {leastErrorParamRidge}  error: {leastErrorRateRidge}')
        
        # Use the w corresponding to the best value of the regularization
        # parameter to predict the labels of the remaining holdout set
        
        predicted_y = np.sign(predict_X @ w)
        errorRate = calcErrorRate(predict_y, predicted_y)
        ridge_errors.append(errorRate)

print(f'Average Ridge Error Rate: {(np.mean(ridge_errors) * 100).round(3)}')

(4180, 11706)


ValueError: operands could not be broadcast together with shapes (11706,11706) (4180,4180) 