Loading and Preprocessing Data



In [None]:
# necessary imports

import nltk, math, re
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
import numpy as np
import tensorflow as tf
!pip install tensorly
import tensorly as tl
from scipy import spatial
from scipy.stats import spearmanr
from sklearn.decomposition import NMF

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Collecting tensorly
[?25l  Downloading https://files.pythonhosted.org/packages/54/91/967c3bc6c4601fa9d36043f580c3ad691722ed82a71c57016ecbb48a088c/tensorly-0.4.4.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 3.3MB/s 
Collecting nose
[?25l  Downloading https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl (154kB)
[K     |████████████████████████████████| 163kB 14.2MB/s 
[?25hBuilding wheels for collected packages: tensorly
  Building wheel for tensorly (setup.py) ... [?25l[?25hdone
  Created wheel for tensorly: filename=tensorly-0.4.4-cp36-none-any.whl size=98403 sha256=dbe97a767b8d8b18a76a15843e8bf72c7f1aa3a31c22308f9745f00243f1004c
  Stored in directory: /root/.cache/pip/wheels/83/2a/e7/a8efd4828f2b83227355c943cce62bf404d1eb07ce5f081181
Successfully built tensorly
Installing collected packages: nose, tensorly
Successfully installed nose-1.3.7 tensorly-0.4.4


In [None]:
# Loading Dataset: BNC-Baby

# `fileids` is a regular expression: the dot before "xml" is escaped so that only
# names ending in a literal ".xml" are selected (an unescaped dot matches any character).
bnc = nltk.corpus.reader.bnc.BNCCorpusReader(root='Datasets/BNC_baby/Texts/', fileids=r'.*\.xml')
# c5=True requests the C5 tagset; the tag tuples further down (al_tag, nountags, verbtags) use C5 codes.
sentences = bnc.tagged_sents(c5=True)


In [None]:
# Normalise every tagged token: lower-case and lemmatize the word (element 0),
# keep the remaining elements (the tag) untouched, and store the result as a tuple again.
sents = [
    [
        tuple([lemmatizer.lemmatize(tagged[0].lower())] + list(tagged[1:]))
        for tagged in sentence
    ]
    for sentence in sentences
]

In [None]:
# Extracting data in sentences

al_tag = (
    'AJ0','AJC','AJS','AV0','AVP','AVQ','CJS','DPS','DTQ','EX0','NN0','NN1','NN2','NP0','PNI','PNP','PNQ','PNX',
    'VBB','VBD','VBG','VBI','VBN','VBZ','VDB','VDD','VDI','VDG','VDN','VDZ','VHB','VHD','VHG','VHI','VHN','VHZ',
    'VM0','VVB','VVD','VVG','VVI','VVN','VVZ'
) #all tags that we want to keep

nountags =  ('NN0','NN1','NN2','NP0','PNI','PNP','PNQ','PNX')

verbtags =  ('VBB','VBD','VBG','VBI','VBN','VBZ','VDB','VDD','VDI','VDG','VDN','VDZ','VHB','VHD','VHG','VHI','VHN','VHZ',
    'VM0','VVB','VVD','VVG','VVI','VVN','VVZ')

tokens = 0       # total number of tokens seen (denominator of the PMI probabilities later on)
sentslist = []   # one space-joined string per non-empty sentence
nounslist = []   # every noun occurrence (with repetition), input of the noun frequency table
verbslist = []   # every verb occurrence (with repetition), input of the verb frequency table

for sentence in sents:
    words = []
    for token in sentence:
        word, tag = token[0], token[1]
        # NOTE(review): the `tag in al_tag` filter was disabled in the original cell, so every
        # token is counted and kept in the sentence text; that behaviour is preserved here.
        tokens += 1
        words.append(word)
        # The noun/verb lists must be filled here: the frequency tables computed from them
        # are empty otherwise and the PMI matrix cell cannot index them.
        if tag in nountags:
            nounslist.append(word)
        if tag in verbtags:
            verbslist.append(word)
    # An empty sentence is simply skipped; it must not remove the previously stored sentence.
    if words:
        sentslist.append(' '.join(words))


In [None]:
# Show the token total accumulated in the extraction loop; the PMI cells divide by this value.
print(tokens)

2600407


In [None]:
# Persist the space-joined sentences, one per line, so later sections can reload them.
with open('/TensorBasedFactorizationModelUtilities/listofBNCbabysentences.txt', 'w') as f:
    f.writelines("%s\n" % sentence for sentence in sentslist)

In [None]:
# calculating frequencies of nouns and verbs

def _pairs_by_descending_count(words):
    """Return (word, count) pairs for `words`, most frequent first."""
    counts = FreqDist(words)
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

nounfreq = _pairs_by_descending_count(nounslist)
verbfreq = _pairs_by_descending_count(verbslist)

In [None]:
# Write each frequency table to its own file, one "(word, count)" tuple repr per line.
for path, freq_pairs in (
    ('/TensorBasedFactorizationModelUtilities/Nounsfrequency_BNCbaby.txt', nounfreq),
    ('/TensorBasedFactorizationModelUtilities/Verbsfreq_BNCbaby.txt', verbfreq),
):
    with open(path, 'w') as f:
        f.writelines("%s\n" % str(pair) for pair in freq_pairs)

In [None]:
# creating a dictionary of context words for all nouns

# contextwordsdict maps noun -> {context word -> number of co-occurrences in the same sentence}.
contextwordsdict = {}
for sentence in sents:
    for j, noun_token in enumerate(sentence):
        if noun_token[1] not in nountags:
            continue
        noun_contexts = contextwordsdict.setdefault(noun_token[0], {})
        for k, context_token in enumerate(sentence):
            # Every other token of the sentence whose tag is in al_tag counts as context.
            if k != j and context_token[1] in al_tag:
                context = context_token[0]
                # dict.get supplies the initial 0; no exception handling is needed to start a counter.
                noun_contexts[context] = noun_contexts.get(context, 0) + 1
            

In [None]:
import json

# Serialise the noun -> {context word -> count} mapping as JSON.
# The module name `json` is not rebound, so this cell can be re-run, and the
# `with` block closes the file even if serialisation fails.
with open("/TensorBasedFactorizationModelUtilities/Nouns-Contextwordsdict.json", "w") as f:
    json.dump(contextwordsdict, f)

In [None]:
# calculating frequency of context words

# Sum the co-occurrence counts of each context word over all nouns directly, instead of
# materialising a list with one entry per co-occurrence and counting it again.
# Keys are inserted in first-seen order, so ties sort the same way as before.
contextcounts = {}
for noun_contexts in contextwordsdict.values():
    for context, occurrences in noun_contexts.items():
        contextcounts[context] = contextcounts.get(context, 0) + occurrences
contextfreq = sorted(contextcounts.items(), key=lambda k: k[1], reverse=True)

Calculating Latent Factors of Nouns using NMF


In [None]:
# Generating the Nouns-Context Words Matrix W considering the 1000 most frequent nouns and 3000 most frequent context words, weighted using PMI (negative values clipped to 0)

W = np.zeros([1000, 3000])
for i in range(1000):
    noun = nounfreq[i][0]
    p1 = float(nounfreq[i][1])/tokens
    # Nouns without any recorded context keep an all-zero row.
    noun_contexts = contextwordsdict.get(noun, {})
    for j in range(3000):
        p2 = float(contextfreq[j][1])/tokens
        joint_count = noun_contexts.get(contextfreq[j][0])
        if not joint_count:
            continue # pair never co-occurs: PMI is undefined, the cell stays 0
        p_joint = float(joint_count)/tokens
        p = p_joint/(p1*p2)
        W[i][j] = max(math.log(p,2), 0.0)


In [None]:
# Persist the PMI-weighted noun x context-word matrix built in the previous cell.
np.save('/TensorBasedFactorizationModelUtilities/Nouns-ContextWordsMatrix',W)

In [None]:
# NMF

# Factorise the noun x context-word matrix into 300 components
# (multiplicative-update solver, Kullback-Leibler loss, fixed random_state).
model = NMF(n_components=300, init='random', random_state=0, beta_loss='kullback-leibler', solver='mu', max_iter=1000)
# NOTE(review): W is rebound from the input matrix to the transformed output, so re-running
# this cell without re-running the matrix cell factorises the previous factors instead.
W = model.fit_transform(W) # Noun Latent Factors
# H is not referenced by any later cell visible in this notebook.
H = model.components_

In [None]:
# Persist the noun latent factors; the decomposition section reloads them from this file (.npy).
np.save('/TensorBasedFactorizationModelUtilities/NounLatentFactorsBNCBaby',W)

Decomposition of subject-verb-object tensor using noun latent factors to obtain a core tensor of verbs that models semantic compositionality

In [None]:
import ast

# Reload the 1000 most frequent nouns as (word, count) tuples.
# Each line of the file is the repr of a (word, count) tuple, so it is parsed with
# ast.literal_eval; splitting on ',' breaks for words that contain a comma or escapes.
nounfreq = []
with open('/TensorBasedFactorizationModelUtilities/Nounsfrequency_BNCbaby.txt', 'r') as f:
    for line in f:
        if len(nounfreq) == 1000:
            break
        line = line.strip()
        if not line:
            continue # tolerate blank lines
        word, freq = ast.literal_eval(line)
        nounfreq.append((str(word), int(freq)))

In [None]:
import ast

# Reload the 1000 most frequent verbs as (word, count) tuples.
# Each line of the file is the repr of a (word, count) tuple, so it is parsed with
# ast.literal_eval; splitting on ',' breaks for words that contain a comma or escapes.
verbfreq = []
with open('/TensorBasedFactorizationModelUtilities/Verbsfreq_BNCbaby.txt', 'r') as f:
    for line in f:
        if len(verbfreq) == 1000:
            break
        line = line.strip()
        if not line:
            continue # tolerate blank lines
        word, freq = ast.literal_eval(line)
        verbfreq.append((str(word), int(freq)))

In [None]:
# Reload the stored sentences; the last character of every line (the newline
# written when the file was created) is dropped.
with open('/TensorBasedFactorizationModelUtilities/listofBNCbabysentences.txt', 'r') as f:
    sentslist = [stored_line[:-1] for stored_line in f]

In [None]:
# Reload the noun latent factors saved at the end of the NMF section.
W = np.load('/TensorBasedFactorizationModelUtilities/NounLatentFactorsBNCBaby.npy')

In [None]:
# Dense tensor filled below as X[i][j][k] with i indexing verbfreq and j, k indexing nounfreq.
# NOTE(review): 1000**3 entries at NumPy's default float64 is about 8 GB — confirm the runtime has that much memory.
X = np.zeros((1000,1000,1000))

In [None]:
def count(a,b,c,sentslist):
    """Count the sentences in which the tokens a, b and c occur in that order.

    Parameters
    ----------
    a, b, c : str
        Words to look for (used here as subject, verb and object).
    sentslist : iterable of str
        Sentences whose tokens are separated by whitespace.

    Returns
    -------
    int
        Number of sentences containing `a`, later `b`, later `c`, each as a
        whole whitespace-delimited token. Other tokens may appear in between.
    """
    def _whole_token(word):
        # re.escape keeps regex metacharacters in corpus tokens (e.g. "c++", "?") literal;
        # the look-arounds stop "he" from matching inside "the".
        return r'(?<!\S)' + re.escape(word) + r'(?!\S)'

    # Compile once per call instead of rebuilding the pattern for every sentence.
    pattern = re.compile(_whole_token(a) + r'.*?' + _whole_token(b) + r'.*?' + _whole_token(c))
    return sum(1 for sentence in sentslist if pattern.search(sentence) is not None)

In [None]:
# Creating SVO tensor considering 1000 most frequent nouns (subjects and objects) and 1000 most frequent verbs, weighted using PMI (negative values clipped to 0)
# NOTE(review): this performs 1000**3 calls to count(), each scanning every sentence; confirm this is
# feasible on the target machine or precompute the triple counts in a single pass over the sentences.
# `tokens` is the total computed in the preprocessing section and must still be in the kernel.

for i in range(1000):
    verb = verbfreq[i][0]
    p1 = float(verbfreq[i][1])/tokens
    for j in range(1000):
        subject = nounfreq[j][0]
        p2 = float(nounfreq[j][1])/tokens
        for k in range(1000):
            p3 = float(nounfreq[k][1])/tokens
            joint_count = count(subject,verb,nounfreq[k][0],sentslist)
            if joint_count == 0:
                continue # log(0) is undefined: the entry keeps its current value (0 in a fresh tensor)
            p_joint = float(joint_count)/tokens
            p = p_joint/(p1*p2*p3)
            X[i][j][k] = max(math.log(p,2), 0.0)

In [None]:
# Persist the PMI-weighted subject-verb-object tensor.
np.save('/TensorBasedFactorizationModelUtilities/SVOTensor_BNCBaby', X)

In [None]:
# NOTE(review): X_tensor is not referenced by any later cell visible in this notebook
# (the decomposition below works on the NumPy array X) — confirm whether this conversion is still needed.
X_tensor = tf.convert_to_tensor(X)

In [None]:
# Tensor Decomposition

# Project the object mode (axis 2) and then the subject mode (axis 1) of X onto the noun
# latent factors. The second product must use the intermediate `temp`; the original
# referenced an undefined name `Y`.
temp = tl.tenalg.mode_dot(X, W.T, 2)
G = tl.tenalg.mode_dot(temp, W.T, 1) #core tensor

In [None]:
# Persist the core tensor G computed by the decomposition cell.
np.save('/TensorBasedFactorizationModelUtilities/CoreTensor_BNCBaby',G)

Evaluation

In [None]:
# Finding the composition matrix of verb v considering subject and object in context

def svocompositioncontextual(a,b,c):
    """Compose verb `b` with subject `a` and object `c`.

    Looks up `a` and `c` in `nounfreq` and `b` in `verbfreq`, then returns the
    element-wise product of the verb's slice of the core tensor `G` with the
    outer product of the subject and object rows of `W`.

    Returns -1 when the subject, the verb or the object is not found.
    """
    subject_idx = -1
    object_idx = -1
    for idx, entry in enumerate(nounfreq):
        noun = entry[0]
        if noun == a:
            subject_idx = idx
        if noun == c:
            object_idx = idx
        if subject_idx != -1 and object_idx != -1:
            break

    # Position of the first entry of verbfreq whose word equals b, or -1 if there is none
    verb_idx = next((idx for idx, entry in enumerate(verbfreq) if entry[0] == b), -1)

    if -1 in (subject_idx, verb_idx, object_idx):
        return -1 # if verb or subject or object not available in our training dataset

    subject_object = np.outer(W[subject_idx], W[object_idx]) # vector outer product
    verb_slice = G[verb_idx]
    return np.multiply(verb_slice, subject_object) # Hadamard product

In [None]:
# finding the composition matrix for a verb without considering context

def svocompositionnoncontextual(b):
    """Return the slice of the core tensor `G` that belongs to verb `b`.

    The verb is looked up in `verbfreq`; -1 is returned when it is not found.
    """
    # Position of the first entry of verbfreq whose word equals b, or -1 if there is none
    verb_idx = next((idx for idx, entry in enumerate(verbfreq) if entry[0] == b), -1)
    if verb_idx == -1:
        return -1 # if verb not present in our training dataset
    return G[verb_idx] # slice of core tensor

In [None]:
# Calculating similarity between two matrices

def similarity(A,B):
    """Cosine similarity between the vectorised forms of two matrices.

    Each matrix is reduced to a vector by multiplying it with a constant column
    whose entries are 1/sqrt(number of columns), i.e. a scaled row sum.

    Parameters
    ----------
    A, B : array-like, 2-D
        Matrices with the same number of rows. The column count is read from
        each input instead of being fixed at 300.

    Returns
    -------
    float
        1 - cosine distance between the two vectors.
    """
    def _vectorize(M):
        M = np.asarray(M, dtype=float)
        weights = np.full(M.shape[1], 1.0/math.sqrt(M.shape[1]))
        # The result is 1-D: scipy's cosine() is defined for 1-D vectors, and recent
        # SciPy releases reject the (n, 1) column arrays the original passed in.
        return np.dot(M, weights)

    a = _vectorize(A)
    b = _vectorize(B)
    return (1 - spatial.distance.cosine(a,b)) # cosine similarity

In [None]:
# Evaluation

ranks = []
scores = []

with open('/Datasets/GS2011/GS2011data.txt', 'r') as fp: # Test Dataset
    next(fp, None) # the first line is skipped (presumably a header — confirm against the data file)
    for line in fp:
        l = line.split()
        if len(l) < 6:
            continue # blank or incomplete line
        A = svocompositioncontextual(l[2],l[1],l[3]) # contextual target word
        # The composition helpers return the int -1 on a miss and an array otherwise; testing the
        # type avoids comparing an array with -1. Because the loop iterates over the file,
        # `continue` always moves on to the next line.
        if isinstance(A, int):
            continue # ignoring if the test case does not exist in training data
        B = svocompositionnoncontextual(l[4]) # non contextual landmark verb
        if isinstance(B, int):
            continue
        scores.append(similarity(A,B))
        # Sixth whitespace-separated field of the line, not the sixth character of the raw line.
        ranks.append(int(l[5]))

In [None]:
# Spearman correlation coefficient

# Rank correlation between the values read from the test file (ranks) and the model similarities (scores).
rho = spearmanr(ranks,scores)
print(rho)

Predicting most suitable verb replacement for a given svo triple from the verbs in the training data

In [None]:
# Verb Replacement Prediction

# Example query — replace with the (subject, verb, object) triple of interest.
# The original cell left these three assignments empty, which is a syntax error.
s = 'man'
v = 'see'
o = 'house'

# Compute the contextual composition once instead of once per candidate verb.
composition = svocompositioncontextual(s,v,o)
# A miss is signalled by the int -1; a hit is an array, which cannot be compared with -1 inside an `if`.
if isinstance(composition, int):
    print("SVO not in dataset!")
else:
    verb = verbfreq[0][0]
    maximum = similarity(composition,G[0]) # finding the verb for which the matrix similarity is maximum
    # Only indices present in both verbfreq and G are candidates.
    for i in range(1,min(len(verbfreq), len(G))):
        a = similarity(composition,G[i])
        if (a > maximum):
            maximum = a
            verb = verbfreq[i][0]
    print("Most Suitable Verb Replacement is - " + verb)

Baseline Evaluation

In [None]:
# Our baseline is when we ignore the context of our target as well as landmark verbs

ranks = []
scores = []

with open('/Datasets/GS2011/GS2011data.txt', 'r') as fp:
    next(fp, None) # the first line is skipped (presumably a header — confirm against the data file)
    for line in fp:
        l = line.split()
        if len(l) < 6:
            continue # blank or incomplete line
        A = svocompositionnoncontextual(l[1]) # non contextual target word
        # The helper returns the int -1 on a miss and an array otherwise; testing the type avoids
        # comparing an array with -1. Because the loop iterates over the file, `continue`
        # always moves on to the next line.
        if isinstance(A, int):
            continue # ignoring if the target verb does not exist in training data
        B = svocompositionnoncontextual(l[4]) # non contextual landmark verb
        if isinstance(B, int):
            continue
        scores.append(similarity(A,B))
        # Sixth whitespace-separated field of the line, not the sixth character of the raw line.
        ranks.append(int(l[5]))

In [None]:
# Spearman correlation coefficient

# Rank correlation between the values read from the test file (ranks) and the baseline similarities (scores).
rho = spearmanr(ranks,scores)
print(rho)