In [389]:
import re
import math
import random
import time
import corenlp
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import codecs

import inspect # for logger


In [2]:
dimensionality=2000
denseness=10//dimensionality
indexspace = {}
globalfrequency = {}
bign = 0

In [373]:
def logger(msg,level=False):
    if level:
        print(inspect.stack()[1][3],"(): ",msg,sep="")

In [719]:
def sparseadd(onevec,othvec,weight=1,normalised=False):
    if normalised:
        onevec = normalise(onevec)
        othvec = normalise(othvec)        
    result={}
    try:
        for l in onevec:
            result[l] = onevec[l]
        for k in othvec:
            if k in result:
                result[k] = result[k]+othvec[k]*float(weight)
            else:
                result[k] = othvec[k]*float(weight)
    except:
        print("sparseadd(): error")
        raise
    return result
def sparsemultiply(onevec,othvec,weight=1):
    result={}
    try:
        for l in onevec:
            if l in othvec:
                result[l] = onevec[l]*othvec[l]*float(weight)
    except:
        print("sparsemultiply(): error ")
    return result
def sparsexor(onevec,othvec):
    result={}
    try:
        for l in range(len(onevec)):
            if ((l in onevec) and not (l in othvec)):
                result[l] = 1
            if (not (l in onevec) and (l in othvec)):
                result[l] = 1        
    except:
        print("sparsexor(): error ")
    return result

def newrandomvector(n,denseness):
    vec = {}
    k = int(n * denseness)
    if k % 2 != 0:
        k += 1
    if (k > 0):# no need to be careful about this, right? and k % 2 == 0):
        nonzeros = random.sample(list(range(n)),k)
        negatives = random.sample(nonzeros,k//2)
        for i in nonzeros:
            vec[str(i)] = 1;
        for i in negatives:
            vec[str(i)] = -1;
    return vec

def newoperator(n):
    k = 0.1
    return newrandomvector(n,k)

def sparsecosine(xvec,yvec,rounding=True,decimals=4):
    x2 = 0
    y2 = 0
    xy = 0
    try:
        for i in xvec:
            x2 += xvec[i]*xvec[i]
    except KeyError:
        print("sparsecosine(): error at position ",i)
    try:
        for j in yvec:
            y2 += yvec[j]*yvec[j]
            if j in xvec:
                xy += xvec[j]*yvec[j]
    except:
        print("sparsecosine(): errors at position ",j)
    if (x2*y2 == 0):
        cos = 0
    else:
        cos = xy/(math.sqrt(x2)*math.sqrt(y2))
    if (rounding):
        cos=round(cos,decimals)
    return cos

def sparselength(vec,rounding=True):
    x2 = 0
    length=0
    try:
        for i in vec:
            x2 += vec[i]*vec[i]
    except KeyError:
        print("sparselength(): error at position ",i)
    if (x2 > 0):
        length = math.sqrt(x2)
    if (rounding):
        length=round(length,4)
    return length

def comb(vec,k=0.1):
    newvector={}
    n=int(k*dimensionality/2)
    sorted_items=sorted(vec.items(), key=lambda x:x[1])
    bot=sorted_items[:n]
    top=sorted_items[-n:]
    for l in bot:
        newvector[l[0]]=l[1]
    for l in top:
        newvector[l[0]]=l[1]
    return newvector

def sparsesum(vec):
    s=0
    for i in vec:
        s += float(vec[i])
    return s

def normalise(vec):
    newvector={}
    vlen=sparselength(vec,False)
    if (vlen > 0):
        for i in vec:
            newvector[i]=vec[i]/math.sqrt(vlen*vlen)
    else:
        newvector=vec
    return newvector

def modify(vec,factor):
    newvector={}
    for i in vec:
        if (random.random() > factor):
            newvector[i]=vec[i]
        else:
            newvector[i]=float(vec[i])*(0.5-random.random())*2.0
    return newvector

def createpermutation(k):
    permutation=random.sample(range(k), k)
    return permutation
    
def permute(vector,permutation):
    newvector={}
    try:
        for i in range(len(permutation)):
            if str(i) in vector:
                newvector[str(permutation[i])]=vector[str(i)]
    except:
        newvector=vector
        print("permute(): no permutation done, something wrong")
    return newvector

def vectorsaturation(vector):
    d = 0
    for c in vector:
        d += 1
    return d

def frequencyweight(word):
    try:
        w = math.exp(-300*math.pi* globalfrequency[word] / bign)
    except KeyError:
        w = 0.5
    return w

In [305]:
def chkwordspace(words,debug=False):
    global globalfrequency
    global indexspace
    global bign
    i = 0
    for w in words:
        bign += 1
        if w in indexspace:
            globalfrequency[w] += 1
        else:
            indexspace[w] = newrandomvector(dimensionality,denseness)
            if debug:
                print("chkwordspace(): ",w," is new and is now hallucinated.")
        globalfrequency[w] = 1
chkwordspace(["epsilon"])

In [152]:
class clause:
    cleanuppattern=re.compile(r'[\.\'\!\*\?\+,;\:\-\/]+')
    def __init__(self,string):
        self.surfacestring=string
        self.cleanedutterance=re.sub(clause.cleanuppattern,"",string)
        self.tokens=self.cleanedutterance.lower().split()
        self.agent=None
        self.event=None
        self.patient=None
        self.instrument=None
        self.location=None
        self.manner=None
        self.wps = 0.0
        self.cpw = 0.0
    def __str__(self):
        return self.surfacestring
    
class referent:
    def __init__(self, string):
        self.surfacestring=string
        self.definite=True
        self.number=1
        
class event:
    def __init__(self, string):
        self.surfacestring=string    
        self.negated=False    
        self.adverbial=None    
        self.tense=None #past present or future
        self.aspect=None #ongoing perfect or pointwise
        self.mood=None #indicative, irreal, potential, optative, not, imperative

In [578]:
#from nltk.parse.stanford import StanfordDependencyParser
#path_to_stanford_corenlp = '/usr/share/stanford-corenlp-full/'
#parser_jar='stanford-corenlp.jar'
#path_to_models = path_to_stanford_corenlp+'models/'
#model_jar='stanford-english-corenlp-models.jar'
#dependency_parser = StanfordDependencyParser(path_to_jar=path_to_stanford_corenlp+parser_jar, path_to_models_jar=path_to_models+model_jar)

parser_client = corenlp.CoreNLPClient(annotators="tokenize ssplit pos natlog lemma depparse".split())
    
    
def semanticdepparse(string,debug=False,verbose=False):
    depgraph=parser_client.annotate(string)
    utterances=[]
    for ss in depgraph.sentence:
        for w in ss.token:
            if w.lemma not in indexspace:
                chkwordspace([w.lemma])
        utterances.append(semanticdepparseprocess(string,ss,debug,verbose))
    return utterances
        
def semanticdepparseprocess(string,ss,debug=False,verbose=False):
    utterance=clause(string)
    utterance.agent=referent("epsilon")
    utterance.patient=referent("epsilon")
    utterance.event=event("epsilon")
    scratch={}
    negated=False
    adverbial=None
    if verbose:
        print(ss)
    if debug:        
        i=1
        print("root:",ss.basicDependencies.root)
        for w in ss.token:
            print(i," ",w.lemma," ",w.pos)
            i += 1
        for e in ss.basicDependencies.edge:
            print(e.source,ss.token[e.source-1].lemma,"-",e.dep,"->",e.target,ss.token[e.target-1].lemma) 
    i=1
    cpw = 0
    sentenceitems={}
    sentencepos={}
    for w in ss.token:
        sentenceitems[i] = w.lemma
        sentencepos[i] = w.pos
        scratch[i] = False
        i += 1
        cpw += len(w.originalText)
    utterance.wps = i
    if i > 0: 
        utterance.cpw = cpw/i
    root = ss.basicDependencies.root[0] #only one root for now fix this!
    utterance.event=event(sentenceitems[root])  
    utterance.event.tense="PRESENT"
    if sentencepos[root] == "VBD":
        utterance.event.tense="PAST" 
    if sentencepos[root] == "VBN":
        utterance.event.tense="PAST" 
    for edge in ss.basicDependencies.edge:
        if debug:
            print(edge.source,sentenceitems[edge.source],"-",edge.dep,"->",edge.target,sentenceitems[edge.target])
        if edge.dep == 'neg':
            negated=True
        elif edge.dep == 'nsubj':
            utterance.agent=referent(sentenceitems[edge.target])
            if edge.target in scratch: 
                if scratch[edge.target]=="def":
                    utterance.agent.definite=True
                elif scratch[edge.target]=="indef":
                    utterance.agent.definite=False
        elif edge.dep == 'dobj':
                utterance.patient=referent(sentenceitems[edge.target])
                if edge.target in scratch: 
                    if scratch[edge.target]=="def":
                        utterance.patient.definite=True
                    elif scratch[edge.target]=="indef":
                        utterance.patient.definite=False
        elif edge.dep == 'advmod':
                adverbial=sentenceitems[edge.target]            
        elif edge.dep == 'det':
                if (sentenceitems[edge.target]=="a" or sentenceitems[edge.target]=="an"):
                    scratch[edge.source]="indef"
                elif (sentenceitems[edge.target]=="the"):
                    scratch[edge.target]="def"
        elif edge.dep == 'nmod:poss':
                scratch[edge.target]="def"
        elif edge.dep == 'aux':
                if (sentenceitems[edge.target]=="have"):
                    scratch['aux']="have"
                if (sentenceitems[edge.target]=="do"):
                    scratch['aux']="do"
                    if (sentencepos[edge.target] == "VBD"):
                        utterance.event.tense = "PAST"
                if (sentenceitems[edge.target]=="will"):
                    scratch['aux']="will"
                if (sentenceitems[edge.target]=="shall"):
                    scratch['aux']="shall"
    try:
        if debug: 
            print(sentenceitems[root]," ",sentencepos[root])
        if sentencepos[root] == "VB" and ('aux' in scratch and (scratch['aux'] == "will" or scratch['aux'] == "shall")):
            utterance.event.tense="FUTURE"
    except:
        print("semanticdepparse(): tense situation in "+string)
    try:
        utterance.event.adverbial=adverbial
    except:
        print("semanticdepparse(): adverbial blowout in "+string)
    try:
        utterance.event.negated=negated
    except:
        print("semanticdepparse(): negation mismatch in "+string)
    return utterance


def qudepparse(string,debug=False,verbose=False):
    depgraph=parser_client.annotate(string)
    utterances=[]
    for ss in depgraph.sentence:
        for w in ss.token:
            if w.lemma not in indexspace:
                chkwordspace([w.lemma])
        utterances.append(qudepparseprocess(string,ss,debug))
    return utterances
        
def qudepparseprocess(string,ss,debug=False):
    negated=False
    target = "epsilon"
    adverbial="epsilon"
    subject="epsilon"
    verb="epsilon"
    qu="epsilon"
    scratch = {}
    question = {}
    if debug:        
        i=1
        print("root:",ss.basicDependencies.root)
        for w in ss.token:
            print(i," ",w.lemma," ",w.pos)
            i += 1
        for e in ss.basicDependencies.edge:
            print(e.source,ss.token[e.source-1].lemma,"-",e.dep,"->",e.target,ss.token[e.target-1].lemma) 

    sentenceitems={}
    sentenceitems["epsilon"] = None
    sentencepos={}
    root = ss.basicDependencies.root[0] #only one root for now fix this!
    qu = root
    target = root
    verb = root
    i=1
    for w in ss.token:
        sentenceitems[i] = w.lemma
        sentencepos[i] = w.pos
        scratch[i] = False
        if w.pos == "WP":
            qu = i
        if w.pos == "WRB":
            qu = i
        i += 1
#        if w.pos == "VBD":
#            verb = i
#        if w.pos == "VBN":
#            verb = i
#        if w.pos == "VBZ":
#            verb = i
#        if w.pos == "VBP":
#            verb = i            
    tense="PRESENT"
    if sentencepos[root] == "VBD":
        tense="PAST" 
    if sentencepos[root] == "VBN":
        tense="PAST" 
    for edge in ss.basicDependencies.edge:
        if debug:
            print(edge.source,sentenceitems[edge.source],"-",edge.dep,"->",edge.target,sentenceitems[edge.target])
        if edge.dep == 'nsubj':
            subject= edge.target
        elif edge.dep == 'neg':
            negated=True
        elif edge.dep == 'advmod':
            if edge.target == qu:
                if edge.source == root:
                    target = "epsilon"
                else: 
                    target = edge.source
            else:
                adverbial = edge.target
        elif edge.dep == 'cop':
            if edge.target == qu:
                target = edge.source
            else:
                adverbial = edge.target
        elif edge.dep == 'aux':
            if (sentenceitems[edge.target]=="have"):
                scratch['aux']="have"
            if (sentenceitems[edge.target]=="do"):
                scratch['aux']="do"
            if (sentencepos[edge.target] == "VBD"):
                utterance.event.tense = "PAST"
            if (sentenceitems[edge.target]=="will"):
                scratch['aux']="will"
            if (sentenceitems[edge.target]=="shall"):
                scratch['aux']="shall"
    if target == "epsilon":
        if subject != "epsilon":
            target = subject
    try:
        if debug: 
            print(sentenceitems[root]," ",sentencepos[root])
        if sentencepos[root] == "VB" and ('aux' in scratch and (scratch['aux'] == "will" or scratch['aux'] == "shall")):
            tense="FUTURE"
    except:
        logger("tense situation in "+string,True)
    question["question"] = sentenceitems[qu]
    question["target"] = sentenceitems[target]
    question["verb"] = sentenceitems[verb]
    question["adverbial"] = sentenceitems[adverbial]
    question["subject"] = sentenceitems[subject]
    question["tense"] = tense
    question["negated"] = negated
    if debug:
        print(question["question"],question["target"],question["verb"],question["adverbial"],question["subject"],question["tense"],question["negated"],sep="\t")
    return question


In [580]:
a=qudepparse("How long is the river Peugeot",False)

In [571]:
list(a[0].values())

['how', 'be', 'Peugeot', False, 'PRESENT', None, 'long']

In [431]:
def initconstantsandpermutations(dimensionality=2000,densenessforabstracts=0.20):

    global definitepermutation
    global indefinitepermutation
    global futurepermutation
    global pasttensepermutation
    global presenttensepermutation
    global agentpermutation
    global patientpermutation
    global eventpermutation
    global instrumentpermutation
    global locationpermutation
    global mannerpermutation
    global negationpermutation
    global adverbialpermutation

    global previouspermutation

    global tensepermutation

    global agentabstract
    global patientabstract
    global eventabstract
    global instrumentabstract
    global locationabstract
    global mannerabstract
    global negationabstract

    global presentabstract
    global pastabstract
    global progressiveabstract
    global futureabstract
    
    global tagpermutation

    definitepermutation=createpermutation(dimensionality)
    indefinitepermutation=createpermutation(dimensionality)
    futurepermutation=createpermutation(dimensionality)
    pasttensepermutation=createpermutation(dimensionality)
    presenttensepermutation=createpermutation(dimensionality)
    agentpermutation=createpermutation(dimensionality)
    patientpermutation=createpermutation(dimensionality)
    eventpermutation=createpermutation(dimensionality)
    instrumentpermutation=createpermutation(dimensionality)
    locationpermutation=createpermutation(dimensionality)
    mannerpermutation=createpermutation(dimensionality)
    negationpermutation=createpermutation(dimensionality)
    adverbialpermutation=createpermutation(dimensionality)
    #sequence
    previouspermutation=createpermutation(dimensionality)

    tensepermutation=createpermutation(dimensionality)

    agentabstract=newrandomvector(dimensionality,densenessforabstracts)
    patientabstract=newrandomvector(dimensionality,densenessforabstracts)
    eventabstract=newrandomvector(dimensionality,densenessforabstracts)
    instrumentabstract=newrandomvector(dimensionality,densenessforabstracts)
    locationabstract=newrandomvector(dimensionality,densenessforabstracts)
    mannerabstract=newrandomvector(dimensionality,densenessforabstracts)
    negationabstract=newrandomvector(dimensionality,densenessforabstracts)

    presentabstract=newrandomvector(dimensionality,densenessforabstracts)
    pastabstract=newrandomvector(dimensionality,densenessforabstracts)
    progressiveabstract=newrandomvector(dimensionality,densenessforabstracts)
    futureabstract=newrandomvector(dimensionality,densenessforabstracts)
    
    tagpermutation=createpermutation(dimensionality)


In [391]:
def utterancevector(utterance,debug=False,lexical=True,morphology=True,semanticroles=True,constructional=True):
    baremorphology=morphology
    combinedmorphology=lexical
    uttvec={}
    momvec = {}
    if debug:
        debugvec={}
    if lexical:     # add in referent, using its context vector meaning more or less the concept it refers to
        if (utterance.patient): # took out comb
            if debug:
                momvec = uttvec
            uttvec=sparseadd(uttvec, indexspace[utterance.patient.surfacestring],frequencyweight(utterance.patient.surfacestring)) #, math.log( bign / globalfrequency[utterance.patient.surfacestring]))
            logger("adding "+utterance.patient.surfacestring,debug)
            logger("shift "+str(sparsecosine(momvec,uttvec)),debug)
        if (utterance.event):
            if debug:
                momvec = uttvec
            uttvec=sparseadd(uttvec,indexspace[utterance.event.surfacestring],frequencyweight(utterance.event.surfacestring)) #, math.log( bign / globalfrequency[utterance.event.surfacestring]))
            logger("adding "+utterance.event.surfacestring,debug)
            logger("shift "+str(sparsecosine(momvec,uttvec)),debug)
        if (utterance.agent):
            if debug:
                momvec = uttvec
            uttvec=sparseadd(uttvec,indexspace[utterance.agent.surfacestring],frequencyweight(utterance.agent.surfacestring)) #, math.log( bign / globalfrequency[utterance.agent.surfacestring]))
            logger("adding "+utterance.agent.surfacestring,debug)
            logger("shift "+str(sparsecosine(momvec,uttvec)),debug)
    if debug:
        print("utterancevec() lexicon: ",sparsecosine(uttvec,debugvec))
        debugvec=uttvec
    if morphology:     # add in morphological information about the roles of the referent
        morphvec={}
        if (utterance.agent):
            if (utterance.agent.definite):
                morphvec=sparseadd(morphvec,permute(indexspace[utterance.agent.surfacestring],definitepermutation))
            else:
                morphvec=sparseadd(morphvec,permute(indexspace[utterance.agent.surfacestring],indefinitepermutation))
        if (utterance.patient):
            if (utterance.patient.definite):
                    morphvec=sparseadd(morphvec,permute(indexspace[utterance.patient.surfacestring],definitepermutation))
            else:
     #       uttvec=sparseadd(uttvec,normalise(permute(comb(vectorspace[utterance.patient.surfacestring]),indefinite)))  
                    morphvec=sparseadd(morphvec,permute(indexspace[utterance.patient.surfacestring],indefinitepermutation))
        if (utterance.event):
            if (utterance.event.tense=="FUTURE"):
                    morphvec=sparseadd(morphvec,permute(indexspace[utterance.event.surfacestring],futuretensepermutation))
            if (utterance.event.tense=="PAST"):
                    morphvec=sparseadd(morphvec,permute(indexspace[utterance.event.surfacestring],pasttensepermutation))
            else:
                    morphvec=sparseadd(morphvec,permute(indexspace[utterance.event.surfacestring],presenttensepermutation))
        uttvec=sparseadd(normalise(uttvec),normalise(morphvec))
        if debug:
            print("utterancevec() morphology: ",sparsecosine(uttvec,debugvec))
            debugvec=uttvec
    if semanticroles:
        semroles = {}
        if (utterance.patient):
                semroles=sparseadd(semroles,permute(indexspace[utterance.patient.surfacestring],patientpermutation))
        if (utterance.event):
                semroles=sparseadd(semroles,permute(indexspace[utterance.event.surfacestring],eventpermutation))
        if (utterance.event.negated):
                semroles=sparseadd(semroles,permute(indexspace[utterance.event.surfacestring],negationpermutation))
        if (utterance.event.adverbial):
                semroles=sparseadd(semroles,permute(indexspace[utterance.event.adverbial],adverbialpermutation))
        if (utterance.agent):
                semroles=sparseadd(semroles,permute(indexspace[utterance.agent.surfacestring],agentpermutation))
        uttvec=sparseadd(normalise(uttvec),normalise(semroles))
        if debug:
            print("utterancevec() semanticrole: ",sparsecosine(uttvec,debugvec))
            debugvec=uttvec
    if constructional:    # add in morphological information about the roles, no account of referent
        constrvec={}
        if (utterance.agent):
            if (utterance.agent.definite):
                constrvec=sparseadd(constrvec,permute(agentabstract,definitepermutation))
            else:
                constrvec=sparseadd(constrvec,permute(agentabstract,indefinitepermutation))
        if (utterance.patient):
            if (utterance.patient.definite):
                constrvec=sparseadd(constrvec,permute(patientabstract,definitepermutation))
            else:
                constrvec=sparseadd(constrvec,permute(patientabstract,indefinitepermutation))
        if (utterance.event):
            if (utterance.event.tense=="FUTURE"):
                constrvec=sparseadd(constrvec,permute(futureabstract,tensepermutation))
            elif (utterance.event.tense=="PAST"):
                constrvec=sparseadd(constrvec,permute(pastabstract,tensepermutation))
            else:
                constrvec=sparseadd(constrvec,permute(presentabstract,tensepermutation)) 
        if (utterance.event.negated):
            uttvec=sparseadd(normalise(uttvec),normalise(permute(negationabstract,negationpermutation)))
        uttvec=sparseadd(normalise(uttvec),normalise(constrvec))
        if debug:
            print("utterancevec() constructions: ",sparsecosine(uttvec,debugvec))
            debugvec=uttvec
    return uttvec


In [632]:
def gettags(quinfilename=questionfile):
    global taglist
    global speclist
    global tagspec
    global spectag
    taglist = []
    speclist = []
    tagspec = {}
    spectag = {}
    qupattern = re.compile(r'(\w+):(\w+)\s+(.*)$')
    with codecs.open(quinfilename,"r", encoding='utf-8') as infile:        
        quline = infile.readline().rstrip()
        while quline:
            m = qupattern.match(quline)
            if m:
                text = m.groups()[2]
                tag = m.groups()[0]
                spec = m.groups()[1]
                if tag not in taglist:
                    taglist.append(tag)
                    spectag[tag] = []
                if spec not in speclist:
                    speclist.append(spec)
                    tagspec[spec] = tag
                    spectag[tag].append(spec)
            quline = infile.readline().rstrip()


In [634]:
speclist

['manner',
 'cremat',
 'animal',
 'exp',
 'ind',
 'gr',
 'title',
 'def',
 'date',
 'reason',
 'event',
 'state',
 'desc',
 'count',
 'other',
 'letter',
 'religion',
 'food',
 'country',
 'color',
 'termeq',
 'city',
 'body',
 'dismed',
 'mount',
 'money',
 'product',
 'period',
 'substance',
 'sport',
 'plant',
 'techmeth',
 'volsize',
 'instru',
 'abb',
 'speed',
 'word',
 'lang',
 'perc',
 'code',
 'dist',
 'temp',
 'symbol',
 'ord',
 'veh',
 'weight',
 'currency']

In [622]:
def weightfunction(word):
    if word in globalfrequency:
        return globalfrequency[word]
    elif word == "be":
        return 0.1
    else: 
        return 1

In [665]:
def addconfusion(facit,predicted):
    global confusionmatrix
    if facit in confusionmatrix:
        if predicted in confusionmatrix[facit]:
            confusionmatrix[facit][predicted] += 1
        else:
            confusionmatrix[facit][predicted] = 1
    else: 
        confusionmatrix[facit] = {}
        confusionmatrix[facit][predicted] = 1
    
    

In [729]:
qupattern = re.compile(r'(\w+):(\w+)\s+(.*)$')
with codecs.open(questionfile,"r", encoding='utf-8') as infile:        
    quline = infile.readline().rstrip()
    m = qupattern.match(quline)
    if m:
        print(m.groups()[0])
        print(m.groups()[1])
        print(m.groups()[2])
        text = m.groups()[2]
        tag = m.groups()[0]
        spec = m.groups()[1]
        sents = sent_tokenize(text)
##        print(permute(indexspace[tag],tagpermutation))
##        print(permute(indexspace[spec],tagpermutation))
        for sentence in sents:
            analyses = qudepparse(sentence)
            for analysis in analyses:
                uvector = getvector(analysis,sentence,semroles,selective)
                print(uvector)
                uvector = sparseadd(uvector,
                                    sparseadd(permute(indexspace[tag],tagpermutation),
                                              permute(indexspace[spec],tagpermutation),1,True),1,True)
        print(uvector)

DESC
manner
How did serfdom develop in and then leave Russia ?
{}
{'115': -0.035444060250417034, '1037': -0.035444060250417034, '1525': -0.035444060250417034, '1876': 0.035444060250417034, '434': 0.035444060250417034, '1022': -0.035444060250417034, '941': -0.035444060250417034, '336': -0.035444060250417034, '1049': -0.035444060250417034, '587': 0.035444060250417034, '1821': 0.035444060250417034, '1777': -0.035444060250417034, '1613': 0.035444060250417034, '1193': -0.07088812050083407, '1080': 0.035444060250417034, '1595': -0.035444060250417034, '1158': -0.035444060250417034, '1693': 0.0, '1594': 0.035444060250417034, '865': -0.035444060250417034, '42': -0.035444060250417034, '400': 0.035444060250417034, '387': 0.035444060250417034, '782': -0.035444060250417034, '1235': -0.035444060250417034, '639': 0.035444060250417034, '1833': 0.07088812050083407, '1722': 0.035444060250417034, '1998': 0.035444060250417034, '499': -0.035444060250417034, '893': 0.035444060250417034, '1272': 0.0354440602

In [None]:
def testquestions(debug=False,moredebug=False,semroles=False,selective=False):
    semroles = False
    selective = False
    global ctx
    qupattern = re.compile(r'(\w+):(\w+)\s+(.*)$')
    ticker = 0
    batch = 100
    with codecs.open(questionfile,"r", encoding='utf-8') as infile:        
        quline = infile.readline().rstrip()
        while quline:
            ticker += 1
            if ticker >= batch:
                print(".",end="")
                ticker = 0
            idx = 0
            m = qupattern.match(quline)
            if m:
                logger(m.groups()[0] + " " + m.groups()[1] + " " + m.groups()[2],moredebug)
                text = m.groups()[2]
                tag = m.groups()[0]
                spec = m.groups()[1]
                if tag not in taglist:
                    taglist.append(tag)
                    spectag[tag] = []
                    ctx[tag] = {}
                    indexspace[tag] = newrandomvector(dimensionality,denseness)
                    globalfrequency[tag] = 1
                if spec not in speclist:
                    speclist.append(spec)
                    tagspec[spec] = tag
                    spectag[tag].append(spec)
                    ctx[spec] = {}
                    indexspace[spec] = newrandomvector(dimensionality,denseness)
                    globalfrequency[spec] = 1
                txts[idx] = text
                tags[idx] = tag
                spex[idx] = spec
                sents = sent_tokenize(text)
                i = 0
                for sentence in sents:
                    if debug: 
                        logger(sentence,debug)
                    analyses = qudepparse(sentence)
                    kk = 0
                    for analysis in analyses:
                        uvector = getvector(analysis,sentence,semroles,selective)
                        ctx[tag] = sparseadd(ctx[tag],normalise(uvector))
                        ctx[spec] = sparseadd(ctx[spec],normalise(uvector))
                        logger(str(idx)+" ====================",debug)
                        uvector = sparseadd(uvector,
                                            sparseadd(permute(indexspace[tag],tagpermutation),
                                                      permute(indexspace[spec],tagpermutation),1,True),1,True)
                        logger(str(sparsecosine(uvector,permute(sparseadd(indexspace[tag],indexspace[spec]),tagpermutation)))+" "+str(sparsecosine(uvector,indexspace[tag]))+" "+str(sparsecosine(uvector,indexspace[spec])),debug)
                        prev = uvector
                        if (kk > 0): # there was more than one analysis
                            idx += 1
                            txts[idx] = text
                            tags[idx] = tag
                            spex[idx] = spec
                        utterancespace[idx] = uvector
                        kk += 1
                    i += 1
                    idx += 1
            try:
                quline = infile.readline() 
            except UnicodeDecodeError:
                logger("read error: "+quline,True)
                quline = infile.readline() 
def testem():
    for uu in utterancespace:
        tt = txts[uu]
        aa = dependencyanalysisstore[uu]
        nn = utterancevector(aa)
        ss = utterancespace[uu]
        print(uu,tt)
        print(uu,sparsecosine(nn,ss),tags[uu],spex[uu],sep="\t")
        for vv in utterancespace:
            mm = utterancespace[vv]
            print("\t",vv,sparsecosine(nn,mm),sparsecosine(ss,mm),sep="\t",end="\t")
            try:
                print(tags[vv],spex[vv],sep="\t")
            except KeyError:
                print("NONE","none",sep="\t")

In [759]:
def getvector(analysis,sentence,semroles=False,selective=False,train=True):
    uvector = {} # vector for test item
    if semroles: 
        wds = list(analysis.values())
        chkwordspace(wds) # make sure no KeyErrors will occur, add all words to lexicon
        for role in analysis:
            item = analysis[role]
            if role not in permutationcollection:
                permutationcollection[role] = createpermutation(dimensionality)
            vector = sparseadd(uvector,
                               permute(normalise(indexspace[item]),
                                       permutationcollection[role]))
    elif selective: # only lexical items with roles
        wds = list(analysis.values())
        chkwordspace(wds) # make sure no KeyErrors will occur, add all words to lexicon
        for item in wds:
            uvector = sparseadd(uvector,normalise(indexspace[item]))
    else: # straight lexical 
        wds = word_tokenize(sentence)
        chkwordspace(wds) # make sure no KeyErrors will occur, add all words to lexicon
        for item in wds:
            uvector = sparseadd(uvector,normalise(indexspace[item]))
    return uvector

In [760]:
def evaluateTREC(debug=False,moredebug=False,semroles=False,selective=False):
    global permutationcollection
    confusionmatrix = {}
    antal = 0
    treffar = 0
    treff01 = 0
    treff05 = 0
    treff10 = 0
    ticker = 0
    streffar = 0
    streff01 = 0
    streff05 = 0
    streff10 = 0
    sticker = 0
    batch = 100
    debugprintthreshold = 10
    qupattern = re.compile(r'(\w+):(\w+)\s+(.*)$')
    with codecs.open(testfile,"r", encoding='utf-8') as infile:        
        quline = infile.readline().rstrip()
        idx = 0
        while quline:
            m = qupattern.match(quline)
            if m:
                text = m.groups()[2]
                tag = m.groups()[0]
                spec = m.groups()[1]
                sents = sent_tokenize(text)
                for sentence in sents:
                    key = tag+":"+spec
                    logger(key+"\t"+sentence,debug)
                    analyses = qudepparse(sentence)
                    for analysis in analyses:  # almost certainly only one analysis
                        antal += 1
                        ticker += 1
                        uvector = getvector(analysis,sentence,semroles,selective)
                        # =========================================================================
                        # compare with tag context vectors
                        tagneighbours = {}
                        for kk in taglist:
                            for ll in spectag[kk]:
                                cosinedistance = sparsecosine(sparseadd(normalise(ctx[kk]),normalise(ctx[ll])),uvector)
                                tagneighbours[kk+":"+ll] = cosinedistance
                                sortedtagneighbours = sorted(tagneighbours.items(), key=lambda neighbour:neighbour[1], reverse=True) #[0:10]
                        addconfusion(key,sortedtagneighbours[0][0])  # compare top prediction with gold standard
                        rankofsortedtagneighbour = 0
                        for onesortedtagneighbour in sortedtagneighbours:
                            rankofsortedtagneighbour += 1
                            result = ""
                            if onesortedtagneighbour[0] == key:  # it's a hit!
                                if debug:
                                    result = "***"
                                treffar += rankofsortedtagneighbour
                                if rankofsortedtagneighbour == 1:
                                    treff01 += 1
                                if rankofsortedtagneighbour <= 5:
                                    treff05 += 1
                                if rankofsortedtagneighbour <= 10:
                                    treff10 += 1
                            if rankofsortedtagneighbour <= debugprintthreshold:
                                logger(onesortedtagneighbour[0]+"\t"+str(onesortedtagneighbour[1])+"\t"+str(result),debug)
                            if batch > 0 and ticker >= batch:
                                average = 0 
                                if antal > 0:
                                    average = treffar / antal
                                print(treff01,treff05,treff10,antal,average,sep="\t")
                                ticker = 0
                        # =========================================================================
                        # compare with train sentence vectors
                        sentenceneighbours = {}
                        for uu in utterancespace:
                            cosinedistance = sparsecosine(utterancespace[uu],uvector)
                            mt = 0
                            kk = "TBD"
                            for kt in taglist:
                                if sparsecosine(indexspace[kt],utterancespace[uu]) > mt:
                                    kk = kt
                            ms = 0
                            ll = "tbd"
                            try:
                                for st in spectag[kt]:
                                    if sparsecosine(indexspace[st],utterancespace[uu]) > ms:
                                        ll = st
                            except:
                                ll = "error"
                            sentenceneighbours[kk+":"+ll] = cosinedistance
                            sortedsentenceneighbours = sorted(sentenceneighbours.items(), key=lambda neighbour:neighbour[1], reverse=True) #[0:10]
                        addconfusion(key,sortedsentenceneighbours[0][0])  # compare top prediction with gold standard
                        rankofsortedsentenceneighbour = 0
                        for onesortedneighbour in sortedsentenceneighbours:
                            rankofsortedsentenceneighbour += 1
                            result = ""
                            if onesortedneighbour[0] == key:  # it's a hit!
                                if debug:
                                    result = "***"
                                streffar += rankofsortedsentenceneighbour
                                if rankofsortedsentenceneighbour == 1:
                                    streff01 += 1
                                if rankofsortedsentenceneighbour <= 5:
                                    streff05 += 1
                                if rankofsortedsentenceneighbour <= 10:
                                    streff10 += 1
                            if rankofsortedsentenceneighbour <= debugprintthreshold:
                                logger(onesortedneighbour[0]+"\t"+str(onesortedneighbour[1])+"\t"+str(result),debug)
                            if batch > 0 and sticker >= batch:
                                average = 0 
                                if antal > 0:
                                    average = streffar / antal
                                print(streff01,streff05,streff10,antal,average,sep="\t")
                                sticker = 0
            try:
                quline = infile.readline() 
            except UnicodeDecodeError:
                logger("read error: "+quline,True)
                quline = infile.readline() 
    average = 0
    if antal > 0:
        average = treffar / antal
    print(treff01,treff05,treff10,antal,average,sep="\t")
    if antal > 0:
        average = streffar / antal
    print(streff01,streff05,streff10,antal,average,sep="\t")
    return confusionmatrix

In [761]:
questionfile="/home/jussi/data/questions/train_3000.label"
testfile="/home/jussi/data/questions/TREC_10.label"
initconstantsandpermutations()
tags = {}
spex = {}
txts = {}
ctx = {}
indexspace = {}
chkwordspace(["epsilon"])
denseness = 0.2
permutationcollection = {}
debug = False
moredebug = False
semantics = False
selective = False
utterancespace = {}
ctx = {}
print("training",semantics,selective,time.ctime())
testquestions(debug, moredebug, semantics, selective)
print("testing",semantics,selective,time.ctime())
cm1 = evaluateTREC(debug, moredebug, semantics, selective)
selective = True
utterancespace = {}
ctx = {}
print("training",semantics,selective,time.ctime())
testquestions(debug, moredebug, semantics, selective)
print("testing",semantics,selective,time.ctime())
cm2 = evaluateTREC(debug, moredebug, semantics, selective)
selective = False
semantics = True
utterancespace = {}
ctx = {}
print("training",semantics,selective,time.ctime())
testquestions(debug, moredebug, semantics, selective)
print("testing",semantics,selective,time.ctime())
cm3 = evaluateTREC(debug, moredebug, semantics, selective)

training False False Wed Feb  7 13:39:22 2018
testing False False Wed Feb  7 13:40:46 2018
52	68	72	100	4.17
97	131	143	200	4.96
145	201	219	300	4.703333333333333
195	265	289	400	4.9025
249	333	364	500	4.882
250	335	366	502	4.870517928286852
0	2	5	502	1.051792828685259
training False True Wed Feb  7 16:57:49 2018
testing False True Wed Feb  7 16:59:26 2018
sparsecosine(): errors at position  22
sparsecosine(): errors at position  890
sparsecosine(): errors at position  1389
sparsecosine(): errors at position  1327


KeyboardInterrupt: 

In [754]:
spectag

{'ABBR': ['exp', 'abb'],
 'DESC': ['manner', 'def', 'reason', 'desc'],
 'ENTY': ['cremat',
  'animal',
  'event',
  'other',
  'letter',
  'religion',
  'food',
  'color',
  'termeq',
  'body',
  'dismed',
  'product',
  'substance',
  'sport',
  'plant',
  'techmeth',
  'instru',
  'word',
  'lang',
  'symbol',
  'veh',
  'currency'],
 'HUM': ['ind', 'gr', 'title'],
 'LOC': ['state', 'country', 'city', 'mount'],
 'NUM': ['date',
  'count',
  'money',
  'period',
  'volsize',
  'speed',
  'perc',
  'code',
  'dist',
  'temp',
  'ord',
  'weight']}

just the words

In [681]:
evaluateTREC()

50	65	67	100	7.11
94	118	127	200	7.875
141	182	196	300	7.266666666666667
192	243	260	400	7.2025
239	303	325	500	7.374
240	304	327	502	7.3585657370517925


In [718]:
indexspace["DESC"]

{'0': -1,
 '10': -1,
 '1022': -1,
 '1025': -1,
 '1029': -1,
 '1038': 1,
 '1039': 1,
 '1045': -1,
 '1050': -1,
 '1052': 1,
 '1053': 1,
 '1058': -1,
 '1067': 1,
 '1069': 1,
 '1075': 1,
 '1076': 1,
 '1080': -1,
 '1084': -1,
 '1087': 1,
 '1089': 1,
 '1092': 1,
 '1094': 1,
 '1095': -1,
 '1097': -1,
 '1100': 1,
 '1101': -1,
 '1102': -1,
 '1103': -1,
 '1112': -1,
 '1128': 1,
 '1129': -1,
 '1130': -1,
 '1132': 1,
 '1138': -1,
 '1143': 1,
 '1152': 1,
 '1153': -1,
 '1158': -1,
 '1163': 1,
 '1182': 1,
 '1186': 1,
 '1187': -1,
 '1191': -1,
 '1193': 1,
 '12': -1,
 '120': 1,
 '1204': -1,
 '1221': 1,
 '1243': -1,
 '1245': 1,
 '1246': 1,
 '1254': -1,
 '1257': 1,
 '1259': -1,
 '1263': -1,
 '1275': 1,
 '1276': -1,
 '1278': -1,
 '1279': -1,
 '128': 1,
 '1282': -1,
 '1285': 1,
 '1286': 1,
 '1297': 1,
 '13': -1,
 '1301': 1,
 '1302': 1,
 '1303': -1,
 '1307': 1,
 '131': -1,
 '1323': -1,
 '1324': 1,
 '133': 1,
 '1330': 1,
 '1335': 1,
 '1336': 1,
 '1337': -1,
 '1340': 1,
 '1367': -1,
 '1369': -1,
 '1372': -1,


In [None]:
for t in taglist:
    print(t,sparselength(ctx[t]))
    for w in indexspace:
        c = sparsecosine(ctx[t],indexspace[w])
        if w and c > 0.1:
            print("\t",w,c)

In [691]:
confusionmatrix

{}

In [639]:
uu = {}
for ws in ["how","long","be","the","river"]:
    uu = sparseadd(normalise(uu),normalise(indexspace[ws]))
for kk in taglist:
    for ll in spectag[kk]:
        c = sparsecosine(sparseadd(normalise(ctx[kk]),normalise(ctx[ll])),uu)
        if c > 0.1:
            print(kk,":",ll,"\t",c)

DESC : manner 	 0.1483
DESC : desc 	 0.1013
ENTY : animal 	 0.1359
ENTY : other 	 0.1163
ENTY : letter 	 0.1161
ENTY : religion 	 0.1204
ENTY : color 	 0.1782
ENTY : dismed 	 0.1041
ENTY : product 	 0.1091
ENTY : sport 	 0.1516
ENTY : plant 	 0.1107
ENTY : instru 	 0.1029
ENTY : symbol 	 0.1535
ENTY : currency 	 0.101
HUM : gr 	 0.1175
HUM : title 	 0.1165
NUM : date 	 0.15
NUM : count 	 0.202
NUM : money 	 0.1764
NUM : period 	 0.277
NUM : volsize 	 0.1942
NUM : speed 	 0.1876
NUM : perc 	 0.1725
NUM : code 	 0.149
NUM : dist 	 0.2467
NUM : temp 	 0.1729
NUM : ord 	 0.1178
NUM : weight 	 0.1548
LOC : state 	 0.162
LOC : country 	 0.177
LOC : city 	 0.1553
LOC : mount 	 0.1299


In [525]:
semanticdepparse("Who was Peugeot?",True)

root: [1]
1   who   WP
2   be   VBD
3   Peugeot   NNP
4   ?   .
1 who - cop -> 2 be
1 who - nsubj -> 3 Peugeot
1 who - punct -> 4 ?
1 who - cop -> 2 be
1 who - nsubj -> 3 Peugeot
1 who - punct -> 4 ?
who   WP


[<__main__.clause at 0x7f10a1d28ba8>]

In [468]:
uvv={}
uvv = sparseadd(indexspace['name'],uvv)
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
uvv = sparseadd(indexspace['he'],uvv)
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
uvv = sparseadd(indexspace['10'],uvv)
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
uvv = sparseadd(indexspace['HUM'],uvv)
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
uvv = sparseadd(indexspace['ind'],uvv)
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
print("n8",sparsecosine(indexspace['name'],utterancespace[8]))
utterance=dependencyanalysisstore[8]
print("M")
morphvec={}
morphvec=sparseadd(morphvec,permute(indexspace[utterance.agent.surfacestring],definitepermutation))
morphvec=sparseadd(morphvec,permute(indexspace[utterance.patient.surfacestring],definitepermutation))
morphvec=sparseadd(morphvec,permute(indexspace[utterance.event.surfacestring],pasttensepermutation))
uvv=sparseadd(normalise(uvv),normalise(morphvec))
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
print("S")
semroles={}
semroles=sparseadd(semroles,permute(indexspace[utterance.patient.surfacestring],patientpermutation))
semroles=sparseadd(semroles,permute(indexspace[utterance.event.surfacestring],eventpermutation))
semroles=sparseadd(semroles,permute(indexspace[utterance.event.adverbial],adverbialpermutation))
semroles=sparseadd(semroles,permute(indexspace[utterance.agent.surfacestring],agentpermutation))
uvv=sparseadd(normalise(uvv),normalise(semroles))
uvv2=sparseadd(uvv,semroles)
print("nu",sparsecosine(indexspace['name'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))
print("nu2",sparsecosine(indexspace['name'],uvv2))
print("8u2",sparsecosine(utterancespace[8],uvv2))
print("C")
constrvec={}
constrvec=sparseadd(constrvec,permute(agentabstract,definitepermutation))
constrvec=sparseadd(constrvec,permute(patientabstract,definitepermutation))
constrvec=sparseadd(constrvec,permute(pastabstract,tensepermutation))
uvv=sparseadd(normalise(uvv),normalise(constrvec))
print("nu",sparsecosine(indexspace['name'],uvv))
print("hu",sparsecosine(indexspace['he'],uvv))
print("tu",sparsecosine(indexspace['10'],uvv))
print("8u",sparsecosine(utterancespace[8],uvv))


nu 1.0
8u -0.0082
nu 0.7115
8u 0.074
nu 0.5774
8u 0.0662
nu 0.4979
8u 0.0675
nu 0.4543
8u 0.0655
n8 -0.0082
M
nu 0.3106
8u 0.063
S
nu 0.2392
8u 0.0578
nu2 0.0394
8u2 0.0212
C
nu 0.1535
hu 0.1818
tu 0.17
8u 0.0324


In [410]:
def rununittest():
    for v in utterancespace:
        print("-------------")
        print(v)
        neighbours = {}
        for w in utterancespace:
            print(w)
            if w == v:
                continue
            neighbours[w] = sparsecosine(utterancespace[v],utterancespace[w])
        ns = sorted(neighbours.items(), key=lambda neighbour:neighbour[1], reverse=True)[0:10]
        for ww in ns:
            print("-",ww,tags[idx],spex[idx],txts[idx],sep="\t")

In [236]:
rununittest()

-------------
qu0
qu0


In [None]:
def command_loop(debug=False,moredebug=False):
    instring = ''
    prev=newrandomvector(dimensionality,denseness)
    prevlex=newrandomvector(dimensionality,denseness)
    prevsem=newrandomvector(dimensionality,denseness)
    prevmor=newrandomvector(dimensionality,denseness)
    prevmgn=newrandomvector(dimensionality,denseness)
    instring = input('> ')
    while instring != 'quit':
        try:
            s = instring.rstrip()
            wds=nltk.word_tokenize(s.lower())
            chkwordspace(wds,debug)
            try:
                cs = semanticdepparse(s.lower(),debug)
                for c in cs:
                    u = utterancevector(c,moredebug, True, False, False,False)
                    print(" lexical     ",sparsecosine(prev,u),end="\t(")
                    print(sparsecosine(prevlex,u),")")
                    prevlex=u
                    u = utterancevector(c,moredebug, False, False, True, False)
                    print(" semrole     ",sparsecosine(prev,u),end="\t(")
                    print(sparsecosine(prevsem,u),")")
                    prevsem=u
                    u = utterancevector(c,moredebug,  False, True, False, False)
                    print(" morph     ",sparsecosine(prev,u),end="\t(")
                    print(sparsecosine(prevmor,u),")")
                    prevmor=u
                    u = utterancevector(c,moredebug, False, False, False, True)
                    print(" construction  ",sparsecosine(prev,u),end="\t(")
                    print(sparsecosine(prevmgn,u),")")
                    prevmgn=u
                    u = utterancevector(c,moredebug)
                    print(" in toto     ",sparsecosine(prev,u))
                    prev = u
            except:
                print("****")
            instring = input('> ')
        except:
            instring = 'quit'
    print("hey!")
    
    

In [None]:
command_loop(False,False)



In [None]:
# lexicon: what concepts are mentioned (context vectors)
f1 = False

# morphology: what morphological form is represented (index vectors permuted with morph permutations)
f2 = True

# semantic roles: e.g. is the agent definite? (index vectors, deep case permutations)
f3 = True

# constructions: (abstract random feature vectors)
f4 = True

#debug output (no good at present)
d = False
a = "The cat eats the cream."
b = "The fish ate the worm."
c = "she winked."
target=permute(pastabstract,tensepermutation)
ap = semanticdepparse(a.lower())[0]
bp = semanticdepparse(b.lower())[0]
cp = semanticdepparse(c.lower())[0]
av = utterancevector(ap,d,f1,f2,f3,f4)
bv = utterancevector(bp,d,f1,f2,f3,f4)
cv = utterancevector(cp,d,f1,f2,f3,f4)

print("a vs b\t",sparsecosine(av,bv))
print("a vs c\t",sparsecosine(av,cv))
print("b vs c\t",sparsecosine(bv,cv))

print("a vs tgt\t",sparsecosine(av,target))
print("b vs tgt\t",sparsecosine(bv,target))
print("c vs tgt\t",sparsecosine(cv,target))


# test negation

In [None]:
initconstantsandpermutations(dimensionality,0.2)
indexspace={}


In [168]:
#for denseness in [0.05]: #[0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01]:


#for k in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
#    negationabstract=newrandomvector(dimensionality,k)
#    presentabstract=newrandomvector(dimensionality,k)

def uv(words):
    utterance=semanticdepparse(words.lower())[0]
    return utterancevector(utterance,False, False, True, True, True)


print("definite:\t",sparsecosine(uv("The cat does not eat the fish."),permute(agentabstract,definitepermutation)))
print("definite:\t",sparsecosine(uv("The cat does not eat the fish."),permute(patientabstract,definitepermutation)))
print("negation:\t",sparsecosine(uv("The cat does not eat the fish."),permute(negationabstract,negationpermutation)))
print("non-neg vs negation:\t",sparsecosine(uv("The cat eats the fish."),permute(negationabstract,negationpermutation)))
print("present:\t",sparsecosine(uv("The cat eats the fish."),permute(presentabstract,tensepermutation)))
print("past vs present:\t",sparsecosine(uv("The cat has not eaten the fish."),permute(presentabstract,tensepermutation)))
print("neg vs non-neg:\t",sparsecosine(uv("The cat eats the fish."),uv("The cat did not eat the fish.")))
print("similar lex:\t",sparsecosine(uv("The cat eats the fish."),uv("The dog eats the fish.")))
print("similar lex:\t",sparsecosine(uv("The cat eats the fish."),uv("The fish ate the worm.")))

#initconstantsandpermutations(dimensionality,0.2)


definite:	 0.4053
definite:	 0.3847
negation:	 0.702
non-neg vs negation:	 -0.0143
present:	 0.5998
past vs present:	 0.0143
neg vs non-neg:	 0.4521
similar lex:	 1.0
similar lex:	 0.647


- one vector per main verb
- identify interesting use case: authorship? gender? 
- add features cpw wps complexity p1 p2 p3 here and now


# Can we find indefinite agents?

In [136]:
for i in utterancespace:
    c = sparsecosine(utterancespace[i],permute(agentabstract,definitepermutation)) #permute(negationabstract,negationpermutation))  #sparseadd(,normalise(permute(negationabstract,negationpermutation)))) #sparseadd(permute(negationabstract,negationpermutation),permute(agentabstract,definitepermutation)))) #,permute(negationabstract,negationpermutation)))
    d = sparsecosine(utterancespace[i],permute(negationabstract,negationpermutation))
    f = sparsecosine(utterancespace[i],permute(pastabstract,tensepermutation))
    e = sparsecosine(utterancespace[i],sparseadd(permute(pastabstract,tensepermutation),sparseadd(permute(agentabstract,definitepermutation),permute(negationabstract,negationpermutation))))
    if (d > 0.1):
        print (c, d, f, e, i, sep="\t")

0.4053	0.702	-0.0431	0.6244	it wouldn't take long for her to place who he was.
0.3943	0.6997	0.3965	0.8745	he almost asked but he didn't want her to look at him with that skeletal face and gap-tooth smirk.
0.4053	0.702	-0.0431	0.6244	joel had not pressured her.
0.4053	0.702	-0.0431	0.6244	they had never been camping.
0.3943	0.6997	0.3965	0.8745	she almost told dad that she wasn't feeling well and that she'd rather just go home and curl up in bed with a book.
0.3943	0.6997	0.3965	0.8745	there were no cars near his.
0.4053	0.702	-0.0431	0.6244	i never heard of him until that day when arlon, my boss, called and said some wackjob shot up the place.
0.3943	0.6997	0.3965	0.8745	they probably didn't have a car.
0.3943	0.6997	0.3965	0.8745	even after a month of similar exchanges, he denied her the full pleasure of his sex because he had found someone new and he didn't want to take advantage of her.
0.3943	0.6997	0.3965	0.8745	he had pressed his luck, no doubt about that, and if he pressed it a

# weighting

now an exponential, math.exp(200*math.pi*bign/globalfrequency(wd)) which defaults to 0.5 if unknown word, i'd prefer arc tan tho

In [95]:
def sparserecoverabilityexperiment():
    step = 0.1  #density
    start = 0.1 #density
    stop = 1 #density
    for dimension in 10,100,500,1000,2000:
        print("=============================================")
        print(dimension)
        for cellsize in [10,100,250]:
            print(dimension,"=============================================")
            print("number of features aggregated: ",cellsize)
            print("analytic optimum: ",(2*cellsize*dimension)**(-1/3)*dimension)
            d = start
            featspace = {} # a hash of cellsize random vectors of denseness d
            featvec = {} # an addition of cellsize random vectors of denseness d into one vector
            bigs = 0
            bigh = 0
            ii = 0
            while (d <= stop):
                i = 0
                featspace[d] = [None]*cellsize
                featvec[d] = {}
                while (i < cellsize): 
                    featspace[d][i] = newrandomvector(dimension,d)
                    prev = featvec[d]
                    featvec[d] = sparseadd(featvec[d],featspace[d][i])
                    i += 1
                i = 0
                lils = 0
                lilh = 0
                while (i < cellsize):
                    h = sparsecosine(featvec[d],newrandomvector(dimension,d))
                    c = sparsecosine(featvec[d],featspace[d][i])
                    lils += c
                    lilh += abs(h)
                    i += 1
                avs = lils / cellsize
                avh = lilh / cellsize
                try: 
                    sn = avs / avh
                except ZeroDivisionError:
                    sn = 0
                print(dimension,"{0:.2f}".format(d),"signal:","{0:.3f}".format(avs),"noise: ","{0:.3f}".format(avh),"signal/noise: ","{0:.3f}".format(sn), sep="\t",flush=True)
                ii += i
                bigs += lils
                bigh += lilh
                d += step
            avs = bigs / ii
            avh = bigh / ii
            try: 
                sn = avs / avh
            except ZeroDivisionError:
                sn = 0
            print("average signal: ","{0:.4f}".format(avs),sep="\t")    
            print("average noise: ","{0:.4f}".format(avh),sep="\t")  
            print("signal / noise: ","{0:.4f}".format(sn),sep="\t")    


In [96]:
sparserecoverabilityexperiment()


10
number of features aggregated:  10
analytic optimum:  1.7099759466766973
10	0.10	signal:	0.332	noise: 	0.256	signal/noise: 	1.294
10	0.20	signal:	0.374	noise: 	0.187	signal/noise: 	2.000
10	0.30	signal:	0.394	noise: 	0.330	signal/noise: 	1.192
10	0.40	signal:	0.354	noise: 	0.297	signal/noise: 	1.191
10	0.50	signal:	0.416	noise: 	0.260	signal/noise: 	1.600
10	0.60	signal:	0.216	noise: 	0.262	signal/noise: 	0.824
10	0.70	signal:	0.269	noise: 	0.320	signal/noise: 	0.841
10	0.80	signal:	0.269	noise: 	0.227	signal/noise: 	1.184
10	0.90	signal:	0.155	noise: 	0.207	signal/noise: 	0.750
10	1.00	signal:	0.390	noise: 	0.277	signal/noise: 	1.407
average signal: 	0.3169
average noise: 	0.2624
signal / noise: 	1.2074
number of features aggregated:  100
analytic optimum:  0.7937005259840999
10	0.10	signal:	0.082	noise: 	0.272	signal/noise: 	0.304
10	0.20	signal:	0.136	noise: 	0.264	signal/noise: 	0.516
10	0.30	signal:	0.057	noise: 	0.270	signal/noise: 	0.210
10	0.40	signal:	0.099	noise: 	0.274	si

1000	0.40	signal:	0.104	noise: 	0.023	signal/noise: 	4.558
1000	0.50	signal:	0.099	noise: 	0.029	signal/noise: 	3.393
1000	0.60	signal:	0.099	noise: 	0.028	signal/noise: 	3.542
1000	0.70	signal:	0.103	noise: 	0.023	signal/noise: 	4.529
1000	0.80	signal:	0.098	noise: 	0.027	signal/noise: 	3.680
1000	0.90	signal:	0.098	noise: 	0.026	signal/noise: 	3.851
1000	1.00	signal:	0.098	noise: 	0.028	signal/noise: 	3.508
average signal: 	0.0998
average noise: 	0.0261
signal / noise: 	3.8180
number of features aggregated:  250
analytic optimum:  12.599210498948734
1000	0.10	signal:	0.061	noise: 	0.025	signal/noise: 	2.416
1000	0.20	signal:	0.063	noise: 	0.025	signal/noise: 	2.488
1000	0.30	signal:	0.064	noise: 	0.024	signal/noise: 	2.647
1000	0.40	signal:	0.063	noise: 	0.022	signal/noise: 	2.828
1000	0.50	signal:	0.065	noise: 	0.024	signal/noise: 	2.741
1000	0.60	signal:	0.063	noise: 	0.027	signal/noise: 	2.347
1000	0.70	signal:	0.062	noise: 	0.024	signal/noise: 	2.559
1000	0.80	signal:	0.062	noise

In [133]:
def genplot():
    step = 0.01  #density
    start = 0.01 #density
    stop = 0.02 #density
    w = 5 # how many random noise vectors
    for dimension in 10,100,500,1000,2000:
        print("=============================================")
        for cellsize in [20]:
            print("dimensionality ",dimension)
            print("number of features aggregated: ",cellsize)
 #           print("analytic optimum: ",(2*cellsize*dimension)**(-1/3)*dimension)
            d = start
            featspace = {} # a hash of cellsize random vectors of denseness d
            featvec = {} # an addition of cellsize random vectors of denseness d into one vector
            bigs = 0
            bigh = 0
            ii = 0
            jj = 0
            kk = 0
            hh = {}
            cc = {}
            while (d <= stop):
                i = 0
                featspace[d] = [None]*cellsize
                featvec[d] = {}
                while (i < cellsize): 
                    featspace[d][i] = newrandomvector(dimension,d)
                    prev = featvec[d]
                    featvec[d] = sparseadd(featvec[d],featspace[d][i])
                    i += 1
                i = 0
                while (i < cellsize):
                    r = 0
                    while (r < w):
                        h = sparsecosine(featvec[d],newrandomvector(dimension,d),False)
                        jj += 1
                        bigh += abs(h)
                        hh[kk] = h
                        r += 1
                        kk += 1
                    c = sparsecosine(featvec[d],featspace[d][i],False)
                    bigs += c
                    i += 1
                    cc[kk] = c
                    kk += 1
                    r = 0
                    while (r < w):
                        h = sparsecosine(featvec[d],newrandomvector(dimension,d),False)
                        jj += 1
                        bigh += abs(h)
                        hh[kk] = h
                        r += 1
                        kk += 1
                ii += i
                d += step
            avs = bigs / ii
            avh = bigh / jj
            for hi in hh:
                print("(",hi,",",hh[hi],")",end=" ")
            print()
            for ci in cc:
                print("(",ci,",",cc[ci],")",end=" ")
            print()                 
            print("average signal: ","{0:.4f}".format(avs),sep="\t")    
            print("average noise: ","{0:.4f}".format(avh),sep="\t")  


In [134]:
genplot()

dimensionality  10
number of features aggregated:  20
( 0 , 0 ) ( 1 , 0 ) ( 2 , 0 ) ( 3 , 0 ) ( 4 , 0 ) ( 6 , 0 ) ( 7 , 0 ) ( 8 , 0 ) ( 9 , 0 ) ( 10 , 0 ) ( 11 , 0 ) ( 12 , 0 ) ( 13 , 0 ) ( 14 , 0 ) ( 15 , 0 ) ( 17 , 0 ) ( 18 , 0 ) ( 19 , 0 ) ( 20 , 0 ) ( 21 , 0 ) ( 22 , 0 ) ( 23 , 0 ) ( 24 , 0 ) ( 25 , 0 ) ( 26 , 0 ) ( 28 , 0 ) ( 29 , 0 ) ( 30 , 0 ) ( 31 , 0 ) ( 32 , 0 ) ( 33 , 0 ) ( 34 , 0 ) ( 35 , 0 ) ( 36 , 0 ) ( 37 , 0 ) ( 39 , 0 ) ( 40 , 0 ) ( 41 , 0 ) ( 42 , 0 ) ( 43 , 0 ) ( 44 , 0 ) ( 45 , 0 ) ( 46 , 0 ) ( 47 , 0 ) ( 48 , 0 ) ( 50 , 0 ) ( 51 , 0 ) ( 52 , 0 ) ( 53 , 0 ) ( 54 , 0 ) ( 55 , 0 ) ( 56 , 0 ) ( 57 , 0 ) ( 58 , 0 ) ( 59 , 0 ) ( 61 , 0 ) ( 62 , 0 ) ( 63 , 0 ) ( 64 , 0 ) ( 65 , 0 ) ( 66 , 0 ) ( 67 , 0 ) ( 68 , 0 ) ( 69 , 0 ) ( 70 , 0 ) ( 72 , 0 ) ( 73 , 0 ) ( 74 , 0 ) ( 75 , 0 ) ( 76 , 0 ) ( 77 , 0 ) ( 78 , 0 ) ( 79 , 0 ) ( 80 , 0 ) ( 81 , 0 ) ( 83 , 0 ) ( 84 , 0 ) ( 85 , 0 ) ( 86 , 0 ) ( 87 , 0 ) ( 88 , 0 ) ( 89 , 0 ) ( 90 , 0 ) ( 91 , 0 ) ( 92 , 0 ) ( 94 , 0 ) ( 95 , 0 

( 0 , 0.0 ) ( 1 , -0.037904902178945175 ) ( 2 , 0.0 ) ( 3 , 0.0 ) ( 4 , -0.11371470653683552 ) ( 6 , -0.037904902178945175 ) ( 7 , 0.037904902178945175 ) ( 8 , -0.037904902178945175 ) ( 9 , 0.0 ) ( 10 , 0.0 ) ( 11 , 0.0 ) ( 12 , -0.07580980435789035 ) ( 13 , 0.037904902178945175 ) ( 14 , 0.037904902178945175 ) ( 15 , 0.0 ) ( 17 , 0.037904902178945175 ) ( 18 , 0.0 ) ( 19 , 0.11371470653683552 ) ( 20 , -0.07580980435789035 ) ( 21 , -0.037904902178945175 ) ( 22 , 0.0 ) ( 23 , -0.037904902178945175 ) ( 24 , 0.0 ) ( 25 , -0.07580980435789035 ) ( 26 , 0.07580980435789035 ) ( 28 , 0.037904902178945175 ) ( 29 , 0.0 ) ( 30 , -0.037904902178945175 ) ( 31 , 0.0 ) ( 32 , 0.0 ) ( 33 , 0.037904902178945175 ) ( 34 , 0.037904902178945175 ) ( 35 , 0.0 ) ( 36 , 0.0 ) ( 37 , -0.037904902178945175 ) ( 39 , -0.07580980435789035 ) ( 40 , -0.037904902178945175 ) ( 41 , -0.07580980435789035 ) ( 42 , 0.18952451089472588 ) ( 43 , 0.0 ) ( 44 , -0.037904902178945175 ) ( 45 , 0.0 ) ( 46 , 0.037904902178945175 ) ( 

( 0 , 0.022473328748774737 ) ( 1 , 0.0 ) ( 2 , 0.0 ) ( 3 , 0.044946657497549475 ) ( 4 , 0.0 ) ( 6 , 0.0 ) ( 7 , 0.0 ) ( 8 , -0.022473328748774737 ) ( 9 , 0.022473328748774737 ) ( 10 , 0.022473328748774737 ) ( 11 , 0.0 ) ( 12 , 0.022473328748774737 ) ( 13 , 0.044946657497549475 ) ( 14 , 0.0 ) ( 15 , 0.0 ) ( 17 , 0.0 ) ( 18 , -0.022473328748774737 ) ( 19 , 0.0 ) ( 20 , -0.022473328748774737 ) ( 21 , 0.06741998624632421 ) ( 22 , -0.044946657497549475 ) ( 23 , -0.022473328748774737 ) ( 24 , -0.022473328748774737 ) ( 25 , 0.022473328748774737 ) ( 26 , 0.022473328748774737 ) ( 28 , -0.022473328748774737 ) ( 29 , 0.022473328748774737 ) ( 30 , 0.0 ) ( 31 , 0.022473328748774737 ) ( 32 , 0.0 ) ( 33 , -0.044946657497549475 ) ( 34 , -0.044946657497549475 ) ( 35 , 0.044946657497549475 ) ( 36 , -0.044946657497549475 ) ( 37 , 0.022473328748774737 ) ( 39 , -0.022473328748774737 ) ( 40 , -0.08989331499509895 ) ( 41 , 0.022473328748774737 ) ( 42 , 0.0 ) ( 43 , -0.044946657497549475 ) ( 44 , -0.022473328

( 0 , -0.02224970797449924 ) ( 1 , 0.01112485398724962 ) ( 2 , 0.01112485398724962 ) ( 3 , -0.01112485398724962 ) ( 4 , 0.01112485398724962 ) ( 6 , 0.02224970797449924 ) ( 7 , 0.02224970797449924 ) ( 8 , 0.01112485398724962 ) ( 9 , 0.02224970797449924 ) ( 10 , 0.02224970797449924 ) ( 11 , 0.02224970797449924 ) ( 12 , 0.03337456196174886 ) ( 13 , 0.0 ) ( 14 , -0.01112485398724962 ) ( 15 , 0.055624269936248104 ) ( 17 , 0.01112485398724962 ) ( 18 , 0.06674912392349772 ) ( 19 , 0.0 ) ( 20 , 0.01112485398724962 ) ( 21 , -0.01112485398724962 ) ( 22 , 0.01112485398724962 ) ( 23 , -0.03337456196174886 ) ( 24 , 0.03337456196174886 ) ( 25 , 0.0 ) ( 26 , 0.02224970797449924 ) ( 28 , 0.04449941594899848 ) ( 29 , -0.02224970797449924 ) ( 30 , 0.01112485398724962 ) ( 31 , 0.01112485398724962 ) ( 32 , 0.03337456196174886 ) ( 33 , -0.02224970797449924 ) ( 34 , 0.055624269936248104 ) ( 35 , -0.04449941594899848 ) ( 36 , -0.04449941594899848 ) ( 37 , 0.01112485398724962 ) ( 39 , -0.03337456196174886 ) (