In [2]:
import re
import math
import random
import time
import corenlp
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import codecs

import inspect  # for logger

# Width (number of positions) of every random index vector.
dimensionality = 2000
# Fraction of positions that are non-zero in a fresh index vector (10 of 2000).
# True division is required here: floor division (10 // 2000) evaluates to 0,
# which makes newrandomvector() return empty vectors.
denseness = 10 / dimensionality
# word -> sparse random index vector (filled by chkwordspace)
indexspace = {}
# word -> occurrence count
globalfrequency = {}
# running count of all tokens seen so far
bign = 0

# Module-wide CoreNLP client used by semanticdepparse(); the annotator list is
# the whitespace-split string below.
parser_client = corenlp.CoreNLPClient(
    annotators="tokenize ssplit pos natlog lemma depparse".split())

In [203]:
def logger(msg, level=False, endspace="\n"):
    """Print a timestamped message tagged with the calling function's name.

    Nothing is printed unless ``level`` is truthy, so callers pass one of the
    notebook's switch variables (debug / error / monitor) as ``level``.
    ``endspace`` is forwarded to ``print`` as its ``end`` argument.
    """
    if not level:
        return
    # Frame 1 of the stack is whoever called logger(); field 3 is its name.
    caller = inspect.stack()[1][3]
    print(time.ctime(), " ", caller, "(): ", msg, sep="", end=endspace)
        

def sparseadd(onevec, othvec, weight=1, normalised=False):
    """Return ``onevec + weight * othvec`` for sparse (dict) vectors.

    With ``normalised`` set, both operands are passed through normalise()
    before the addition. A new dict is returned; the inputs are not modified.
    """
    if normalised:
        onevec = normalise(onevec)
        othvec = normalise(othvec)
    total = {}
    try:
        # Start from a copy of the first operand ...
        for key in onevec:
            total[key] = onevec[key]
        # ... then fold in the weighted second operand.
        for key in othvec:
            contribution = othvec[key] * float(weight)
            if key in total:
                total[key] = total[key] + contribution
            else:
                total[key] = contribution
    except KeyError:
        print("sparseadd(): error")
        raise
    return total


def sparsemultiply(onevec, othvec, weight=1):
    """Element-wise product of two sparse vectors, scaled by ``weight``.

    Only keys present in both operands appear in the returned dict.
    """
    product = {}
    try:
        for key in onevec:
            if key not in othvec:
                continue
            product[key] = onevec[key] * othvec[key] * float(weight)
    except KeyError:
        print("sparsemultiply(): error ")
    return product


def sparsexor(onevec, othvec):
    """Return a sparse vector with 1 at every key set in exactly one operand.

    The keys of both operands are inspected directly. The earlier version
    probed the integers ``0 .. len(onevec) - 1``, which never matched the
    string keys produced by newrandomvector() and ignored keys that occur
    only in ``othvec``.
    """
    result = {}
    for key in onevec:
        if key not in othvec:
            result[key] = 1
    for key in othvec:
        if key not in onevec:
            result[key] = 1
    return result


def newrandomvector(n, denseness):
    """Create a sparse random index vector of width ``n``.

    ``int(n * denseness)`` positions (rounded up to an even number) are
    drawn at random; half of them get the value -1 and the rest +1. Keys are
    the positions as strings. An empty dict is returned when no positions
    are to be set.
    """
    vector = {}
    count = int(n * denseness)
    if count % 2:
        count += 1  # keep the +1 / -1 halves the same size
    if count <= 0:
        return vector
    positions = random.sample(list(range(n)), count)
    flipped = set(random.sample(positions, count // 2))
    for position in positions:
        vector[str(position)] = -1 if position in flipped else 1
    return vector


def newoperator(n, k=0.1):
    """Create a random operator vector: a random index vector of width ``n``
    whose denseness is ``k``."""
    operator = newrandomvector(n, k)
    return operator


def sparsecosine(xvec, yvec, rounding=True, decimals=4):
    """Cosine similarity between two sparse (dict) vectors.

    Returns 0 when either vector has zero length. The result is rounded to
    ``decimals`` places unless ``rounding`` is false.
    """
    xnorm2 = 0  # squared length of xvec
    ynorm2 = 0  # squared length of yvec
    dot = 0     # inner product over the shared keys
    try:
        for key in xvec:
            xnorm2 += xvec[key] * xvec[key]
    except KeyError:
        print("sparsecosine(): error at position ", key)
    try:
        for key in yvec:
            ynorm2 += yvec[key] * yvec[key]
            if key in xvec:
                dot += xvec[key] * yvec[key]
    except KeyError:
        print("sparsecosine(): errors at position ", key)
    if xnorm2 * ynorm2 == 0:
        similarity = 0
    else:
        similarity = dot / (math.sqrt(xnorm2) * math.sqrt(ynorm2))
    return round(similarity, decimals) if rounding else similarity


def sparselength(vec, rounding=True):
    """Euclidean length of a sparse (dict) vector.

    Rounded to four decimals unless ``rounding`` is false; an empty or
    all-zero vector has length 0.
    """
    squares = 0
    try:
        for key in vec:
            squares += vec[key] * vec[key]
    except KeyError:
        print("sparselength(): error at position ", key)
    norm = math.sqrt(squares) if squares > 0 else 0
    if rounding:
        norm = round(norm, 4)
    return norm


def comb(vec, k=0.1, dim=dimensionality):
    """Keep only the most extreme entries of a sparse vector.

    ``n = int(k * dim / 2)`` entries with the lowest values and ``n`` entries
    with the highest values are copied into a new dict. When ``n`` is not
    positive an empty dict is returned.

    The explicit guard matters: ``items[-0:]`` is the *whole* list, so without
    it a small ``k`` would keep every entry instead of none.
    """
    n = int(k * dim / 2)
    if n <= 0:
        return {}
    ranked = sorted(vec.items(), key=lambda item: item[1])
    newvector = dict(ranked[:n])   # n lowest values
    newvector.update(ranked[-n:])  # n highest values
    return newvector


def sparsesum(vec):
    """Sum of all values in a sparse vector, each converted with float()."""
    total = 0
    for value in vec.values():
        total = total + float(value)
    return total


def normalise(vec):
    """Scale a sparse vector to unit length.

    A vector whose length is not positive is returned unchanged (the same
    object, not a copy); otherwise a new dict is returned.
    """
    vlen = sparselength(vec, False)
    if not (vlen > 0):
        return vec
    divisor = math.sqrt(vlen * vlen)
    return {key: vec[key] / divisor for key in vec}


def modify(vec, factor):
    """Return a randomly perturbed copy of a sparse vector.

    For each entry one random draw decides whether it is kept as is (draw
    greater than ``factor``) or rescaled by a second draw mapped to the
    interval (-1, 1].
    """
    altered = {}
    for key in vec:
        keep = random.random() > factor
        if keep:
            altered[key] = vec[key]
        else:
            altered[key] = float(vec[key]) * (0.5 - random.random()) * 2.0
    return altered


def createpermutation(k):
    """Return a random ordering of the integers 0 .. k-1 as a list."""
    return random.sample(range(k), k)


def permute(vector, permutation):
    """Move the entries of a sparse vector according to ``permutation``.

    The value stored under key ``str(i)`` ends up under key
    ``str(permutation[i])``. Keys outside ``0 .. len(permutation) - 1`` are
    dropped from the result.
    """
    shuffled = {}
    try:
        for position, destination in enumerate(permutation):
            source = str(position)
            if source in vector:
                shuffled[str(destination)] = vector[source]
    except KeyError:
        shuffled = vector
        print("permute(): no permutation done, something wrong")
    return shuffled


def vectorsaturation(vector):
    """Count the entries of a sparse vector by iterating over it."""
    return sum(1 for _ in vector)


def frequencyweight(word):
    """Weight a word by its relative frequency: frequent words get less.

    Words without an entry in ``globalfrequency`` get the fixed weight 0.5.
    """
    if word not in globalfrequency:
        return 0.5
    relative = globalfrequency[word] / bign
    return math.exp(-300 * math.pi * relative)


def chkwordspace(words, debug=False):
    """Register every word in ``words`` in the word space.

    For each word the token counter ``bign`` and the word's entry in
    ``globalfrequency`` are incremented; a word without an index vector gets
    a fresh random one in ``indexspace``.

    The count is now accumulated. Previously a statement at loop level reset
    it to 1 on every occurrence, and a word present in ``indexspace`` but
    missing from ``globalfrequency`` raised KeyError.
    """
    global globalfrequency
    global indexspace
    global bign
    for w in words:
        bign += 1
        if w not in indexspace:
            indexspace[w] = newrandomvector(dimensionality, denseness)
            logger(str(w) + " is new and now hallucinated.", debug)
        globalfrequency[w] = globalfrequency.get(w, 0) + 1


def semanticdepparse(string, debug=False, verbose=False):
    """Parse ``string`` with the CoreNLP client and analyse each sentence.

    Lemmas that have no index vector yet are registered through
    chkwordspace(). Returns a list with one depparseprocess() result per
    sentence. ``verbose`` is accepted but not used.
    """
    annotation = parser_client.annotate(string)
    analyses = []
    for sentence in annotation.sentence:
        for token in sentence.token:
            if token.lemma in indexspace:
                continue
            chkwordspace([token.lemma])
        analyses.append(depparseprocess(string, sentence, debug))
    return analyses


def depparseprocess(string, ss, debug=False):
    """Extract a small role frame from one dependency-parsed sentence.

    Parameters:
        string: the raw text; only used in the log message emitted when the
            tense check hits a KeyError.
        ss: a parsed sentence exposing ``token`` (items with ``lemma`` and
            ``pos``) and ``basicDependencies`` (with ``root`` and ``edge``;
            each edge has ``source``, ``target`` and ``dep``).
        debug: passed to logger() as the level for the trace output.

    Returns:
        dict with the keys "question", "target", "verb", "adverbial" and
        "subject" (lemmas, or None when the slot stayed at "epsilon"),
        "tense" ("PRESENT", "PAST" or "FUTURE") and "negated" (bool).
    """
    # Every slot starts out as the placeholder "epsilon", which is mapped to
    # None in sentenceitems below.
    negated = False
    target = "epsilon"
    adverbial = "epsilon"
    subject = "epsilon"
    verb = "epsilon"
    qu = "epsilon"
    scratch = {}
    question = {}
    logger("root: "+str(ss.basicDependencies.root), debug)
    # Debug trace: tokens (numbered from 0 here) and the dependency edges.
    i = 0
    for w in ss.token:
        logger(str(i)+" "+w.lemma+" "+w.pos, debug)
        i += 1
    for e in ss.basicDependencies.edge:
        # Edge endpoints are treated as 1-based, hence the "- 1" on the token list.
        logger(str(e.source) + ss.token[e.source - 1].lemma + "-" + e.dep + "->" +
               str(e.target) + ss.token[e.target - 1].lemma, debug)
    sentenceitems = {}
    sentenceitems["epsilon"] = None
    sentencepos = {}
    root = ss.basicDependencies.root[0]  # only one root for now fix this!
    # Question word, target and verb all default to the root token.
    qu = root
    target = root
    verb = root
    # Index lemmas and POS tags by 1-based token position, matching the edge
    # endpoints; the last token tagged WP or WRB becomes the question word.
    i = 1
    for w in ss.token:
        sentenceitems[i] = w.lemma
        sentencepos[i] = w.pos
        scratch[i] = False
        if w.pos == "WP":
            qu = i
        if w.pos == "WRB":
            qu = i
        i += 1
    # Tense defaults to PRESENT; a VBD or VBN root makes it PAST.
    tense = "PRESENT"
    if sentencepos[root] == "VBD":
        tense = "PAST"
    if sentencepos[root] == "VBN":
        tense = "PAST"

    # Walk the dependency edges and fill the slots by relation type.
    for edge in ss.basicDependencies.edge:
        logger(str(edge.source) + " " + sentenceitems[edge.source] +
               " " + "-" + " " + edge.dep + " " + "->" + " " +
               str(edge.target) + " " + sentenceitems[edge.target], debug)
        if edge.dep == 'nsubj':
            subject = edge.target
        elif edge.dep == 'neg':
            negated = True
        elif edge.dep == 'advmod':
            # An advmod pointing at the question word fixes the target: its
            # head, or "epsilon" when that head is the root. Any other
            # advmod is recorded as the adverbial.
            if edge.target == qu:
                if edge.source == root:
                    target = "epsilon"
                else:
                    target = edge.source
            else:
                adverbial = edge.target
        elif edge.dep == 'cop':
            if edge.target == qu:
                target = edge.source
            else:
                adverbial = edge.target
        elif edge.dep == 'aux':
            # Remember which auxiliary was seen (a later aux edge overwrites
            # an earlier one); a VBD auxiliary marks the sentence as PAST.
            if (sentenceitems[edge.target] == "have"):
                scratch['aux'] = "have"
            if (sentenceitems[edge.target] == "do"):
                scratch['aux'] = "do"
            if (sentencepos[edge.target] == "VBD"):
                tense = "PAST"
            if (sentenceitems[edge.target] == "will"):
                scratch['aux'] = "will"
            if (sentenceitems[edge.target] == "shall"):
                scratch['aux'] = "shall"
    # Without an explicit target, fall back to the subject if there is one.
    if target == "epsilon":
        if subject != "epsilon":
            target = subject
    try:
        logger(sentenceitems[root] + " " + sentencepos[root], debug)
        # A VB root with the auxiliary "will" or "shall" is FUTURE.
        if sentencepos[root] == "VB":
            if 'aux' in scratch:
                if (scratch['aux'] == "will" or scratch['aux'] == "shall"):
                    tense = "FUTURE"
    except KeyError:
        logger("tense situation in " + string, True)
    # Translate token positions (or "epsilon") into lemmas (or None).
    question["question"] = sentenceitems[qu]
    question["target"] = sentenceitems[target]
    question["verb"] = sentenceitems[verb]
    question["adverbial"] = sentenceitems[adverbial]
    question["subject"] = sentenceitems[subject]
    question["tense"] = tense
    question["negated"] = negated
#    logger(question["question"] + " " + question["target"] + " " +
#           question["verb"] + " " + question["adverbial"] + " " +
# question["subject"] + " " + question["tense"] + " " +
# question["negated"] + " " + sep="\t",debug)
    return question
def evaluateConfusionMatrix(confusionmatrix):
    """Print a per-class report for a nested confusion matrix.

    ``confusionmatrix`` maps gold label -> {predicted label -> count}. For
    each gold label (in sorted order) the predictions are listed by
    descending count, the correct one marked with "***", followed by a
    summary line: hits, total and hits / total.

    A gold label with no recorded predictions reports a ratio of 0.0 instead
    of raising ZeroDivisionError.
    """
    for gold in sorted(confusionmatrix):
        print("---")
        total = 0
        hitn = 0
        ranked = sorted(confusionmatrix[gold].items(),
                        key=lambda glitter: glitter[1],
                        reverse=True)
        for predicted, count in ranked:
            hit = ""
            total += count
            if predicted == gold:
                hit = "***"
                hitn = count
            print(gold, predicted, count, hit, sep="\t")
        ratio = hitn / total if total else 0.0
        print(gold, "sum", hitn, total, ratio, sep="\t")



def addconfusion(facit, predicted):
    """Count one (gold, predicted) observation in the global confusion matrix.

    ``facit`` is the gold label; rows and cells are created on first use.
    """
    global confusionmatrix
    row = confusionmatrix.setdefault(facit, {})
    row[predicted] = row.get(predicted, 0) + 1


def weightfunction(word):
    """Return the weight of ``word``: its count in ``globalfrequency`` if it
    has one, otherwise 0.1 for "be" and 1 for anything else."""
    if word in globalfrequency:
        return globalfrequency[word]
    return 0.1 if word == "be" else 1


def readthefile(trainfile, category, debug=False, train=True, lexical=True, roles=True):
    """Read one text file and return the sum of its utterance vectors.

    The text is lower-cased, whitespace-normalised and split into sentences.
    Every sentence is tokenised (registering its words in the word space) and
    dependency-parsed; each resulting analysis is turned into a vector by
    getvector(), stored in ``utterancespace`` under a running index and added
    to the file vector.

    Parameters:
        trainfile: path of a UTF-8 text file.
        category: accepted for symmetry with the other file readers; not used.
        debug: logger level for trace output.
        train, lexical, roles: forwarded to getvector().

    Returns:
        dict: the sparse sum of all utterance vectors of the file (empty when
        the file cannot be decoded or nothing could be parsed).
    """
    global utterancespace
    global idx
    ticker = 0
    batch = 100  # print one progress dot per this many sentences
    textvector = {}
    with codecs.open(trainfile, "r", encoding='utf-8') as infile:
        try:
            rawtext = infile.read().lower()
        except UnicodeDecodeError:
            rawtext = ""
            logger("File Read Problem: "+trainfile, error)
    # Raw-string patterns avoid the invalid "\s" escape in a plain literal.
    rawtext = re.sub(r'\n', ' ', rawtext)
    rawtext = re.sub(r'"', ' ', rawtext)
    rawtext = re.sub(r'\s+', ' ', rawtext)
    for sentence in sent_tokenize(rawtext):
        idx += 1
        ticker += 1
        if ticker >= batch and monitor:
            print(".", end="")
            ticker = 0
        logger(sentence, debug)
        wds = nltk.word_tokenize(sentence)
        chkwordspace(wds, debug)
        analyses = []
        try:
            analyses = semanticdepparse(sentence.lower(), debug)
        except Exception:
            # Best effort: a sentence the parser cannot handle is logged and
            # skipped. KeyboardInterrupt / SystemExit are no longer swallowed.
            logger("PARSE ERROR "+str(idx)+"\t"+sentence, error)
        kk = 0
        for analysis in analyses:
            wds = analysis.values()
            chkwordspace(wds, debug)
            for role in analysis:
                if role not in permutationcollection:
                    permutationcollection[role] = createpermutation(dimensionality)
            # ``train`` must go by keyword: positionally it would land in
            # getvector()'s ``character`` parameter.
            u = getvector(analysis, sentence, debug, lexical, roles, train=train)
            # The first analysis reuses the sentence index; further analyses
            # of the same sentence get an index of their own.
            if (kk > 0):
                idx += 1
            utterancespace[idx] = u
            textvector = sparseadd(textvector, u, 1)
            kk += 1
    return textvector


def _indexvector_and_weight(item, train):
    """Return ``(index vector, weight)`` for ``item``, or None to skip it.

    Known items are weighted by specificity(). Unknown items are registered
    with weight 1 while training and skipped otherwise.
    """
    if item in indexspace:
        return indexspace[item], specificity(item)
    if not train:
        return None
    chkwordspace([item])
    return indexspace[item], 1


def getvector(roleworddict, sentencestring, debug=False, lexical=True, roles=True, character=True, train=True, style=False):
    """Build the sparse vector for one analysed utterance.

    Parameters:
        roleworddict: role -> filler, as returned by depparseprocess().
        sentencestring: the sentence text.
        debug: accepted for signature compatibility; not used.
        lexical: add the normalised index vector of every token.
        roles: add, for every role, the filler's normalised index vector
            permuted with that role's permutation.
        character, style: reserved switches; these features are not
            implemented yet and contribute nothing to the vector.
        train: when false, items without an index vector are ignored instead
            of being registered.

    Returns:
        dict: the weighted sum of all contributing vectors.

    Items without an index vector used to raise KeyError, even in the branch
    that had just assigned them weight 0.
    """
    uvector = {}  # vector for test item
    if roles:
        for role in roleworddict:
            found = _indexvector_and_weight(roleworddict[role], train)
            if found is None:
                continue
            vector, factor = found
            uvector = sparseadd(uvector,
                                permute(normalise(vector),
                                        permutationcollection[role]), factor)
    if lexical:
        for item in word_tokenize(sentencestring):
            found = _indexvector_and_weight(item, train)
            if found is None:
                continue
            vector, factor = found
            uvector = sparseadd(uvector, normalise(vector), factor)
    if character:
        # TODO: character 3-, 4- and 5-gram features. The earlier stub built
        # the n-gram lists on every call and then discarded them.
        pass
    if style:
        # TODO: style features (characters per word, words per sentence). The
        # earlier stub computed both and dropped them, dividing by zero for a
        # sentence without tokens.
        pass
    # seq nchargrams
    # seq newordgrams
    # adverbials
    # hedges and amps
    # verb classes use wordspace!
    # sent sequences
    return uvector

TODO: organise the weighting scheme.


In [211]:
def idftable(trainfile, category):
    """Update the document-, category- and sentence-frequency tables from one file.

    Global tables maintained here (each is created on first use):
        dfn: number of files read.
        df:  word -> number of files containing it.
        cft: word -> list of categories it was seen in.
        categories: set of all category labels.
        sn:  number of sentences read.
        sf:  word -> number of sentences containing it.

    Parameters:
        trainfile: path of a UTF-8 text file.
        category: label of the file's category; stored as a single label.
    """
    global df
    global dfn
    global cft
    global categories
    global sf
    global sn
    # The tables may not exist on a fresh kernel; create whichever is missing.
    try:
        sn
    except NameError:
        sn = 0
    try:
        dfn
    except NameError:
        dfn = 0
    try:
        categories
    except NameError:
        categories = set()
    try:
        df
    except NameError:
        df = {}
    try:
        cft
    except NameError:
        cft = {}
    try:
        sf
    except NameError:
        sf = {}
    # add(), not update(): update() would split a multi-character label into
    # its individual characters.
    categories.add(category)
    with codecs.open(trainfile, "r", encoding='utf-8') as infile:
        dfn += 1
        try:
            rawtext = infile.read().lower()
        except UnicodeDecodeError:
            rawtext = ""
            logger("File Read Problem: "+trainfile, error)
    # Document and category frequencies: each distinct word counts once per file.
    for wd in set(nltk.word_tokenize(rawtext)):
        df[wd] = df.get(wd, 0) + 1
        seen = cft.setdefault(wd, [])
        if category not in seen:
            seen.append(category)
    # Sentence frequencies: each distinct word counts once per sentence.
    for sentence in nltk.sent_tokenize(rawtext):
        sn += 1
        for wd in set(nltk.word_tokenize(sentence)):
            sf[wd] = sf.get(wd, 0) + 1

                    
def x2table(trainfile, category):
    """Accumulate the token counts of one file for the chi-square style weighting.

    For every token of the file this increments
        bign                          - total number of tokens,
        termfrequency[word][category] - count of the word within the category,
        globalfrequency[word]         - count of the word overall,
        categorysize[category]        - number of tokens in the category.

    The per-category branch was syntactically broken (an ``else:`` without a
    body, and ``+= 1`` applied to the inner dict); it now counts per
    (word, category) pair.
    """
    global globalfrequency
    global termfrequency
    global categorysize
    global bign
    with codecs.open(trainfile, "r", encoding='utf-8') as infile:
        try:
            rawtext = infile.read().lower()
        except UnicodeDecodeError:
            rawtext = ""
            logger("File Read Problem: "+trainfile, error)
    for wd in nltk.word_tokenize(rawtext):
        bign += 1
        percategory = termfrequency.setdefault(wd, {})
        percategory[category] = percategory.get(category, 0) + 1
        globalfrequency[wd] = globalfrequency.get(wd, 0) + 1
        categorysize[category] = categorysize.get(category, 0) + 1


# Build the frequency tables from the training files of authors A, B and C
# (two files per author). The tables are reset first, so re-running this cell
# starts from scratch.
traindir = "/home/jussi/data/pan/pan12A/"
debug = False  # logger level
error = True   # logger level
monitor = True # logger level
globalfrequency = {}
termfrequency = {}
categorysize = {}
bign = 0
for task in "A":
    for author in "A", "B", "C":
        logger("\nauthor "+author+" at "+time.ctime(),monitor)
        av = {}
        for item in "1", "2":
            # File names follow the pattern clean.12<task>train<author><item>.txt
            trainfile = traindir+"clean.12"+task+"train"+author+item+".txt"
            x2table(trainfile,author)

Wed Feb 21 21:40:39 2018 <module>(): 
author A at Wed Feb 21 21:40:39 2018
Wed Feb 21 21:40:39 2018 <module>(): 
author B at Wed Feb 21 21:40:39 2018
Wed Feb 21 21:40:39 2018 <module>(): 
author C at Wed Feb 21 21:40:39 2018


In [206]:
def specificity(wd, category=None):
    """Score how specific a word is, as a weighted sum of four terms.

    Terms (each falls back to 0.1 when it cannot be computed):
        c: log(number of categories / (1 + categories containing the word))
        d: log(number of files / (1 + files containing the word))
        s: log(number of sentences / (1 + sentences containing the word))
        x: expected minus observed count of the word in ``category``, where
           expected = globalfrequency[wd] * categorysize[category] / bign.

    The terms are weighted by the globals ``dc``, ``dd``, ``ds`` and ``dx``.

    Parameters:
        wd: the word to score.
        category: optional category label for the x term. It defaults to
            None because the existing callers pass only the word.

    Returns:
        float: dx*x + dc*c + dd*d + ds*s
    """
    if wd in cft:
        c = math.log(len(categories)/(1+len(cft[wd]))) # (len(categories)/(len(categories)+1+len(cft[wd])))**4
    else:
        c = 0.1
    if wd in df:
        d = math.log(dfn/(1+df[wd])) # (dfn/(dfn+df[wd]+1))**4
    else:
        d = 0.1
    if wd in sf:
        s = math.log(sn/(sf[wd]+1)) # (sn/(sf[wd]+1+sn))**4
    else:
        s = 0.1
    if (category is not None and wd in globalfrequency
            and category in categorysize and bign > 0):
        expected = globalfrequency[wd]*categorysize[category]/bign
        # Observed count of the word in this category; the earlier code
        # subtracted the whole termfrequency dict, which raises TypeError.
        observed = termfrequency.get(wd, {}).get(category, 0)
        x = expected - observed
    else:
        x = 0.1
    # No cell assigns dx; until one does, the x term is switched off so the
    # result stays the three-term sum.
    try:
        xweight = dx
    except NameError:
        xweight = 0
    return (xweight*x+dc*c+dd*d+ds*s)

In [209]:
# Give the sentence- (ds), category- (dc) and document-level (dd) terms equal
# weight, then score one sample word.
ds = 1
dc = 1
dd = 1
specificity("victor")

7.048246268300952

In [210]:
traindir = "/home/jussi/data/pan/pan12A/"
denseness = 0.2
# NOTE(review): every experiment below resets indexspace, so the vector
# registered here does not survive into them - confirm whether "epsilon"
# should be registered after the reset instead.
chkwordspace(["epsilon"])
debug = False  # logger level
error = True   # logger level
monitor = True # logger level

# Gold-standard author for each test file number.
facit = {"1":"B", "2":"A", "3":"A", "4":"C", "5":"C", "6":"B"}


def _rankedsimilarities(filevector):
    """Return (author, cosine) pairs for one file vector, best match first."""
    res = {}
    for author in authorspace:
        res[author] = sparsecosine(filevector, authorspace[author])
    return sorted(res.items(), key=lambda i: i[1], reverse=True)


def runexperiment(label, categoryweight, documentweight, sentenceweight):
    """Train on the author files, then rank the authors for every file.

    The three weights are stored in the globals ``dc``, ``dd`` and ``ds``.
    All per-run state (word space, utterance space, permutations, file and
    author vectors) is reset, so the runs are independent of each other. The
    results stay available in the globals ``trainfiles``, ``testfiles`` and
    ``authorspace``.

    Parameters:
        label: heading printed at the start of the run.
        categoryweight, documentweight, sentenceweight: values for dc, dd, ds.
    """
    global idx, dc, dd, ds
    global trainfiles, testfiles, authorspace
    global indexspace, utterancespace, permutationcollection
    idx = 0
    dc = categoryweight
    dd = documentweight
    ds = sentenceweight
    print(label)
    trainfiles = {}
    authorspace = {}
    indexspace = {}
    utterancespace = {}
    permutationcollection = {}
    # Training: one vector per file, summed into one vector per author.
    for task in "A":
        for author in "A", "B", "C":
            logger("\nauthor "+author+" at "+time.ctime(), monitor)
            av = {}
            for item in "1", "2":
                trainfile = traindir+"clean.12"+task+"train"+author+item+".txt"
                filevector = readthefile(trainfile, author, debug, True)
                trainfiles[trainfile] = filevector
                av = sparseadd(av, filevector)
            authorspace[author] = av
    print()
    # Sanity check: each training file against all author vectors.
    for file in trainfiles:
        key = file[-6:-5]  # the author letter in "...train<author><item>.txt"
        print(key, _rankedsimilarities(trainfiles[file]), sep="\t")
    print()
    # Testing: the author of a test file is unknown, so no category is passed
    # (the earlier code passed whatever the training loop had left in
    # ``author``).
    testfiles = {}
    for task in "A":
        logger("testing at "+time.ctime(), monitor)
        for item in "1", "2", "3", "4", "5", "6":
            testfile = traindir+"clean.12"+task+"test"+"0"+item+".txt"
            testfiles[testfile] = readthefile(testfile, None, debug, False)
    print()
    for file in testfiles:
        key = file[-5]  # the file number in "...test0<n>.txt"
        print(key, facit[key], _rankedsimilarities(testfiles[file]), sep="\t")


# One run per weighting: sentence-level only (S), category-level only (C),
# document-level only (D).
for label, categoryweight, documentweight, sentenceweight in (
        ("S", 0, 0, 1),
        ("C", 1, 0, 0),
        ("D", 0, 1, 0)):
    runexperiment(label, categoryweight, documentweight, sentenceweight)

S
Wed Feb 21 18:26:33 2018 <module>(): 
author A at Wed Feb 21 18:26:33 2018
......Wed Feb 21 18:26:59 2018 <module>(): 
author B at Wed Feb 21 18:26:59 2018
.....Wed Feb 21 18:27:34 2018 <module>(): 
author C at Wed Feb 21 18:27:34 2018
....
B	[('B', 0.9035), ('A', 0.4211), ('C', 0.3433)]
A	[('A', 0.8883), ('C', 0.6832), ('B', 0.673)]
C	[('C', 0.9647), ('A', 0.7255), ('B', 0.5038)]
B	[('B', 0.8573), ('A', 0.7312), ('C', 0.6711)]
C	[('C', 0.8461), ('A', 0.6905), ('B', 0.5414)]
A	[('A', 0.9515), ('C', 0.7354), ('B', 0.5341)]

Wed Feb 21 18:27:52 2018 <module>(): testing at Wed Feb 21 18:27:52 2018
................
6	B	[('A', 0.7011), ('C', 0.6768), ('B', 0.6095)]
1	B	[('A', 0.7042), ('C', 0.6826), ('B', 0.6232)]
5	C	[('A', 0.7739), ('C', 0.7732), ('B', 0.5267)]
3	A	[('A', 0.8541), ('C', 0.7101), ('B', 0.6688)]
4	C	[('A', 0.728), ('C', 0.6958), ('B', 0.496)]
2	A	[('A', 0.8416), ('C', 0.6665), ('B', 0.4729)]
C
Wed Feb 21 18:29:03 2018 <module>(): 
author A at Wed Feb 21 18:29:03 2018
....

In [200]:
# For every test file and every author, print: author, test file number, gold
# author of that file, and the cosine between the author and file vectors.
for l in testfiles:
    for k in authorspace:
        print(k,l[-5],facit[l[-5]],sparsecosine(authorspace[k],testfiles[l])) 

B 6 B 0.927
A 6 B 0.9593
C 6 B 0.956
B 1 B 0.8952
A 1 B 0.9546
C 1 B 0.9519
B 5 C 0.9003
A 5 C 0.9584
C 5 C 0.9568
B 3 A 0.8985
A 3 A 0.9712
C 3 A 0.9621
B 4 C 0.7994
A 4 C 0.8718
C 4 C 0.8695
B 2 A 0.926
A 2 A 0.975
C 2 A 0.9587
