# Linguistics TP7
Ning Tientso

09 December 2019

In [1]:
import sys
import numpy as np

In [52]:
#emission and transition probabilities are "learned" in a perception (one layer neural network)
#meaning the application of previous HMM-trigram tagger does not change
#the only thing that changes is how we calculate the values of emission/transitions

#global variables
tags = set()
words = set()

q = {} #we're gonna keep the transition probabilities here
e = {} #we're gonna keep the emission probabilities here
    
#viterbi proper, taken directly from solutions provided for TP3
def viterbi(sentence):
    
    n = len(sentence)
    pi = {}
    pi[(-1, "*", "*")] = 1
    bp = {}
    S = {}
    S[-2] = ["*"]
    S[-1] = ["*"]
    
    for k in range(n):
        S[k] = tags
        
    for k in range(n):
        if sentence[k] in words:
            word = sentence[k]
        else:
            word = "_RARE_"
            
        for u in S[k-1]:
            for v in S[k]:
                maxProb = 0
                maxClass = ""
                for w in S[k-2]:
                    prob = pi.get((k-1, w, u), 0) * q.get((v,w,u), 0) * e.get((word, v), 0)
                    if prob > maxProb:
                        maxProb = prob
                        maxClass = w
                if maxProb > 0:
                    pi[(k,u,v)] = maxProb
                    bp[(k,u,v)] = maxClass
    
    maxProb = 0
    maxU, maxV = "", ""
    for u in S[n-2]:
        for v in S[n-1]:
            prob = pi.get((n-1, u, v),0) * q.get(("STOP",u,v),0)
            if prob > maxProb:
                maxProb = prob
                maxU, maxV = u, v
    
    y = {}
    if maxU == "":
        y[n-2] = "NOGENE"
    else:
        y[n-2] = maxU
    
    if maxV == "":
        y[n-1] = "NOGENE"
    else:
        y[n-1] = maxV

    for k in range(n-3, -1, -1):
        try:
            y[k] = bp[(k+2, y[k+1], y[k+2])]
        except KeyError:
            y[k] = "NOGENE"
    return [y[k] for k in sorted(y)]

In [3]:
#helper functions to access files and tag proper
def sentenceIterator(filehandle):
    currentSentence = [] #Buffer for the current sentence
    for l in filehandle:
        l = l.strip()
        if l == "":
            if currentSentence:
                yield currentSentence
                currentSentence = []
            else:
                sys.stderr.write("WARNING: Got empty input file/stream.\n")
                raise StopIteration
        else:
            currentSentence.append(l)
    if currentSentence:
        yield currentSentence

def tagFile(infilename, outfilename):
    infile = open(infilename, "r")
    outfile = open(outfilename, "w")
    for sentence in sentenceIterator(infile):
        tagSequence = viterbi(sentence)
        for i in range(len(sentence)):
            outfile.write("{0} {1}\n".format(sentence[i], tagSequence[i]))
        outfile.write("\n")
    infile.close()
    outfile.close()

In [51]:
#perceptron to learn emission and transition probabilities (e and q)
def perceptron_algorithm (filename, output, T):
    '''
    Where T is the number of iterations over the training set.
    '''
    #init all parameters to zero by reading the sentences and inserting 0 values for each in q/e
    #we have to split the tag from the word in gene.train and populate q and e separately
    fp = open(filename, "r")
    all_sentences = [x for x in sentenceIterator(fp)]
    #isolate q/e vals
    s1 = []
    s2 = []
    for sentence in all_sentences:
        p1 = []
        p2 = []
        for word in sentence:
            p1.append(word.split()[0])
            p2.append(word.split()[1])
        s1.append(p1)
        s2.append(p2)
    #populate q/e vals
    for sentence in s1:
        for i in range(0,len(sentence)-2):
            q[(sentence[i],sentence[i+1],sentence[i+2])] = 0 #set q to zero
    for k, sentence in enumerate(s1):
        for i in range(0,len(sentence)):
            e[(sentence[i],s2[k][i])] = 0 #set e to zero
    
    #compare labels to viterbi output (with current itr q/e vals) and adjust
    #run again until convergence
    for t in range(T):
        #for every sentence
        for sentence in s1:
            #get best tagged sequence via viterbi
            tagSequence = viterbi(sentence)
            #for every trigram 
            for trigram in q:
                #search tagSequence for trigram
                for c in range(0, len(tagSequence)-3, 3):
                    if tagSequence[c:c+3] == trigram:
                        q[trigram] += 1
                
        #for every tag,word pair
        for pair in e:
            #search for pair
            for c in range(0, len(s1)):
                if (s1[c],s2[c]) == pair:
                    e[pair] += 1
            
    
    #run viterbi so it can let me down one last time...
    tagFile(filename, output)

In [16]:
fp = open("./gene/gene.train", "r")
all_sentences = [x for x in sentenceIterator(fp)]
some_sentences = all_sentences[0:5]

#how to isolate q/e vals that we want
s1 = []
s2 = []

for sentence in some_sentences:
    p1 = []
    p2 = []
    for word in sentence:
        p1.append(word.split()[0])
        p2.append(word.split()[1])
    s1.append(p1)
    s2.append(p2)
    
print(s1) #see if it is legit
print(s2)

fp.close()

[['Comparison', 'with', 'alkaline', 'phosphatases', 'and', '5', '-', 'nucleotidase'], ['Pharmacologic', 'aspects', 'of', 'neonatal', 'hyperbilirubinemia', '.'], ['When', 'CSF', '[', 'HCO3', '-]', 'is', 'shown', 'as', 'a', 'function', 'of', 'CSF', 'PCO2', 'the', 'data', 'of', 'K', '-', 'depleted', 'rats', 'are', 'no', 'longer', 'displaced', 'when', 'compared', 'to', 'controls', 'but', 'still', 'have', 'a', 'significantly', 'greater', 'slope', '(', '1', '.', '21', '+/-', '0', '.', '23', 'vs', '.'], ['Flurazepam', 'thus', 'appears', 'to', 'be', 'an', 'effective', 'hypnotic', 'drug', 'with', 'the', 'optimum', 'dose', 'for', 'use', 'in', 'general', 'practice', 'being', '15', 'mg', 'at', 'night', '.'], ['Beta', 'blocking', 'agents', '.']]
[['NOGENE', 'NOGENE', 'GENE', 'GENE', 'NOGENE', 'GENE', 'GENE', 'GENE'], ['NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE'], ['NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', 'NOGENE', '

In [54]:
perceptron_algorithm("./gene/gene.train","tester.test",1)

KeyboardInterrupt: 