# HMM Para Named Entity Recognition en Español

### Librerías

In [3]:
import codecs
import random
import math

### Clase Train

In [4]:
class trainTag:
    """Collect HMM counts and probabilities from tagged training sentences.

    Each sentence is a sequence of pairs where ``pair[0]`` is the word and
    ``pair[1]`` is its tag (called "pos" throughout this file).

    Attributes:
        trainSents: the sentences passed to the constructor.
        sentsNum: number of sentences.
        wordNum: number of words seen by ``train``.
        wordPosFreq: wordPosFreq[word] = {pos1: fre1, pos2: fre2}.
        posFreq: posFreq[pos] = fre.
        posTransFreq: posTransFreq[pos] = {pos1: fre1, ...}; how often
            ``pos`` is immediately followed by ``pos_x`` inside a sentence.
        posTransPro: same keys as posTransFreq, with each row normalized to
            probabilities.
        wordPosHeadFreq: wordPosHeadFreq[word] = {pos1: fre1, ...}; how often
            ``word`` opens a sentence with each pos.
        wordPosHeadPro: same keys as wordPosHeadFreq, with each row
            normalized to probabilities.
    """

    def __init__(self, sents):
        self.trainSents = sents
        self.sentsNum = len(sents)
        self.wordNum = 0
        # The tables are created per instance. As class attributes they would
        # be one shared dict per table, and training a second tagger would
        # silently add its counts on top of the first one's.
        self.wordPosFreq = {}
        self.posFreq = {}
        self.posTransFreq = {}
        self.posTransPro = {}
        self.wordPosHeadFreq = {}
        self.wordPosHeadPro = {}

    @staticmethod
    def _countPair(table, outerKey, innerKey):
        """Add one to table[outerKey][innerKey], creating entries as needed."""
        row = table.setdefault(outerKey, {})
        row[innerKey] = row.get(innerKey, 0) + 1

    @staticmethod
    def _normalizeInto(freqTable, proTable):
        """Store each row of freqTable in proTable as relative frequencies."""
        for key, counts in freqTable.items():
            total = sum(counts.values())
            proTable[key] = {
                inner: 1.0 * count / total for inner, count in counts.items()
            }

    def train(self):
        """Fill the frequency tables from trainSents, then derive probabilities.

        Counts are added to whatever the tables already hold, so calling
        this twice on the same instance doubles every frequency.
        """
        print('raining:')
        for sent in self.trainSents:
            prePos = None
            for index, pair in enumerate(sent):
                word = pair[0]
                pos = pair[1]

                self.wordNum += 1
                self.posFreq[pos] = self.posFreq.get(pos, 0) + 1
                self._countPair(self.wordPosFreq, word, pos)

                if index == 0:
                    # Sentence-initial words feed the start distribution.
                    self._countPair(self.wordPosHeadFreq, word, pos)
                else:
                    # Every other word contributes one prePos -> pos transition.
                    self._countPair(self.posTransFreq, prePos, pos)
                prePos = pos

        self._normalizeInto(self.posTransFreq, self.posTransPro)
        self._normalizeInto(self.wordPosHeadFreq, self.wordPosHeadPro)

### Clase Test

In [5]:
class wordNode:
    """One word of a sentence together with its candidate tag nodes.

    Attributes:
        word: the word itself.
        posNodeList: the list object passed in; it is stored as-is, not
            copied, so callers may keep appending to it.
    """

    def __init__(self, word, posNodeList):
        self.word, self.posNodeList = word, posNodeList

class posNode:
    """A candidate tag for a word, with the cost and path that reach it.

    Attributes:
        pos: the tag.
        cost: cost attached to this node by the caller.
        bestPath: the path object passed in; it is stored as-is, not copied.
    """

    def __init__(self, pos, cost, bestPath):
        self.pos, self.cost, self.bestPath = pos, cost, bestPath

class testTag:
    """Tag test sentences with a minimum-cost path search and score them.

    Costs are negative log probabilities, so the cheapest path through the
    lattice of candidate tags is the most probable tag sequence under the
    supplied tables.

    Attributes:
        testSents: sentences to tag; each is a sequence of pairs with the
            word at ``pair[0]`` and the gold tag at ``pair[1]``.
        wordPosFreq: wordPosFreq[word] = {pos1: fre1, pos2: fre2}.
        posFreq: posFreq[pos] = fre.
        posTransPro: posTransPro[pos] = {pos1: pro1, ...}; probability of
            ``pos`` being followed by ``pos_x``.
        wordPosHeadPro: wordPosHeadPro[word] = {pos1: pro1, ...};
            probability of each pos when ``word`` opens a sentence.
        outputFile: path of the file the tagged sentences are written to.
        sentsNum: number of test sentences.
        wordNum: number of words processed by ``test``.
        correctNum: number of words whose predicted tag equals the gold tag.
        correctRatio: correctNum / wordNum (0 when no word was processed).
    """

    # Cost charged to a sentence-initial tag with no start probability.
    _UNSEEN_COST = 30
    # Starting value for "cheapest cost so far"; a path is only kept when
    # its cost is strictly below this.
    _MAX_COST = 100000000

    def __init__(self, sents, wordPosFreq, posFreq, posTransPro, wordPosHeadPro, outputFile):
        self.testSents = sents
        self.sentsNum = len(sents)
        self.wordPosFreq = wordPosFreq
        self.posFreq = posFreq
        self.posTransPro = posTransPro
        self.wordPosHeadPro = wordPosHeadPro
        self.outputFile = outputFile

        self.wordNum = 0
        self.correctNum = 0
        self.correctRatio = 0

    def _headNode(self, word):
        """Build the lattice node for the first word of a sentence."""
        wNode = wordNode(word, [])
        if word in self.wordPosFreq:
            headPro = self.wordPosHeadPro.get(word, {})
            candidates = self.wordPosFreq[word]
        else:
            # Unknown word: every tag seen in training is a candidate.
            headPro = {}
            candidates = self.posFreq

        # Tags the word was seen opening a sentence with use their real cost.
        for p in headPro:
            cost = -math.log(headPro[p])
            wNode.posNodeList.append(posNode(p, cost, [p]))
        # Every remaining candidate gets the flat penalty.
        for p in candidates:
            if p not in headPro:
                wNode.posNodeList.append(posNode(p, self._UNSEEN_COST, [p]))
        return wNode

    def _stepNode(self, word, prePosNodeList):
        """Build the lattice node for a non-initial word.

        For each candidate tag, pick the predecessor in prePosNodeList that
        gives the lowest total cost and extend that predecessor's path.
        """
        wNode = wordNode(word, [])
        known = word in self.wordPosFreq
        # Unknown words may take any tag and pay no emission cost.
        candidates = self.wordPosFreq[word] if known else self.posFreq
        # Cost of a transition that never occurred in training.
        unseenTransCost = -math.log(math.exp(-30))

        for p in candidates:
            emitCost = None
            if known:
                emitCost = -math.log(1.0 * self.wordPosFreq[word][p] / self.posFreq[p])

            minCost = self._MAX_COST
            maxPreNode = posNode('', 0, [])
            for pn in prePosNodeList:
                transCost = unseenTransCost
                if pn.pos in self.posTransPro and p in self.posTransPro[pn.pos]:
                    transCost = -math.log(self.posTransPro[pn.pos][p])

                cost = pn.cost + transCost
                if emitCost is not None:
                    cost += emitCost
                # Strict comparison: on a tie the earlier predecessor wins.
                if cost < minCost:
                    minCost = cost
                    maxPreNode = pn

            path = list(maxPreNode.bestPath)
            path.append(p)
            wNode.posNodeList.append(posNode(p, minCost, path))
        return wNode

    def _decode(self, sent):
        """Return the cheapest tag path for sent, or [] for an empty sentence."""
        lastNode = None
        for pair in sent:
            word = pair[0]
            if lastNode is None:
                lastNode = self._headNode(word)
            else:
                lastNode = self._stepNode(word, lastNode.posNodeList)

        if lastNode is None:
            return []

        minCost = self._MAX_COST
        bestPath = []
        for pn in lastNode.posNodeList:
            if pn.cost < minCost:
                minCost = pn.cost
                bestPath = pn.bestPath
        return bestPath

    def test(self):
        """Tag every test sentence, write the result and print the accuracy.

        Each sentence becomes one line of ``word/tag`` items in outputFile.
        An empty sentence produces an empty line. wordNum and correctNum
        accumulate across calls.
        """
        # The with-block closes the file even if decoding raises.
        with codecs.open(self.outputFile, 'w+', 'utf-8') as output:
            print('test:')
            for sent in self.testSents:
                bestPath = self._decode(sent)

                for index, pair in enumerate(sent):
                    word = pair[0]
                    pos = pair[1]
                    self.wordNum += 1

                    output.write(word)
                    output.write('/' + bestPath[index] + '  ')
                    if pos == bestPath[index]:
                        self.correctNum += 1
                output.write('\r\n')

        # Guard against dividing by zero when there was nothing to tag.
        if self.wordNum:
            self.correctRatio = 1.0 * self.correctNum / self.wordNum
        else:
            self.correctRatio = 0

        print('correctNum')
        print(self.correctNum)
        print('wordNum')
        print(self.wordNum)
        print('correctRatio')
        print(self.correctRatio)

### Parte principal del código

In [6]:
trainFile = 'esp_train.txt'
testFile = 'esp_testa.txt'


def readSents(path):
    """Read a UTF-8 file with one "word tag" pair per line into sentences.

    Lines that are empty after stripping separate sentences. Each non-empty
    line is split on single spaces; the first field is the word and the
    second is the tag.

    Args:
        path: path of the file to read.

    Returns:
        A list of sentences, each a list of ``[word, tag]`` pairs. Runs of
        blank lines do not produce empty sentences, and a final sentence
        with no blank line after it is still included.
    """
    sents = []
    pairList = []
    with codecs.open(path, 'r', encoding='utf_8') as f:
        for line in f:
            line = line.strip()
            if line:
                l = line.split(' ')
                pairList.append([l[0], l[1]])
            elif pairList:
                sents.append(pairList)
                pairList = []
    # Keep the last sentence when the file does not end with a blank line.
    if pairList:
        sents.append(pairList)
    return sents


if __name__ == '__main__':
    trainSents = readSents(trainFile)
    testSents = readSents(testFile)

    tr = trainTag(trainSents)
    tr.train()
    ts = testTag(testSents, tr.wordPosFreq, tr.posFreq, tr.posTransPro, tr.wordPosHeadPro, 'test_result.txt')
    # Kept inside the guard: ts only exists when the file runs as a script.
    ts.test()

raining:
test:
correctNum
49118
wordNum
52923
correctRatio
0.9281030931731006
