# HMM for Named Entity Recognition in Spanish

## Reading data

In [1]:
# -*- coding: utf-8 -*-

f = open("data/esp-train.txt", "r", encoding='utf8',errors="ignore")

In [2]:
train_text  = f.read()
train_lines = train_text.split("\n")

In [3]:
n_lines = len(train_lines)
print("Lines : ", n_lines)

Lines :  273038


## Getting inputs and outputs

In [4]:
def getData(lines):
    input_ = []
    label_ = []
    blanks = 0
    
    n_lines = len(lines)
    
    for i in range(n_lines):
        if lines[i] != "":
            Xi, yi = lines[i].split(" ")
            input_.append(Xi)
            label_.append(yi)
        else:
            blanks+=1
        
    n_lines -= blanks
    
    return input_, label_, n_lines

In [5]:
input_train, label_train, n_lines = getData(train_lines)

In [6]:
assert (n_lines == len(input_train))

In [7]:
import collections

label_count =  collections.Counter(label_train)
label_prob  = { k : v/n_lines for k, v in label_count.items()}

In [8]:
assert (1 == sum(label_prob.values()))

## Function to map X to y

In [9]:
def mapping_function(input_train, label_input,n_lines, prob_y):
    res = {}
    
    for i in range(n_lines):
        if input_train[i] not in res:
            res[input_train[i]] = {}
        
        if label_input[i] not in res[input_train[i]]:
                res[input_train[i]][label_input[i]]  = 1
                
        else:
            res[input_train[i]][label_input[i]] += 1 
    
    
    for key, val in res.items():
        sumLabels = sum(val.values())
        for k, v in val.items():
            res[key][k] = v / sumLabels
    
    for key, val in res.items():
        for k, v in val.items():
            res[key][k] = v * prob_y[k]
            
    return res

In [10]:
prob_dist = mapping_function(input_train, label_train, n_lines, label_prob)

## Function to predict labels

In [11]:
def predict(word, prob_dist):
    if word in prob_dist:
        maxVal = max(prob_dist[word].values())
        for k, v in prob_dist[word].items():
            if v == maxVal:
                return k
    else:
        return '0'

In [12]:
predict('Australia', prob_dist)

'B-LOC'

In [13]:
f.close()

## Getting accuracy for 'esp-texta.txt' and 'esp-textb.txt'

In [14]:
def testing(input_test, label_test, prob_dist):
    n   = len(input_test)
    acc = 0.
    
    for i in range(n):
        if predict(input_test[i], prob_dist) == label_test[i]:
            acc += 1
            
    return acc/n

In [15]:
f = open("data/esp-testa.txt", "r", encoding='utf8',errors="ignore")

testa_text  = f.read()
testa_lines = testa_text.split("\n")

In [16]:
input_testa, label_testa, n_lines = getData(testa_lines)

In [17]:
acc_a = testing(input_testa, label_testa, prob_dist)
print("Accuracy for 'esp-testa.txt' : {:4.3f}".format(acc_a))

Accuracy for 'esp-testa.txt' : 0.867


In [18]:
f.close()

In [19]:
f = open("data/esp-testb.txt", "r", encoding='utf8',errors="ignore")

testb_text  = f.read()
testb_lines = testb_text.split("\n")

In [20]:
input_testb, label_testb, n_lines = getData(testb_lines)

In [21]:
acc_b = testing(input_testb, label_testb, prob_dist)
print("Accuracy for 'esp-testb.txt' : {:4.3f}".format(acc_b))

Accuracy for 'esp-testb.txt' : 0.897


In [22]:
f.close()