In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# def flatten_list(lists):
#     return functools.reduce(operator.iconcat, lists, [])

# def counter_sort(counter):
#     return sorted(counter.items(), key=lambda item: item[1], reverse=True)

# import time
# start_time = time.time()
# main()
# print("--- %s seconds ---" % (time.time() - start_time))

# Function Definitions

In [2]:
import functools
import itertools
import operator
from collections import Counter
import numpy as np
import pandas as pd
import json


# transform data json to text file
def write_data(json_file, N1, N2, to_file):
    # load json file to dictionary
    with open(json_file, "r", encoding='utf-8') as read_file:
        data = json.load(read_file)
    # data string
    sents = data["sentence"]
    res, temp = "", 0
    for n in range(N1 - 1, N2):
        for dict in sents[n]["morp"]:
            for dt in sents[n]["NE"]:
                if dict["lemma"] in dt["text"]:
                    res += dict["lemma"] + '\t' + dt['type'] + '\n'
                    temp = 1
                    break
            if temp == 0: 
                res += dict["lemma"] + '\t' + 'O' + '\n'
            temp = 0
        res += '\n'
    # write data to file
    with open(to_file, 'w', encoding='utf-8') as f:
        res = res.rstrip() + '\n'
        f.write(res)
        
# transform data json and write only the words (not the tags) to text file
def write_data_words(json_file, N1, N2, to_file):
    # load json file to dictionary
    with open(json_file, "r", encoding='utf-8') as read_file:
        data = json.load(read_file)
    # data string
    sents = data["sentence"]
    res = ""
    for n in range(N1 - 1, N2):
        for dict in sents[n]["morp"]:
            res += dict["lemma"] + '\n'
        res += '\n'
    # write data to file
    with open(to_file, 'w', encoding='utf-8') as f:
        res = res.rstrip() + '\n'
        f.write(res)

In [3]:
# get emission probability only dependent on the current tag
# Laplace smoothing added
def get_emission(word, tag, helper1, helper2):
    V = len(lexicon)
    return (helper1[(word, tag)] + 1) / float(helper2[tag] + V)

# get transition probability from tag1 to tag2
def get_transition(tag1, tag2):
    trans = []
    tags_all = [x if len(x) > 1 else x + ['<END>'] for x in lexicon_all]
    tags_all = [row[1] for row in tags_all]
    for i in range(len(tags_all) - 1):
        seq = ' '.join(tags_all[i:i+2])
        trans.append(seq)
    return Counter(trans)[tag1+' '+tag2] / float(Counter(tags_all)[tag1])

# get starting probability based on first tags of sentences
def get_start_prob(tag):
    start = [lexicon_all[i+1] for i in range(len(lexicon_all) - 1) if lexicon_all[i] == [''] or i == 0]
    return Counter([row[1] for row in start])[tag] / float(len(start))

# TEST
# get_emission('일', 'DT', helper1, helper2)
# get_emission('김연아', 'PS', helper1, helper2)
# get_transition('O', 'PS')
# for tag in tags:
#     print(tag)
#     print(get_start_prob(tag))

In [4]:
# store the above probabilities to csv
def store_to_csv():
    # transition matrix
    temp = np.zeros([len(tags), len(tags)])
    for i in range(len(tags)):
        for j in range(len(tags)):
            temp[i, j] = get_transition(tags[i], tags[j])
    np.savetxt("stored_transition.csv", temp, delimiter=",")
    # starting probability matrix
    temp = np.zeros([1, len(tags)])
    for i in range(len(tags)):
        temp[0, i] = get_start_prob(tags[i])
    np.savetxt("stored_start_prob.csv", temp, delimiter=",")
    # emission matrix
    temp = np.zeros([len(tags), len(lexicon)])
    for t in range(len(tags)):
        for l in range(len(lexicon)):
            temp[t, l] = get_emission(lexicon[l], tags[t], helper1, helper2)
    np.savetxt("stored_emission.csv", temp, delimiter=",")
    
def get_stored(file1, file2, file3):
    stored_start_prob = np.genfromtxt(file1, delimiter=',')
    stored_transition = np.genfromtxt(file2, delimiter=',')
    stored_emission = np.genfromtxt(file3, delimiter=',')
    # adding a column of uniform probability (for unseen words) to emission matrix (a la Laplace smoothing)
    avg_per_tag, V = len(lexicon_split) / len(tags), len(lexicon)
    stored_emission = np.append(stored_emission, [[1 / float(avg_per_tag + V)] for x in range(len(tags))], axis=1)
    return stored_start_prob, stored_transition, stored_emission
    
# store_to_csv()

In [5]:
# A and B are transition and emission matrices, respectively
# initial (starting probability) and text are lists
def viterbi_sentence(A, B, initial, text):
    def index_helper(w):
        try:
            return lexicon.index(w)
        except ValueError:
            return len(lexicon)
    M = A.shape[0]    
    N = len(text)
    textn = [index_helper(x) for x in text]
    # initialization
    best_prob = np.zeros([M, N])
    best_tags = np.zeros([M, N - 1])
    best_prob[:, 0] = np.multiply(initial, B[:, textn[0]])
    # dynamic programming for best_prob (which is our Viterbi function)
    for i in range(1, N):
        for t in range(M):
            temp = np.multiply(best_prob[:, i-1], A[:, t])
            best_prob[t, i] = np.max(temp) * B[t, textn[i]]
            best_tags[t, i-1] = np.argmax(temp)
    # recovering sequence based on max probability
    max_temp = np.zeros([1, N])
    max_temp[0, -1] = np.argmax(best_prob[:, -1])
    # backtracking
    for i in range(N - 2, -1, -1):
        max_temp[0, i] = best_tags[int(max_temp[0, i+1]), i]
    opt_tags = max_temp.astype(int)[0]
    return np.stack([text, [tags[x] for x in opt_tags]]).T.tolist()

# can input a file (not just a sentence) using this function
def viterbi_file(A, B, initial, file_from, file_to):
    import time
    start_time = time.time()
    
    print("---- NER tagging using Hidden Markov Model and Viterbi ----")
    lexicon_dev = [line.rstrip('\n') for line in open(file_from, encoding='utf-8')]
    lexicon_dev = [list(y) for x, y in itertools.groupby(lexicon_dev, lambda z: z == '') if not x]
    viterbi_res = [viterbi_sentence(A, B, initial, sent) for sent in lexicon_dev]
    res = '\n'.join([''.join(['\t'.join(w) + '\n' for w in viterbi_res[sent]]) for sent in range(len(viterbi_res))])
    with open(file_to, "w", encoding='utf-8') as f:
        f.write(res)
#     return viterbi_res
    print("-- Successfully exported to", file_to, "--")
    
    print("---- Took %s seconds ----" % (time.time() - start_time))

# TEST
# text = ['한편', '지나', 'ㄴ', '해', 'K', '리그', '챔피언', '포항', '은', '중국', '창춘', '스타디움', '에서', '열리', 'ㄴ', '지난해', '중국', '슈퍼', '리그', '우승팀', '창춘', '과', '의', 'E', '조', '3', '차전', '원정', '경기', '에서', '후반', '40', '분', '결승골', '을', '내주', '고', '0', '-', '1', '로', '패하', '았', '다', '.']
# viterbi_sentence(stored_transition, stored_emission, stored_start_prob, text)

In [6]:
#!/usr/bin/python
#
# scorer for NLP class Spring 2016
# ver.1.0
#
# score a key file against a response file
# both should consist of lines of the form:   token \t tag
# sentences are separated by empty lines
#
def score_old (keyFileName, responseFileName):
    keyFile = open(keyFileName, 'r', encoding='utf-8')
    key = keyFile.readlines()
    responseFile = open(responseFileName, 'r', encoding='utf-8')
    response = responseFile.readlines()
    if len(key) != len(response):
        print ("length mismatch between key and submitted file")
        exit()
    correct = 0
    incorrect = 0
    for i in range(len(key)):
        key[i] = key[i].rstrip('\n')
        response[i] = response[i].rstrip('\n')
        if key[i] == "":
            if response[i] == "":
                continue
            else:
                print ("sentence break expected at line " + str(i))
                exit()
        keyFields = key[i].split('\t')
        if len(keyFields) != 2:
            print ("format error in key at line " + str(i) + ":" + key[i])
            exit()
        keyToken = keyFields[0]
        keyPos = keyFields[1]
        responseFields = response[i].split('\t')
        if len(responseFields) != 2:
            print ("format error at line " + str(i))
            exit()
        responseToken = responseFields[0]
        responsePos = responseFields[1]
        if responseToken != keyToken:
            print ("token mismatch at line " + str(i))
            exit()
        if responsePos == keyPos:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    print (str(correct) + " out of " + str(correct + incorrect) + " tags correct")
    accuracy = 100.0 * correct / (correct + incorrect)
    print ("  accuracy: %f" % accuracy)
    
# score_old ('HMM_test_correct.data','HMM_test.data')

In [7]:
def score(file_correct, file):
    res1 = [line.rstrip('\n').split('\t')[1] for line in open(file_correct, encoding='utf-8') if line != '\n']
    c1 = Counter(res1)
    res2 = [line.rstrip('\n').split('\t')[1] for line in open(file, encoding='utf-8') if line != '\n']
    c2 = Counter(res2)
    c3 = Counter(list(zip(res1, res2)))
    res = []
    for tag in ['O', 'OG', 'DT', 'PS', 'LC', 'TI']:
        prec = c3[(tag, tag)] / c2[tag]
        rec = c3[(tag, tag)] / c1[tag]
        res += [[prec, rec, 2 * prec * rec / float(prec + rec)]]
    df = pd.DataFrame(res, index=['O', 'OG', 'DT', 'PS', 'LC', 'TI'], columns=['Precision', 'Recall', 'F1'])
    return df

# Global Variables

In [8]:
data_hmm = 'HMM_train.data'
lexicon_split = [line.rstrip('\n').split('\t') for line in open(data_hmm, encoding='utf-8') if line.rstrip('\n') != '']
lexicon_all = [line.rstrip('\n').split('\t') for line in open(data_hmm, encoding='utf-8')]
lexicon = sorted({row[0] for row in lexicon_split})
tags = sorted({row[1] for row in lexicon_split})

helper1, helper2 = Counter([tuple(x) for x in lexicon_split]), Counter([row[1] for row in lexicon_split])
file1, file2, file3 = 'stored_start_prob.csv', 'stored_transition.csv', 'stored_emission.csv'
stored_start_prob, stored_transition, stored_emission = get_stored(file1, file2, file3)

# Demo Starts Here

In [9]:
print('Number of unique words in training data:', len(lexicon))
print('NER tags in training data:', tags)

# can segment the json file differently if desirable
# write_data("NEtaggedCorpus_train.json", 1, 3000, data_hmm)
# write_data_words("NEtaggedCorpus_train.json", 3001, 3555, "HMM_test.words")
# write_data("NEtaggedCorpus_train.json", 3001, 3555, 'HMM_test_correct.data')

viterbi_file(stored_transition, stored_emission, stored_start_prob, 'HMM_test.words', 'HMM_test.data')
score('HMM_test_correct.data', 'HMM_test.data')

Number of unique words in training data: 12142
NER tags in training data: ['DT', 'LC', 'O', 'OG', 'PS', 'TI']
---- NER tagging using Hidden Markov Model and Viterbi ----
-- Successfully exported to HMM_test.data --
---- Took 4.197366237640381 seconds ----


Unnamed: 0,Precision,Recall,F1
O,0.883633,0.988008,0.93291
OG,0.808429,0.230475,0.358691
DT,0.845038,0.771429,0.806557
PS,0.526316,0.029499,0.055866
LC,0.752525,0.318376,0.447447
TI,0.752475,0.655172,0.700461
