In [20]:
from collections import Counter
from operator import itemgetter
import copy

training_file = "./EN/train"
test_file = "./EN/dev.out"

def update_params(training_file, predicted_file, transition_params, emission_params):
    with open(training_file, encoding = 'utf-8') as train, open(predicted_file, encoding = 'utf-8') as predicted:
        prev_state = "start"
        for t_line in train:
            if len(t_line.split()) != 0:
                current_state = t_line.split()[1]
                y0_y1 = prev_state + " " + current_state
                prev_state = current_state
            else:
                current_state = "stop"
                y0_y1 = prev_state + " " + current_state
                prev_state = "start"
            
            if y0_y1 in transition_params.keys():
                transition_params[y0_y1] += 1
            else:
                transition_params[y0_y1] = 1
        
        prev_state = "start"
        for p_line in predicted:
            if len(p_line.split()) != 0:
                current_state = t_line.split()[1]
                y0_y1 = prev_state + " " + current_state
                prev_state = current_state
            else:
                current_state = "stop"
                y0_y1 = prev_state + " " + current_state
                prev_state = "start"
            
            if y0_y1 in transition_params.keys():
                transition_params[y0_y1]-= 1
            else:
                transition_params[y0_y1] = -1
                
                
        
                
                






def init_t_para(training_file): 
    transition_parameter = Counter() # count the y_to_y1 tags
    state = "start" #" hold previous tag info
    cnt_y = Counter() # count the y tags
    f = open(training_file, "r", encoding='utf8')
    y0_y1 = ""
    
    for line in f:
        line = line.rstrip('\n')
        if (line == ""):
            y0_y1 = state + " stop"
            state = "stop"
        else:
            line = line.split()[1]
            y0_y1 = state + " " + line
            state = line
            
        transition_parameter[y0_y1]+=1

    for i,line in enumerate(transition_parameter):
        transition_parameter[line] = 0
#        print(i,line,transition_parameter[line]) # uncomment to see the output
    f.close()
    return  transition_parameter

t_para = init_t_para(training_file)

print (t_para.keys())

dict_keys(['start B-neutral', 'stop B-neutral', 'I-positive O', 'O stop', 'I-neutral B-neutral', 'B-positive stop', 'B-negative stop', 'stop B-negative', 'stop B-positive', 'O B-negative', 'B-positive B-negative', 'I-neutral O', 'B-neutral I-neutral', 'I-positive stop', 'O B-neutral', 'B-positive B-neutral', 'B-negative O', 'I-positive B-neutral', 'I-neutral stop', 'I-negative stop', 'O B-positive', 'I-neutral B-negative', 'stop O', 'I-negative O', 'I-positive I-positive', 'B-positive O', 'I-positive B-positive', 'I-negative I-negative', 'I-negative B-neutral', 'B-neutral stop', 'B-neutral B-neutral', 'B-neutral O', 'I-neutral B-positive', 'B-positive B-positive', 'B-negative B-neutral', 'I-neutral I-neutral', 'B-negative I-negative', 'B-positive I-positive', 'O O'])


In [17]:
def init_e_para(training_file): # initialize alpha_w,t exist in training set to zero
    emission_parameter = Counter() # count the y->x tags
    cnt_y = Counter()
    states = []
    f = open(training_file, "r", encoding='utf8')
    for line in f:
        line = line.rstrip('\n')
        if(line!=""):
            entry = line.split()
            emission_parameter[line]+=1
            y = entry[1]
            cnt_y[y]+=1
            if y not in states:
                states.append(y)
            
    for tag in cnt_y: # generates emission for new word
        emission_parameter["new_word "+tag] = 1
        
    for i,line in enumerate(emission_parameter):
        emission_parameter[line] = 0
#         print(i,line,emission_parameter[line]) # uncomment to see the output

    f.close()
    return  emission_parameter, states

e_para, states = init_e_para(training_file)



In [5]:
test_file = './EN/dev.in'
output_file = '/EN/dev.p5.out'


def viterbi(test_file, output_file, transmission_params, emission_params, states):
    sentences = []
    words = set()
    for key in emission_params.keys():
        word = emission_params[key].split()[0]
        words.add(word)
    with open(test_file, encoding ='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        sentence = []
        for line in ifile:
            if len(line.split())!=0:
                sentence.append(line.split()[0])
            else:
                sentences.append(sentence)
                sentence = []
        
        for s in sentences:
            nodes = calculate_node_scores(s,transmission_params, emission_params, states, words)
            labelled_sentence = backtracking(s,nodes)
            for word in labelled_sentence:
                ofile.write(word+'\n')
            ofile.write("\n")

        
def calculate_node_scores(s, transmission_params, emission_params, states, words):
    nodes = {}
    #base case
    nodes[0] = {'start':[1,'nil']}
    #recursive
    for k in range (1, len(s)+1): #for each word
        X = s[k-1]
        for V in states: #for each nodeci
            prev_nodes_dict = nodes[k-1] #access prev nodes
            highest_score = 0
            parent = 'nil'
            
            #emission params
            if X in words:
                pair = X + " " + V
                if pair in emission_params.keys():
                    b = emission_labels[pair]
                else:
                    b = 0
            else:
                b = emission_params['new_word '+V]
                
            for U in prev_nodes_dict.keys():
                #transmissionparams
                for key in transmission_params.keys():
                    if V == key[1]:
                        
                prev_states = transmission_params[V]
                if U in prev_states.keys():
                    a = prev_states[U]
                else:
                    a = 0
                
                #prev node score
                prev_score = prev_nodes_dict[U][0]
                score = prev_score+a+b
                
                if score>= highest_score:
                    highest_score = score
                    parent = U
            if k in nodes.keys():
                nodes[k][V] = [highest_score,parent]
            else:
                new_dict = {V:[highest_score,parent]}
                nodes[k] = new_dict
            
    #end case
    prev_nodes_dict = nodes[len(s)]
    highest_score = 0
    parent = 'nil'
    for U in prev_nodes_dict.keys():
        #transmission
        prev_states = transmission_params['STOP']
        if U in prev_states.keys():
            a = prev_states[U]
        else:
            a = 0
        #prev node score
        prev_score = prev_nodes_dict[U][0]
        score = prev_score*a
        if score>= highest_score:
            highest_score = score
            parent = U
    indiv_node = {'stop': [highest_score,parent]}
    nodes[len(s)+1]=indiv_node
    
    return nodes


def backtracking(s, nodes):
    prev_state = 'stop'
    for i in range(len(s)+1, 1,-1):
        prev_node = nodes[i][prev_state]
        prev_state = prev_node[1]
        s[i-2] += " "+prev_state
    return s
    
                

SyntaxError: unexpected EOF while parsing (<ipython-input-5-6965e00b9f3e>, line 8)