# In [4]:
import codecs
# Dataset file paths, one group per dataset directory (EN, CN, SG, ES).
#   *_train   : labelled training file read by emission_params() and
#               transition_params().
#   *_test    : input file read by the decoders below.
#   *_output  : file written by sentiment_analysis().
#   *_viterbi : file written by viterbi_algo().
#   *_topk    : file written by viterbi_algo_topk() (EN and ES only).
#   *_gold    : not referenced by the visible code; presumably the reference
#               labels consumed by an external evaluation script -- TODO confirm.
EN_train = "./EN/train"
EN_test = "./EN/dev.in"
EN_output = "./EN/dev.p2.out"
EN_gold = "./EN/dev.out"
EN_viterbi = "./EN/dev.p3.out"
EN_topk = "./EN/dev.p4.out"

CN_train = "./CN/train"
CN_test = "./CN/dev.in"
CN_output = "./CN/dev.p2.out"
CN_gold = "./CN/dev.out"
CN_viterbi = "./CN/dev.p3.out"


SG_train = "./SG/train"
SG_test = "./SG/dev.in"
SG_output = "./SG/dev.p2.out"
SG_gold = "./SG/dev.out"
SG_viterbi = "./SG/dev.p3.out"


ES_train = "./ES/train"
ES_test = "./ES/dev.in"
ES_output = "./ES/dev.p2.out"
ES_gold = "./ES/dev.out"
ES_viterbi = "./ES/dev.p3.out"
ES_topk = "./ES/dev.p4.out"



def emission_params(train_file):
    """Estimate smoothed emission parameters from a labelled training file.

    Every non-blank line is split on whitespace; the first field is taken as
    the word and the second as its label (a non-blank line with fewer than
    two fields raises IndexError).

    Returns a tuple ``(emission, label_count)``:

    * ``label_count[label]`` is the number of tokens carrying ``label``.
    * ``emission[word][label]`` is
      ``count(word, label) / (label_count[label] + 1)``.
    * ``emission['new_word'][label]`` is ``1 / (label_count[label] + 1)``;
      the decoders use this entry for words that never occurred in training.
    """
    emission_count = {}
    label_count = {}

    with open(train_file, encoding='utf-8') as file:
        for raw in file:
            tokens = raw.split()
            if not tokens:
                continue  # blank line: sentence separator, nothing to count
            word, sentiment = tokens[0], tokens[1]

            # count(word, label)
            per_word = emission_count.setdefault(word, {})
            per_word[sentiment] = per_word.get(sentiment, 0) + 1

            # count(label)
            label_count[sentiment] = label_count.get(sentiment, 0) + 1

    # Turn raw counts into probabilities; the "+ 1" in the denominator leaves
    # room for the single pseudo-observation given to unseen words below.
    for per_word in emission_count.values():
        for tag in per_word:
            per_word[tag] /= label_count[tag] + 1

    # Installed last, so it replaces any literal training word 'new_word'.
    emission_count['new_word'] = {
        tag: 1 / (total + 1) for tag, total in label_count.items()
    }

    return (emission_count, label_count)
                          

def sentiment_analysis(test_file,output_file,emission_params, label_count):
    """Label every word with its highest-scoring label under the emission
    parameters alone and write the result to ``output_file``.

    For each non-blank input line the first whitespace-separated field is the
    word; the output line is ``"<word> <label>"``. Words missing from
    ``emission_params`` are scored with the ``'new_word'`` entry. Blank input
    lines are copied through as blank lines. The output is written with the
    ``utf-8-sig`` codec, so it starts with a UTF-8 byte-order mark.

    ``label_count`` is accepted but not used.
    """
    with open(test_file, encoding='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        for raw in ifile:
            tokens = raw.split()
            if not tokens:
                ofile.write('\n')
                continue

            word = tokens[0]
            # Known words use their own label scores, unseen ones the
            # shared 'new_word' scores.
            scores = emission_params[word] if word in emission_params else emission_params['new_word']
            best = max(scores, key=scores.get)
            ofile.write(word + " " + best + '\n')

                
# For each dataset: estimate the emission parameters from its training file,
# then write the emission-only prediction for the dev input to *_output.
# The emission_params_* / label_count_* names stay bound at module level and
# are reused by the Viterbi calls further down the file.
emission_params_EN, label_count_EN = emission_params(EN_train)
sentiment_analysis(EN_test,EN_output,emission_params_EN, label_count_EN)
emission_params_ES, label_count_ES = emission_params(ES_train)
sentiment_analysis(ES_test,ES_output,emission_params_ES, label_count_ES)
emission_params_CN,label_count_CN = emission_params(CN_train)
sentiment_analysis(CN_test,CN_output,emission_params_CN, label_count_CN)
emission_params_SG,label_count_SG = emission_params(SG_train)
sentiment_analysis(SG_test,SG_output,emission_params_SG, label_count_SG)

# EN          
#Entity in gold data: 662
#Entity in prediction: 2659
#Correct Entity : 359
# Entity  precision: 0.1350
# Entity  recall: 0.5423
# Entity  F: 0.2162
#Correct Sentiment : 111
# Sentiment  precision: 0.0417
# Sentiment  recall: 0.1677
# Sentiment  F: 0.0668

# ES
#Entity in gold data: 1326
#Entity in prediction: 5449
#Correct Entity : 767
# Entity  precision: 0.1408
# Entity  recall: 0.5784
# Entity  F: 0.2264
#Correct Sentiment : 246
# Sentiment  precision: 0.0451
# Sentiment  recall: 0.1855
# Sentiment  F: 0.0726

#CN
#Entity in gold data: 935
#Entity in prediction: 5318
#Correct Entity : 546
# Entity  precision: 0.1027
# Entity  recall: 0.5840
# Entity  F: 0.1746
#Correct Sentiment : 266
# Sentiment  precision: 0.0500
# Sentiment  recall: 0.2845
# Sentiment  F: 0.0851

#SG
#Entity in gold data: 4779
#Entity in prediction: 14257
#Correct Entity : 2624
# Entity  precision: 0.1840
# Entity  recall: 0.5491
# Entity  F: 0.2757
#Correct Sentiment : 920
# Sentiment  precision: 0.0645
# Sentiment  recall: 0.1925
# Sentiment  F: 0.0967



# In [5]:
def transition_params(train_file):
    """Estimate first-order transition probabilities from a labelled file.

    Each non-blank line contributes its second whitespace-separated field as
    the label of the current token (a non-blank line with fewer than two
    fields raises IndexError). Blank lines separate sentences. Every sentence
    is implicitly preceded by the state ``'START'`` and followed by ``'STOP'``.

    Compared with a naive line-by-line count, two boundary cases are handled
    explicitly:

    * a final sentence that is not followed by a blank line still gets its
      transition into ``'STOP'``;
    * blank lines that do not close a sentence (leading or repeated blank
      lines) are ignored, so no ``START -> STOP`` transition is recorded
      without a matching ``START`` count (which skewed the estimate and
      raised ZeroDivisionError for a file made only of blank lines).

    Returns:
        dict: ``result[V][U] = count(U -> V) / count(U)``, i.e. the outer key
        is the *next* state and the inner key the *previous* state. The
        ``'STOP'`` key is always present (possibly with an empty dict).
    """
    start = 'START'
    end = 'STOP'
    transition_count = {end: {}}
    state_count = {start: 0}
    prev = start

    def add_transition(source, target):
        # Transitions are indexed by target state first, then source state.
        sources = transition_count.setdefault(target, {})
        sources[source] = sources.get(source, 0) + 1

    with open(train_file, encoding='utf-8') as file:
        for line in file:
            pair = line.split()
            if pair:
                sentiment = pair[1]
                add_transition(prev, sentiment)
                if prev == start:
                    # First token of a sentence: one more START occurrence.
                    state_count[start] += 1
                state_count[sentiment] = state_count.get(sentiment, 0) + 1
                prev = sentiment
            elif prev != start:
                # Blank line closing a sentence that is in progress.
                add_transition(prev, end)
                prev = start

    # The file may end without a trailing blank line; close the open sentence.
    if prev != start:
        add_transition(prev, end)

    # Normalise every count(U -> V) by count(U).
    for sources in transition_count.values():
        for source in sources:
            sources[source] /= state_count[source]
    return transition_count


def viterbi_algo(test_file, output_file, transition_params, emission_params, labels):
    """Decode every sentence of ``test_file`` with Viterbi and write the
    labelled tokens to ``output_file``.

    Input: one token per line (only the first whitespace-separated field is
    used); a blank line ends a sentence. A final sentence that is not
    followed by a blank line is decoded as well instead of being dropped.

    Output: one ``"<token> <label>"`` line per token and a blank line after
    each sentence, written with the ``utf-8-sig`` codec (the file starts with
    a UTF-8 byte-order mark).

    ``transition_params``, ``emission_params`` and ``labels`` are passed
    unchanged to ``calculate_node_scores``.
    """
    sentences = []

    with open(test_file, encoding='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        sentence = []
        for line in ifile:
            tokens = line.split()
            if tokens:
                sentence.append(tokens[0])
            else:
                sentences.append(sentence)
                sentence = []
        # No trailing blank line: the last sentence is still pending.
        if sentence:
            sentences.append(sentence)

        for s in sentences:
            nodes = calculate_node_scores(s, transition_params, emission_params, labels)
            labelled_sentence = backtracking(s, nodes)
            for word in labelled_sentence:
                ofile.write(word + '\n')
            ofile.write("\n")

        
def calculate_node_scores(s, transition_params, emission_params, labels):
    """Build the Viterbi score table for the token list ``s``.

    ``transition_params[V][U]`` is the score of moving from state U to V and
    ``emission_params[word][V]`` the score of V emitting ``word``; words not
    present in ``emission_params`` use ``emission_params['new_word']``.
    Missing transitions, and labels missing for a known word, score 0.

    Returns a dict keyed by position:

    * ``nodes[0] == {'START': [1, 'nil']}``
    * ``nodes[k][V] == [best score, best previous state]`` for ``1 <= k <= len(s)``
      and every ``V`` in ``labels``
    * ``nodes[len(s) + 1] == {'STOP': [best score, best last state]}``
    """

    def pick_parent(prev_layer, incoming, emission):
        # Scan the predecessors in dict order. ">=" means that ties go to the
        # predecessor visited last, and that an all-zero layer still gets a
        # real parent instead of the 'nil' placeholder.
        top_score, top_parent = 0, 'nil'
        for state, entry in prev_layer.items():
            candidate = entry[0] * incoming.get(state, 0) * emission
            if candidate >= top_score:
                top_score, top_parent = candidate, state
        return [top_score, top_parent]

    # base case
    nodes = {0: {'START': [1, 'nil']}}

    # one layer per word
    for position, X in enumerate(s, start=1):
        for V in labels:
            prev_layer = nodes[position - 1]
            if X in emission_params:
                b = emission_params[X].get(V, 0)
            else:
                b = emission_params['new_word'][V]
            nodes.setdefault(position, {})[V] = pick_parent(prev_layer, transition_params[V], b)

    # final transition into STOP: no emission, hence the neutral factor 1
    final_layer = nodes[len(s)]
    nodes[len(s) + 1] = {'STOP': pick_parent(final_layer, transition_params['STOP'], 1)}

    return nodes


def backtracking(s, nodes):
    """Follow the parent pointers in ``nodes`` back from 'STOP' and label ``s``.

    ``nodes`` has the layout produced by ``calculate_node_scores``. Each
    element of ``s`` is extended in place with ``" <label>"``; the same list
    object is returned.
    """
    state = 'STOP'
    # The parent stored at layer idx + 2 is the label of token idx.
    for idx in reversed(range(len(s))):
        state = nodes[idx + 2][state][1]
        s[idx] = s[idx] + " " + state
    return s

# For each dataset: estimate the transition parameters from its training file
# and write the Viterbi decoding of the dev input to *_viterbi. The emission
# parameters and label counts are the module-level values computed earlier in
# the file.
transition_params_EN = transition_params(EN_train)
viterbi_algo(EN_test, EN_viterbi, transition_params_EN, emission_params_EN, label_count_EN)
transition_params_ES = transition_params(ES_train)
viterbi_algo(ES_test, ES_viterbi, transition_params_ES, emission_params_ES, label_count_ES)
transition_params_CN = transition_params(CN_train)
viterbi_algo(CN_test, CN_viterbi, transition_params_CN, emission_params_CN, label_count_CN)
transition_params_SG = transition_params(SG_train)
viterbi_algo(SG_test, SG_viterbi, transition_params_SG, emission_params_SG, label_count_SG)

#EN
#Entity in gold data: 662
#Entity in prediction: 1022
#Correct Entity : 232
# Entity  precision: 0.2270
# Entity  recall: 0.3505
# Entity  F: 0.2755
#Correct Sentiment : 108
# Sentiment  precision: 0.1057
# Sentiment  recall: 0.1631
# Sentiment  F: 0.1283

#ES
#Entity in gold data: 1326
#Entity in prediction: 2528
#Correct Entity : 530
# Entity  precision: 0.2097
# Entity  recall: 0.3997
# Entity  F: 0.2750
#Correct Sentiment : 269
# Sentiment  precision: 0.1064
# Sentiment  recall: 0.2029
# Sentiment  F: 0.1396

#CN
#Entity in gold data: 935
#Entity in prediction: 1966
#Correct Entity : 408
# Entity  precision: 0.2075
# Entity  recall: 0.4364
# Entity  F: 0.2813
#Correct Sentiment : 247
# Sentiment  precision: 0.1256
# Sentiment  recall: 0.2642
# Sentiment  F: 0.1703

#SG
#Entity in gold data: 4779
#Entity in prediction: 5429
#Correct Entity : 1606
# Entity  precision: 0.2958
# Entity  recall: 0.3361
# Entity  F: 0.3147
#Correct Sentiment : 624
# Sentiment  precision: 0.1149
# Sentiment  recall: 0.1306
# Sentiment  F: 0.1223

# In [6]:
def viterbi_algo_topk(test_file, output_file, transition_params, emission_params, labels, top_k, i_th):
    """Decode every sentence of ``test_file`` with top-k Viterbi and write the
    ``i_th`` ranked label sequence to ``output_file``.

    Input: one token per line (only the first whitespace-separated field is
    used); a blank line ends a sentence. A final sentence that is not
    followed by a blank line is decoded as well instead of being dropped.

    Output: one ``"<token> <label>"`` line per token and a blank line after
    each sentence, written with the ``utf-8-sig`` codec (the file starts with
    a UTF-8 byte-order mark).

    ``top_k`` is forwarded to ``calculate_topk_node_scores`` and ``i_th``
    (1-based rank) to ``backtracking_topk``.
    """
    sentences = []

    with open(test_file, encoding='utf-8') as ifile, codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        sentence = []
        for line in ifile:
            tokens = line.split()
            if tokens:
                sentence.append(tokens[0])
            else:
                sentences.append(sentence)
                sentence = []
        # No trailing blank line: the last sentence is still pending.
        if sentence:
            sentences.append(sentence)

        for s in sentences:
            nodes = calculate_topk_node_scores(s, transition_params, emission_params, labels, top_k)
            labelled_sentence = backtracking_topk(s, nodes, i_th)
            for word in labelled_sentence:
                ofile.write(word + '\n')
            ofile.write("\n")


def calculate_topk_node_scores(s, transition_params, emission_params, labels, top_k):
    """Build the top-k Viterbi score table for the token list ``s``.

    Same scoring as the single-best table (``transition_params[V][U]``,
    ``emission_params[word][V]``, ``'new_word'`` for unseen words, 0 for
    missing entries), but every node keeps up to ``top_k`` candidates.

    Returns a dict keyed by position. Each node is a list of
    ``[score, previous state, rank]`` triples sorted by descending score,
    where ``rank`` is the index of the extended candidate inside the previous
    state's own list:

    * ``nodes[0] == {'START': [[1, 'nil', 0]]}``
    * ``nodes[k][V]`` for ``1 <= k <= len(s)`` and every ``V`` in ``labels``
    * ``nodes[len(s) + 1] == {'STOP': [...]}``
    """

    def rank_candidates(prev_layer, incoming, emission):
        # Extend every surviving candidate of every predecessor state, then
        # keep the best top_k. sorted() is stable, so equal scores stay in
        # the order they were generated in.
        candidates = [
            [entry[0] * incoming.get(state, 0) * emission, state, rank]
            for state, entries in prev_layer.items()
            for rank, entry in enumerate(entries)
        ]
        return sorted(candidates, key=lambda c: c[0], reverse=True)[:top_k]

    # base case
    nodes = {0: {'START': [[1, 'nil', 0]]}}

    # one layer per word
    for position, X in enumerate(s, start=1):
        for V in labels:
            prev_layer = nodes[position - 1]
            if X in emission_params:
                b = emission_params[X].get(V, 0)
            else:
                b = emission_params['new_word'][V]
            nodes.setdefault(position, {})[V] = rank_candidates(prev_layer, transition_params[V], b)

    # final transition into STOP: no emission, hence the neutral factor 1
    final_layer = nodes[len(s)]
    nodes[len(s) + 1] = {'STOP': rank_candidates(final_layer, transition_params['STOP'], 1)}

    return nodes


def backtracking_topk(s, nodes, i_th):
    """Recover the ``i_th`` ranked label sequence from a top-k score table.

    ``nodes`` has the layout produced by ``calculate_topk_node_scores``:
    every node is a list of ``[score, previous state, rank]`` triples. The
    walk starts at the ``i_th`` (1-based) candidate of the 'STOP' node and
    then follows each triple's ``previous state`` / ``rank`` pointer.

    When the 'STOP' node holds fewer than ``i_th`` candidates (short
    sentences have fewer distinct paths than requested), the lowest-ranked
    available candidate is used instead of raising IndexError.

    Each element of ``s`` is extended in place with ``" <label>"``; the same
    list object is returned. An empty ``s`` is returned untouched.
    """
    prev_state = 'STOP'
    prev_index = 0
    last = len(s) + 1
    for i in range(last, 1, -1):
        candidates = nodes[i][prev_state]
        if i == last:
            # Entry point of the walk: pick the requested rank, clamped to
            # the number of complete paths that actually exist.
            prev_index = min(i_th, len(candidates)) - 1
        prev_node = candidates[prev_index]
        prev_state = prev_node[1]
        prev_index = prev_node[2]
        s[i - 2] += " " + prev_state
    return s

# Top-k decoding for EN and ES only: keep up to 7 candidates per node
# (top_k=7) and write the 5th-ranked sequence (i_th=5) to *_topk. Reuses the
# transition/emission parameters computed earlier in the file.
viterbi_algo_topk(EN_test, EN_topk, transition_params_EN, emission_params_EN, label_count_EN, 7, 5)
viterbi_algo_topk(ES_test, ES_topk, transition_params_ES, emission_params_ES, label_count_ES, 7, 5)

#EN
#Entity in gold data: 662
#Entity in prediction: 1319
#Correct Entity : 266
# Entity  precision: 0.2017
# Entity  recall: 0.4018
# Entity  F: 0.2686
#Correct Sentiment : 113
# Sentiment  precision: 0.0857
# Sentiment  recall: 0.1707
# Sentiment  F: 0.1141

#ES
#Entity in gold data: 1326
#Entity in prediction: 2637
#Correct Entity : 565
# Entity  precision: 0.2143
# Entity  recall: 0.4261
# Entity  F: 0.2851
#Correct Sentiment : 226
# Sentiment  precision: 0.0857
# Sentiment  recall: 0.1704
# Sentiment  F: 0.1141

