In [4]:
import codecs
# Dataset locations, one group per language (EN, CN, SG, ES).
#   *_train   -> file read by emission_params / transmission_params
#   *_test    -> unlabeled dev input
#   *_output  -> output file passed to sentiment_analysis
#   *_viterbi -> output file passed to viterbi_algo
#   *_gold    -> not read in this part of the file; presumably the reference
#                labels used for scoring (confirm against the evaluation step)
EN_train, EN_test, EN_gold = "./EN/train", "./EN/dev.in", "./EN/dev.out"
EN_output, EN_viterbi = "./EN/dev.p2.out", "./EN/dev.p3.out"

CN_train, CN_test, CN_gold = "./CN/train", "./CN/dev.in", "./CN/dev.out"
CN_output, CN_viterbi = "./CN/dev.p2.out", "./CN/dev.p3.out"

SG_train, SG_test, SG_gold = "./SG/train", "./SG/dev.in", "./SG/dev.out"
SG_output, SG_viterbi = "./SG/dev.p2.out", "./SG/dev.p3.out"

ES_train, ES_test, ES_gold = "./ES/train", "./ES/dev.in", "./ES/dev.out"
ES_output, ES_viterbi = "./ES/dev.p2.out", "./ES/dev.p3.out"



def emission_params(train_file):
    """Estimate emission probabilities from a labeled training file.

    Each non-blank line of ``train_file`` is split on whitespace; the first
    token is taken as the word and the second as its label. Blank lines are
    ignored. A non-blank line with fewer than two tokens raises IndexError.

    Returns a tuple ``(emission, label_count)``:

    * ``emission[word][label]`` is ``count(label, word) / (count(label) + 1)``.
    * ``emission['new_word'][label]`` is ``1 / (count(label) + 1)``; it is
      stored under the literal key ``'new_word'`` and replaces any training
      word spelled that way.
    * ``label_count[label]`` is the raw number of tokens carrying ``label``.
    """
    emission_count = {}
    label_count = {}

    with open(train_file, encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if not tokens:
                continue
            word, label = tokens[0], tokens[1]

            # count(label -> word)
            per_word = emission_count.setdefault(word, {})
            per_word[label] = per_word.get(label, 0) + 1

            # count(label)
            label_count[label] = label_count.get(label, 0) + 1

    # Turn raw counts into probabilities; the +1 in the denominator reserves
    # one pseudo-observation per label for the 'new_word' entry added below.
    for per_word in emission_count.values():
        for label in per_word:
            per_word[label] /= (label_count[label] + 1)

    emission_count['new_word'] = {
        label: 1 / (total + 1) for label, total in label_count.items()
    }

    return (emission_count, label_count)
                          

def sentiment_analysis(test_file,output_file,emission_params, label_count):
    """Label every token with the label that maximizes its emission score.

    Reads ``test_file`` line by line. For a non-blank line the first
    whitespace-separated token is the word; its label is the key with the
    largest value in ``emission_params[word]``, or in
    ``emission_params['new_word']`` when the word is not a key. The line
    ``"<word> <label>"`` is written to ``output_file``; a blank input line
    produces a blank output line.

    The output is written with the ``utf-8-sig`` codec, so it starts with a
    UTF-8 byte-order mark. ``label_count`` is accepted but not used.
    """
    with open(test_file, encoding='utf-8') as source, \
            codecs.open(output_file, 'w', 'utf-8-sig') as sink:
        for raw_line in source:
            tokens = raw_line.split()
            if not tokens:
                # Keep blank separator lines in the same positions.
                sink.write('\n')
                continue

            word = tokens[0]
            if word in emission_params:
                scores = emission_params[word]
            else:
                scores = emission_params['new_word']
            best_label = max(scores, key=scores.get)
            sink.write(word + " " + best_label + '\n')


# EN: estimate emission parameters from the training file, then write
# per-word argmax predictions for the dev input to EN_output.
emission_params_EN, label_count_EN = emission_params(EN_train)
sentiment_analysis(EN_test,EN_output,emission_params_EN, label_count_EN)

# Recorded scores for EN (the scoring step is not part of this file):
# Entity in gold data: 662
# Entity in prediction: 2659
# Correct Entity: 359
# Entity precision: 0.1350
# Entity recall: 0.5423
# Entity F: 0.2162
# Correct Sentiment: 111
# Sentiment precision: 0.0417
# Sentiment recall: 0.1677
# Sentiment F: 0.0668

# ES: same procedure as EN.
emission_params_ES, label_count_ES = emission_params(ES_train)
sentiment_analysis(ES_test,ES_output,emission_params_ES, label_count_ES)

# Recorded scores for ES:
# Entity in gold data: 1326
# Entity in prediction: 5449
# Correct Entity: 767
# Entity precision: 0.1408
# Entity recall: 0.5784
# Entity F: 0.2264
# Correct Sentiment: 246
# Sentiment precision: 0.0451
# Sentiment recall: 0.1855
# Sentiment F: 0.0726

# CN: same procedure as EN.
emission_params_CN,label_count_CN = emission_params(CN_train)
sentiment_analysis(CN_test,CN_output,emission_params_CN, label_count_CN)

# Recorded scores for CN:
# Entity in gold data: 935
# Entity in prediction: 5318
# Correct Entity: 546
# Entity precision: 0.1027
# Entity recall: 0.5840
# Entity F: 0.1746
# Correct Sentiment: 266
# Sentiment precision: 0.0500
# Sentiment recall: 0.2845
# Sentiment F: 0.0851

# SG: same procedure as EN.
emission_params_SG,label_count_SG = emission_params(SG_train)
sentiment_analysis(SG_test,SG_output,emission_params_SG, label_count_SG)

# Recorded scores for SG:
# Entity in gold data: 4779
# Entity in prediction: 14257
# Correct Entity: 2624
# Entity precision: 0.1840
# Entity recall: 0.5491
# Entity F: 0.2757
# Correct Sentiment: 920
# Sentiment precision: 0.0645
# Sentiment recall: 0.1925
# Sentiment F: 0.0967



In [11]:
def transmission_params(train_file):
    """Estimate label-to-label transition probabilities from a training file.

    Each non-blank line of ``train_file`` is split on whitespace and its
    second token is taken as the label; a non-blank line with fewer than two
    tokens raises IndexError. One or more blank lines end the current
    sentence. A sentence that is still open at end of file is closed as well,
    so a missing trailing blank line does not lose its final transition.

    Returns a tuple ``(transmission, state_count)``:

    * ``transmission[to_state][from_state]`` is
      ``count(from_state -> to_state) / count(from_state)``. Note the
      destination is the outer key. ``'START'`` only appears as a source and
      ``'STOP'`` only as a destination; the ``'STOP'`` key is always present.
    * ``state_count[state]`` is the number of occurrences of ``state``;
      ``'START'`` and ``'STOP'`` each count the number of sentences.
    """
    start, end = 'START', 'STOP'
    state_count = {start: 0, end: 0}
    transmission_count = {end: {}}

    def record(src, dst):
        # One more observed src -> dst transition (stored as [dst][src]).
        incoming = transmission_count.setdefault(dst, {})
        incoming[src] = incoming.get(src, 0) + 1

    prev = start
    with open(train_file, encoding='utf-8') as file:
        for line in file:
            pair = line.split()
            if pair:
                sentiment = pair[1]
                record(prev, sentiment)

                # The first token of a sentence opens it: count one START
                # and the STOP that will eventually close it.
                if prev == start:
                    state_count[start] += 1
                    state_count[end] += 1

                state_count[sentiment] = state_count.get(sentiment, 0) + 1
                prev = sentiment
            elif prev != start:
                # Blank line after at least one token: close the sentence.
                # Extra blank lines (prev is already START) are ignored so
                # they cannot create START -> STOP transitions.
                record(prev, end)
                prev = start

    # Close a final sentence that was not followed by a blank line.
    if prev != start:
        record(prev, end)

    # Every recorded source state has a count of at least one, so the
    # division below cannot hit zero.
    for incoming in transmission_count.values():
        for src in incoming:
            incoming[src] /= state_count[src]
    return transmission_count, state_count
# Estimate the EN transition model; it is passed to viterbi_algo below.
transmission_params_EN, state_count_EN = transmission_params(EN_train)

def viterbi_algo(test_file, output_file, transmission_params_EN, emission_params_EN):
    """Tag each sentence of ``test_file`` with its most probable label path.

    Parameters
    ----------
    test_file : path of a UTF-8 file with one token per line (first
        whitespace-separated field is used); blank lines separate sentences.
    output_file : path that receives ``"<word> <label>"`` lines, with blank
        lines in the same positions as in the input. Written with the
        ``utf-8-sig`` codec, like ``sentiment_analysis``.
    transmission_params_EN : ``{to_state: {from_state: probability}}`` as
        returned by ``transmission_params`` (``'START'`` as a source,
        ``'STOP'`` as a destination).
    emission_params_EN : ``{word: {label: probability}}`` as returned by
        ``emission_params``; the ``'new_word'`` entry is used for words that
        are not keys.

    The decoder is the standard Viterbi recursion, run in log space so long
    sentences do not underflow. Missing transitions/emissions count as
    probability zero. If every path has probability zero the result falls
    back to the first state in dictionary order for the affected positions.

    Raises ValueError if a sentence must be tagged but the transition table
    contains no state other than ``'STOP'``.
    """
    import math  # local import: the file-level import block only has codecs

    neg_inf = float('-inf')

    def log_prob(p):
        # log(0) is undefined; -inf loses every max() comparison instead.
        return math.log(p) if p > 0 else neg_inf

    # Taggable states are all transition destinations except 'STOP'.
    states = [state for state in transmission_params_EN if state != 'STOP']
    log_trans = {
        dst: {src: log_prob(p) for src, p in incoming.items()}
        for dst, incoming in transmission_params_EN.items()
    }
    unknown = emission_params_EN.get('new_word', {})

    def transition(src, dst):
        return log_trans.get(dst, {}).get(src, neg_inf)

    def emission(word, state):
        return log_prob(emission_params_EN.get(word, unknown).get(state, 0))

    def decode(sentence):
        # Return the highest-scoring label sequence for a list of words.
        if not sentence:
            return []
        if not states:
            raise ValueError("transition table defines no states to tag with")

        # scores[v]: best log-probability of any path ending in v so far.
        scores = {
            v: transition('START', v) + emission(sentence[0], v)
            for v in states
        }
        backpointers = []
        for word in sentence[1:]:
            new_scores = {}
            pointers = {}
            for v in states:
                best_u = max(states, key=lambda u: scores[u] + transition(u, v))
                pointers[v] = best_u
                new_scores[v] = (
                    scores[best_u] + transition(best_u, v) + emission(word, v)
                )
            backpointers.append(pointers)
            scores = new_scores

        # Pick the best final state including the transition into STOP,
        # then walk the backpointers to recover the full path.
        path = [max(states, key=lambda u: scores[u] + transition(u, 'STOP'))]
        for pointers in reversed(backpointers):
            path.append(pointers[path[-1]])
        path.reverse()
        return path

    def write_sentence(ofile, sentence):
        for word, tag in zip(sentence, decode(sentence)):
            ofile.write(word + " " + tag + '\n')

    sentence = []
    with open(test_file, encoding='utf-8') as ifile, \
            codecs.open(output_file, 'w', 'utf-8-sig') as ofile:
        for line in ifile:
            tokens = line.split()
            if tokens:
                sentence.append(tokens[0])
                continue
            # Blank line: the sentence is complete, tag and emit it.
            write_sentence(ofile, sentence)
            ofile.write('\n')
            sentence = []
        # The last sentence may not be followed by a blank line.
        write_sentence(ofile, sentence)
def calculate_score(k, v):
    """Placeholder for a per-position Viterbi score.

    The definition had no body, which made the file unparseable. What it
    was meant to compute cannot be determined from the signature alone
    (presumably the best score of a path ending at position ``k`` in state
    ``v`` -- TODO confirm), so it fails loudly instead of returning a
    made-up value.
    """
    raise NotImplementedError("calculate_score has no implementation yet")
# Run viterbi_algo on the EN dev input, with EN_viterbi as its output file.
viterbi_algo(EN_test, EN_viterbi, transmission_params_EN, emission_params_EN)
    

{'B-positive': {'B-positive': 0.002652519893899204, 'I-neutral': 0.002793296089385475, 'I-positive': 0.004291845493562232, 'O': 0.017859187178019462, 'START': 0.05191489361702128}, 'I-neutral': {'I-neutral': 0.276536312849162, 'B-neutral': 0.4457831325301205}, 'STOP': {'B-positive': 0.01856763925729443, 'I-neutral': 0.051675977653631286, 'O': 0.06285060103033772, 'B-neutral': 0.016351118760757316, 'I-negative': 0.02, 'I-positive': 0.04291845493562232, 'B-negative': 0.025210084033613446}, 'O': {'B-positive': 0.5251989389920424, 'I-neutral': 0.6508379888268156, 'O': 0.8627933600457928, 'B-neutral': 0.5283993115318416, 'I-negative': 0.78, 'START': 0.7251063829787234, 'I-positive': 0.6695278969957081, 'B-negative': 0.6050420168067226}, 'B-neutral': {'B-positive': 0.005305039787798408, 'I-neutral': 0.01675977653631285, 'O': 0.051230681167716084, 'B-neutral': 0.00946643717728055, 'I-negative': 0.02, 'START': 0.20170212765957446, 'I-positive': 0.004291845493562232, 'B-negative': 0.02521008403