## Common functions

In [4]:
def eval_part(pred, data_out):
    total_correct = 0 
    total_pred = 0
    total_gold = 0
    for i,j in zip(pred,data_out):
        if i != 'O':
            total_pred +=1
        if j != 'O':
            total_gold +=1
            total_correct += i == j
            
    p_score = total_correct/total_pred
    r_score = total_correct/total_gold
    print("accuracy : " + str(total_correct/len(pred)))
    print("total precision : " + str(total_correct/total_pred))
    print("total recall : " + str(total_correct/total_gold))
    print("net f1 : " + str(2/(1/p_score + 1/r_score)))
    
def write_part(data, pred, part, dirname):
    
    j = 0

    for i in range(len(data)):
        content = data[i].split(" ")
        if len(content)!=1:
            content[-1] = pred[j]
            j+=1
        data[i] = " ".join(content) + "\n"
        
    file1 = open(f"{dirname}/dev.p{part}.out","w+", encoding="utf8")
    file1.write("".join(data))
    file1.close()

## Part 1

In [5]:
def train_part1(data):
    obs_set = set()
    e_prob = {'O':{}, 'B-positive':{}, 'B-neutral':{}, 'B-negative':{}, 'I-positive':{}, 'I-neutral':{}, 'I-negative':{}}
    e_count = {'O':0, 'B-positive':0, 'B-neutral':0, 'B-negative':0, 'I-positive':0, 'I-neutral':0, 'I-negative':0}

    for i in data:
        content = i.split(" ")
        if len(content)!=1:
            obs = " ".join(content[:-1])
            state = content[-1]
            if e_prob[state].get(obs, -1) != -1:
                e_prob[state][obs] += 1
            else:
                e_prob[state][obs] = 1
            e_count[state] += 1
            obs_set.add(" ".join(content[:-1]))
    

    for i in e_prob.keys():
        for j in obs_set:
            tmp = e_prob[i].get(j, 0) 
            e_prob[i][j] = tmp / (e_count[i] + 1)
        e_prob[i]["#UNK#"] = 1 / (e_count[i] + 1)
        
    return e_prob

def pred_part1(word, e_prob):
    
    ans = {}
    for i in e_prob.keys():
        ans[e_prob[i].get(word,e_prob[i]["#UNK#"])] = i
    return ans[max(ans.keys())]

In [6]:
f = open("ES/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))

e_prob = train_part1(data)

f = open("ES/dev.in", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
data = [x for x in data if x]
pred = []
for i in data:
    pred.append(pred_part1(i, e_prob))
f = open("ES/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
write_part(data, pred, 1, 'ES')

accuracy : 0.026015951386251426
total precision : 0.06832917705735661
total recall : 0.42024539877300615
net f1 : 0.11754611754611755


In [14]:
f = open("RU/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))

e_prob = train_part1(data)

f = open("RU/dev.in", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
data = [x for x in data if x]
pred = []
for i in data:
    pred.append(pred_part1(i, e_prob))
f = open("RU/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
write_part(data, pred, 1, 'RU')

accuracy : 0.02471256772829391
total precision : 0.07626427406199021
total recall : 0.29037267080745344
net f1 : 0.12080103359173128


## Part 2

In [5]:
from math import log

In [6]:
def logprod(var):
    result = 0
    for v in var:
        try:
            result += log(v)
        except ValueError:
            return -10^5
    return result

In [7]:
def train_part2(data):
    q_prob = {'START':{} ,'O':{}, 'B-positive':{}, 'B-neutral':{}, 'B-negative':{}, 'I-positive':{}, 'I-neutral':{}, 'I-negative':{}}
    q_count = {'START': 0, 'O':0, 'B-positive':0, 'B-neutral':0, 'B-negative':0, 'I-positive':0, 'I-neutral':0, 'I-negative':0}

    data = ['temp START'] + data
    i = 0
    while i < len(data):
        if data[i] == "":
            data = data[:i] + ["temp STOP", 'temp START'] + data[i+1:]
        i+=1
    data = data[:-1]
    
    i = 0
    while i < len(data)-1:
        prev_tag = data[i].split(" ")[-1]
        curr_tag = data[i+1].split(" ")[-1]
        if prev_tag == "STOP":
            i+=1
            continue
        if q_prob[prev_tag].get(curr_tag):
            q_prob[prev_tag][curr_tag] += 1
        else:
            q_prob[prev_tag][curr_tag] = 1
        q_count[prev_tag] += 1
        i+=1

    for i in q_prob.keys():
        for j in q_prob[i].keys():
            q_prob[i][j] /= q_count[i]
        
    return q_prob

def pred_part2(sentence, e_prob, q_prob):
    
    pie = {0:{'START': 1, 'O':0, 'B-positive':0, 'B-neutral':0, 'B-negative':0, 'I-positive':0, 'I-neutral':0, 'I-negative':0}}
    ls = ['B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']
    path = {0:'START'}
    i = 1
    max_path = 0
    while i<len(sentence)+1:
        pie[i] = {}
        path[i] = {}
        # current state
        for u in ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
            max_path = 'O'
            # previous state
            for v in ['START', 'O','B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
                p = pie[i-1].get(v, 0) * q_prob[v].get(u,0) * e_prob[u].get(sentence[i-1],e_prob[u]["#UNK#"])
                
                if p > pie[i].get(u,0):
                    max_path = v                
    
                pie[i][u] = pie[i-1][max_path] * q_prob[max_path].get(u,0) * e_prob[u].get(sentence[i-1],e_prob[u]["#UNK#"])
                path[i][u] = max_path
    
        i+=1
        
    
    u = "STOP"
    max_path = 'O'
    pie[i] = {}
    path[i] = {}
    for v in q_prob.keys():
        p = pie[i-1].get(v, 0) * q_prob[v].get(u,0)
        if p > pie[i].get(u,0):
            max_path = v                
    
    pie[i][u] = pie[i-1][max_path] * q_prob[max_path].get(u,0)
    path[i][u] = max_path

    best_path = []
    
    while i!=0:
        best_path.append(path[i][u])
        u = path[i][u]
        i-=1
    best_path.reverse()
    
    return best_path[1:]

In [8]:
f = open("ES/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
e_prob = train_part1(data)
q_prob = train_part2(data)

f = open("ES/dev.in", "r", encoding="utf8")
data = []
temp = []
for r in f:
    if r == "\n":
        data.append(temp)
        temp = []
        continue
    temp.append(r.strip("\n"))
pred = []
for i in data:
    pred += pred_part2(i, e_prob, q_prob)

f = open("ES/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
write_part(data, pred, 2, 'ES')

accuracy : 0.025826053930877325
total precision : 0.13807106598984772
total recall : 0.4171779141104294
net f1 : 0.2074752097635393


In [10]:
f = open("RU/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))

e_prob = train_part1(data)
q_prob = train_part2(data)

f = open("RU/dev.in", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
data = [x for x in data if x]
pred = []
for i in data:
    pred.append(pred_part1(i, e_prob))
f = open("RU/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
write_part(data, pred, 2, 'RU')

accuracy : 0.02471256772829391
total precision : 0.07626427406199021
total recall : 0.29037267080745344
net f1 : 0.12080103359173128


## Part 3

In [111]:
def pred_part3(sentence, e_prob, q_prob, top_k=2):
    
    pie = {0:{'START': 1, 'O':0, 'B-positive':0, 'B-neutral':0, 'B-negative':0, 'I-positive':0, 'I-neutral':0, 'I-negative':0}}
    path = {0:'START'}
    
    # initialize
    for state in pie[0].keys():
        pie[0][state] = [pie[0][state]]*top_k 

    path[0] = [pie[0]]*top_k 
    
    i = 1
    max_path = 0
    while i<len(sentence)+1:
        print("Position ",i)
        pie[i] = {}
        path[i] = {}
        # current state
        for u in ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
            v_k = {}
            # previous state
            for v in ['START', 'O','B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
                # top-k
                for k in range(top_k):
                    prev_p = 0
                    if v in pie[i-1].keys():
                        prev_p = pie[i-1][v][k]
                    p =  prev_p * q_prob[v].get(u,0) * e_prob[u].get(sentence[i-1],e_prob[u]["#UNK#"])
                    v_k[(v, k)] = p            

            v_k = sorted(v_k.items(), key=lambda x: x[1], reverse=True)
            pie[i][u] = [item[1] for item in v_k[:top_k]]
            print(pie[i])
            path[i][u] = [item[0][0] for item in v_k[:top_k]]
#             print(path[i])
    
        i+=1
        
    
    u = "STOP"
    pie[i] = {}
    path[i] = {}
    v_k = {}
    for v in q_prob.keys():
        for k in range(top_k):
            prev_p = 0
            if v in pie[i-1].keys():
                prev_p = pie[i-1][v][k]
            p = prev_p * q_prob[v].get(u,0)
            v_k[(v, k)] = p            

    v_k = sorted(v_k.items(), key=lambda x: x[1], reverse=True)
    pie[i][u] = v_k[:top_k]
    path[i][u] = [item[0][0] for item in v_k[:top_k]]

    best_path = []
    
    while i!=0:
        print(path[i][u])
#         best_path+=path[i][u]
#         u = path[i][u]
        i-=1
    best_path.reverse()
        
    print(best_path)
    return best_path[1:]

In [15]:
f = open("ES/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
e_prob = train_part1(data)
q_prob = train_part2(data)

In [112]:
f = open("ES/dev.in", "r", encoding="utf8")
data = []
temp = []
for r in f:
    if r == "\n":
        data.append(temp)
        temp = []
        continue
    temp.append(r.strip("\n"))
    
# data = data[0]
pred = []
for i in data:
    pred += pred_part3(i, e_prob, q_prob)

f = open("ES/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
# write_part(data, pred)

Position  1
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05]}
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05], 'B-positive': [0.0, 0.0]}
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05], 'B-positive': [0.0, 0.0], 'B-neutral': [0.0, 0.0]}
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05], 'B-positive': [0.0, 0.0], 'B-neutral': [0.0, 0.0], 'B-negative': [3.040711751787826e-05, 3.040711751787826e-05]}
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05], 'B-positive': [0.0, 0.0], 'B-neutral': [0.0, 0.0], 'B-negative': [3.040711751787826e-05, 3.040711751787826e-05], 'I-positive': [0.0, 0.0]}
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05], 'B-positive': [0.0, 0.0], 'B-neutral': [0.0, 0.0], 'B-negative': [3.040711751787826e-05, 3.040711751787826e-05], 'I-positive': [0.0, 0.0], 'I-neutral': [0.0, 0.0]}
{'O': [2.9366812929114513e-05, 2.9366812929114513e-05], 'B-positive': [0.0, 0.0], 'B-neutral': [0.0, 0.0], 'B-negative': [3.040711751787826e-05, 3.040711751787

KeyError: 'STOP'

## Part 4

In [11]:
def initialize_train_part4(val):
    state_ls = ['PRESTART', 'START', 'O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative', 'STOP']

    q_dict = {}
    for state1 in state_ls:
        for state2 in state_ls:
#             if state2!="START" and state1!="STOP":
                q_dict[(state1, state2)] = val
            
#     q_dict[("PRESTART", "START")] = val
    return q_dict

def train_part4(data):
    
    q_prob = initialize_train_part4({})
    q_count = initialize_train_part4(0)

    prev_prev_tag = "PRESTART"
    prev_tag = "START"
    for line in data:
        content = line.strip().rsplit(' ', 1)
            
        # empty line/char indicates end of sentence
        if len(content)==1:
            if q_prob[(prev_prev_tag, prev_tag)].get("STOP"):
                q_prob[(prev_prev_tag, prev_tag)]["STOP"] += 1
            else:
                q_prob[(prev_prev_tag, prev_tag)]["STOP"] = 1
            q_count[(prev_prev_tag, prev_tag)] += 1
            prev_prev_tag = "PRESTART"
            prev_tag = "START"
            continue
            
        curr_tag = content[-1]
        if q_prob[(prev_prev_tag, prev_tag)].get(curr_tag):
            q_prob[(prev_prev_tag, prev_tag)][curr_tag] += 1
        else:
            q_prob[(prev_prev_tag, prev_tag)][curr_tag] = 1
        q_count[(prev_prev_tag, prev_tag)] += 1
        
        prev_prev_tag = prev_tag
        prev_tag = curr_tag

    for i in q_prob.keys():
        for j in q_prob[i].keys():
            if q_count[i]>0:
                q_prob[i][j] /= q_count[i]
        
    return q_prob

def pred_part4(sentence, e_prob, q_prob):
    
    pie = {}
    pie[0] = initialize_train_part4(0)
    pie[0][("PRESTART", "START")] = 1
    path = {0:("PRESTART", "START")}

    i = 1
    max_path = 0
    
    while i<len(sentence)+1:
        pie[i] = {}
        path[i] = {}
        # current state
        for u in ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
            max_path = 'O'
            # previous state
            for v in ['START', 'O','B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
#                 if i>1 and v=="START":
#                         continue
                # 2nd previous state
                for t in ['PRESTART', 'START', 'O','B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
#                     if i>1 and t=="PRESTART":
#                         continue
                    q = 0 if q_prob.get((t, v), -1) == -1 else q_prob[(t, v)].get(u,0)
                    p = pie[i-1].get((t, v), 0) * q * e_prob[u].get(sentence[i-1],e_prob[u]["#UNK#"])

                    if p > pie[i].get((v, u),0):
                        max_path = t  
                    
                    q = 0 if q_prob.get((max_path, v), -1) == -1 else q_prob[(max_path, v)].get(u,0)
                    pie[i][(v, u)] = pie[i-1].get((max_path, v), 0) * q * e_prob[u].get(sentence[i-1],e_prob[u]["#UNK#"])
                    path[i][(v, u)] = max_path
    
        i+=1
        
    
    u = "STOP"
    max_path = 'O'
    pie[i] = {}
    path[i] = {}
    for v in ['O','B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
        for t in ['O','B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']:
            q = 0 if q_prob.get((t,v), -1) == -1 else q_prob[(t, v)].get(u,0)
            p = pie[i-1].get((t, v), 0) * q
            
            if p > pie[i].get((v, u),0):
                max_path = t   
            
            q = 0 if q_prob.get((max_path, v), -1) == -1 else q_prob[(max_path, v)].get(u,0)
            pie[i][(v, u)] = pie[i-1].get((max_path, v), 0) * q
            path[i][(v, u)] = max_path
    
    best_path = []
    state_2seq = max(pie[i], key=pie[i].get)
    while i!=0:
        best_path.append(state_2seq[0])
        state_2seq = (path[i][state_2seq], state_2seq[0])
        i-=1
    best_path.reverse()
        
    return best_path[1:]


In [12]:
f = open("ES/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
    
e_prob = train_part1(data)
q_prob = train_part4(data)

f = open("ES/dev.in", "r", encoding="utf8")
data = []
temp = []
for r in f:
    if r == "\n":
        data.append(temp)
        temp = []
        continue
    temp.append(r.strip("\n"))
    
pred = []
for i in data:
    pred += pred_part4(i, e_prob, q_prob)

f = open("ES/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
write_part(data, pred, 4, 'ES')

accuracy : 0.010064565134827194
total precision : 0.7571428571428571
total recall : 0.16257668711656442
net f1 : 0.2676767676767677


In [13]:
f = open("RU/train", "r", encoding="utf8")
data = []
for r in f:
    data.append(r.strip("\n"))
    
e_prob = train_part1(data)
q_prob = train_part4(data)

f = open("RU/dev.in", "r", encoding="utf8")
data = []
temp = []
for r in f:
    if r == "\n":
        data.append(temp)
        temp = []
        continue
    temp.append(r.strip("\n"))
    
pred = []
for i in data:
    pred += pred_part4(i, e_prob, q_prob)

f = open("RU/dev.out", "r", encoding="utf8")
data = []
data.append
for r in f:
    data.append(r.strip("\n"))
data_out = [x.split(' ')[1] for x in data if x]

eval_part(pred, data_out)
write_part(data, pred, 4, 'RU')

accuracy : 0.01083652702524118
total precision : 0.6212121212121212
total recall : 0.12732919254658384
net f1 : 0.21134020618556698
