In [3]:


#Libraries
import os
import sys
import math
import copy

# Part 1


In [3]:
#Part 1
def train(filepath):
	print("Training..")
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()

	# Set of all unique tokens in file
	tokens = []
	# Nested dictionary to keep track of emission count
	# {tag: {token: count} }
	emission_count = {} 

	# Iterate through file to update tokens and emission_count
	for line in lines:
		line_split = line.strip().rsplit(" ", 1)
		if len(line_split) == 2:
			token = line_split[0]
			tag = line_split[1]

			if token not in tokens:
				tokens.append(token)

			if tag not in emission_count:
				nested_tag_dict = {}
			else:
				nested_tag_dict = emission_count[tag]
			if token not in nested_tag_dict:
				nested_tag_dict[token] = 1
			else:
				nested_tag_dict[token] += 1
			emission_count[tag] = nested_tag_dict

	return tokens, emission_count


def est_emission_param(emission_count, token, tag):
	tag_dict = emission_count[tag]

	a = tag_dict.get(token, 0)	# Returns 0 if none
	b = sum(tag_dict.values())

	return a / b


def est_emission_param(emission_count, token, tag, k=1):
	tag_dict = emission_count[tag]

	if token != "#UNK#":
		a = tag_dict.get(token, 0)
	else:
		a = k 
	b = sum(tag_dict.values()) + k

	return a / b


def get_sentence_tag(sentence, tokens, emission_count, k=1):
	pred_tags = []

	for word in sentence:
		pred_tag = ""
		max_emission = float('-inf')

		for tag in emission_count:
			if word not in tokens:
				word = "#UNK#"

			if word in emission_count[tag] or word == "#UNK#":
				emission = est_emission_param(emission_count, word, tag, k)
				if emission > max_emission:
					pred_tag = tag 
					max_emission = emission

		pred_tags.append(pred_tag)

	return pred_tags


def evaluate(filepath, tokens, emission_count, k=1):
	print("Evaluating..")
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()

	all_pred_tags = []

	sentence = []
	for line in lines:
		if line != "\n":
			sentence.append(line.strip())
		else:
			pred_tags = get_sentence_tag(sentence, tokens, emission_count, k)
			all_pred_tags += pred_tags + ["\n"]

			sentence = []

	return lines, all_pred_tags


def write_output(filepath, lines, all_pred_tags):
	print("Writing output..")
	with open(filepath, "w", encoding="utf8") as f:
		for i in range(len(lines)):
			word = lines[i].strip()

			if word != "\n":
				tag = all_pred_tags[i]

				if tag != "\n":
					f.write(word + " " + tag)
				f.write("\n")

	print("Output successfully written!")


if __name__ == '__main__':
	root_dir = "./"
	datasets = ["ES", "RU"]

	for dataset in datasets:
		print("For dataset {}:".format(dataset))
		train_path = root_dir + "{}/train".format(dataset)
		evaluation_path = root_dir + "{}/dev.in".format(dataset)

		# Train
		tokens, emission_count = train(train_path)

		# Estimate emission parameters using MLE
		"""
		for tag in emission_count:
			for token in tokens:
				#emission = est_emission_param(emission_count, token, tag)		# Without tackling special word token
				emission = est_emission_param(emission_count, token, tag, k=1)	# With special word token
				if emission != 0:
					print("Emission parameters for {x} given {y}: {emission}".format(x=token, y=tag, emission=emission))
		"""
		
		# Evaluate
		lines, all_pred_tags = evaluate(evaluation_path, tokens, emission_count, k=1)

		# Write output file
		output_path = root_dir + "{}/dev.p1.out".format(dataset)
		write_output(output_path, lines, all_pred_tags)

		print("Dataset {} done.".format(dataset))

	print("Done for all datasets!!")

For dataset ES:
Training..
Evaluating..
Writing output..
Output successfully written!
Dataset ES done.
For dataset RU:
Training..
Evaluating..
Writing output..
Output successfully written!
Dataset RU done.
Done for all datasets!!


# Part 2

In [9]:
#Part 2
def train(filepath):
	print("Training..")
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()
		
	start = "START"
	stop = "STOP"

	# Set of all unique tokens in file
	tokens = []
	# Nested dictionary to keep track of emission count
	# {tag: {token: count} }
	emission_count = {} 

	# Iterate through file to update tokens and emission_count
	for line in lines:
		line_split = line.strip().rsplit(" ", 1)
		if len(line_split) == 2:
			token = line_split[0]
			tag = line_split[1]

			if token not in tokens:
				tokens.append(token)

			if tag not in emission_count:
				nested_tag_dict = {}
			else:
				nested_tag_dict = emission_count[tag]
			if token not in nested_tag_dict:
				nested_tag_dict[token] = 1
			else:
				nested_tag_dict[token] += 1
			emission_count[tag] = nested_tag_dict
			
	return tokens, emission_count


def est_emission_param(emission_count, token, tag, k=1):
	tag_dict = emission_count[tag]

	if token != "#UNK#":
		a = tag_dict.get(token, 0)
	else:
		a = k 
	b = sum(tag_dict.values()) + k

	return a / b


def transition(filepath):
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()
		
	start = "START"
	stop = "STOP"

	# Set of all unique tokens in file
	tokens = []
	# Nested dictionary to keep track of emission count
	# {tag: {token: count} }
	u = start
	transition_count = {} 

	# Iterate through file to update tokens and transition_count
	for line in lines:
		line_split = line.strip().rsplit(" ", 1)
		
		#Case 1
		
		if len(line_split) == 2:
			token = line_split[0]
			v = line_split[1]

			if u not in transition_count:
				u_dict = {}
			else:
				u_dict = transition_count[u]

			if v in u_dict:
				u_dict[v] += 1
			else:
				u_dict[v] = 1

			transition_count[u] = u_dict
			u = v

		#Case 2
		
		else:
			u_dict = transition_count[u]
			v = stop

			if v in u_dict:
				u_dict[v] += 1
			else:
				u_dict[v] = 1

			transition_count[u] = u_dict
			u = start
	print(transition_count)
	return transition_count

def transition_param(transition_count, u, v):
	

	if u not in transition_count:
		a = 0
	else:
		u_dict = transition_count[u]
		a = u_dict.get(v,0)
		b = sum(u_dict.values())

	return a / b

def viterbi_forward(emissions, transitions, words, labels):
	n = len(labels)
	smallest = -9999
	
	states = list(transitions.keys())
	print(states)
	states.remove("START")

	# initialize score dict
	scores = {}

	scores[0] = {}

	for v in states:
		transition_fraction = transition_param(transitions, "START", v)
		if transition_fraction != 0:
			trans = math.log(transition_fraction)
		else:
			trans = smallest

		if labels[0] not in words:
			token = "#UNK#"
		else:
			token = labels[0]

		# Emission Probability
		if ((token in emissions[v]) or (token == "#UNK#")): 
			emmision_fraction = est_emission_param(emissions, token, v)
			emission = math.log(emmision_fraction)
		else:
			emission = smallest

		start = trans + emission
		scores[0][v] = ("START", start)
        
		# State 1 to n
		for i in range(1, n):

			scores[i] = {}
			for v in states:
				findmax = []

				for u in states:
                	# Transition Probability
					transition_fraction = transition_param(transitions, u, v)

					if transition_fraction != 0:
						trans = math.log(transition_fraction)
					else:
						trans = smallest
                # if the word does not exist, assign special token
				if labels[i] not in words:
					token = "#UNK#"
				else:
					token = labels[i]

                # Emission Probability
				if ((token in emissions[v]) or token == "#UNK#"):
					emission_fraction = est_emission_param(emissions, token, v)
					emission = math.log(emission_fraction)
				else:
					emission = smallest

				
				current = scores[i-1][u][1] + trans + emission
				findmax.append(current)
    

            # ARGMAX
			ans = max(findmax)
			state_ans = states[findmax.index(ans)]
			scores[i][v] = (state_ans, ans)

	# STATE N to Stop State
	scores[n] = {}
	stopmax = []
	for u in states:
        # Transition Probability
		transition_fraction = transition_param(transitions, u, "STOP")
		if transition_fraction != 0:
			transition = math.log(transition_fraction)
		else:
			transition = smallest
        
		stopscore = scores[n-1][u][1] + trans
		stopmax.append(stopscore)
    
    # ARGMAX
	stop = max(stopmax)
	state_ans = states[stopmax.index(stop)]
	scores[n] = (state_ans, stop)

	# Backtracking path
	path = ["STOP"]
    # scores[n] = ('O', -308.32462005568965)
	last = scores[n][0]
	path.insert(0, last)

	for k in range(n-1, -1, -1):
		
		last = scores[k][last][0]
		path.insert(0, last)
	return path


if __name__ == '__main__':
	root_dir = "./"
	datasets = ["ES", "RU"]

	for dataset in datasets:
		print("For dataset {}:".format(dataset))
		train_path = root_dir + "{}/train".format(dataset)
		evaluation_path = root_dir + "{}/dev.in".format(dataset)

		# Train
		transition_count = transition(train_path)
		tokens, emission_count = train(train_path)

		with open(evaluation_path, "r", encoding="utf8", errors='ignore') as f:
			lines = f.readlines()

		labels = []
		all_prediction = []
		print(dataset)

		for line in lines:    
			if line != "\n":    
				line = line.strip()
				labels.append(line)
			else:
				# print(emission_count)
				print(transition_count)
				sentence_prediction = viterbi_forward(emission_count, transition_count, tokens, labels)
				sentence_prediction.remove("START")
				sentence_prediction.remove("STOP")
				all_prediction = all_prediction + sentence_prediction
				all_prediction = all_prediction + ["\n"]
				labels = []
        
		assert len(lines) == len(all_prediction)
		print("All words have a tag. Proceeding..")

        # create output file
		with open(root_dir + "{folder}/dev.p2.out".format(folder = dataset), "w", encoding="utf8", errors='ignore') as g:
			for j in range(len(lines)):
				word = lines[j].strip()
				if word != "\n":
					tag = all_prediction[j]
					if(tag != "\n"):
						g.write(word + " " + tag)
						g.write("\n")
					else:
						g.write("\n")
	print("done")
			

For dataset ES:
{'START': {'O': 1918, 'B-negative': 27, 'B-positive': 110, 'B-neutral': 10}, 'O': {'O': 27939, 'B-positive': 1162, 'STOP': 2050, 'B-negative': 402, 'B-neutral': 74}, 'B-positive': {'O': 1100, 'I-positive': 162, 'B-positive': 2, 'STOP': 9, 'B-neutral': 1}, 'B-negative': {'O': 347, 'I-negative': 78, 'STOP': 4}, 'I-negative': {'I-negative': 151, 'O': 78}, 'I-positive': {'O': 160, 'I-positive': 238, 'STOP': 2}, 'B-neutral': {'O': 69, 'I-neutral': 16}, 'I-neutral': {'O': 16, 'I-neutral': 28}}
Training..
ES
{'START': {'O': 1918, 'B-negative': 27, 'B-positive': 110, 'B-neutral': 10}, 'O': {'O': 27939, 'B-positive': 1162, 'STOP': 2050, 'B-negative': 402, 'B-neutral': 74}, 'B-positive': {'O': 1100, 'I-positive': 162, 'B-positive': 2, 'STOP': 9, 'B-neutral': 1}, 'B-negative': {'O': 347, 'I-negative': 78, 'STOP': 4}, 'I-negative': {'I-negative': 151, 'O': 78}, 'I-positive': {'O': 160, 'I-positive': 238, 'STOP': 2}, 'B-neutral': {'O': 69, 'I-neutral': 16}, 'I-neutral': {'O': 16, 'I

KeyError: 'I-neutral'