In [None]:


#Libraries
import os
import sys
import math
import copy

# Part 1


In [None]:
#Part 1
def train(filepath):
	"""Count how often each tag emits each token in a labelled training file.

	Every line that splits on its last space into exactly two parts is read
	as ``<token> <tag>``; all other lines (e.g. blank sentence separators)
	are ignored.

	Args:
		filepath: Path of the UTF-8 encoded training file.

	Returns:
		A ``(tokens, emission_count)`` tuple. ``tokens`` is the list of unique
		tokens in order of first appearance; ``emission_count`` is a nested
		dict of the form ``{tag: {token: count}}``.
	"""
	print("Training..")
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()

	# Insertion-ordered dict used as an ordered set: de-duplication is O(1)
	# per token instead of scanning a growing list for every line.
	seen_tokens = {}
	# {tag: {token: count} }
	emission_count = {}

	for line in lines:
		line_split = line.strip().rsplit(" ", 1)
		if len(line_split) != 2:
			continue
		token, tag = line_split

		seen_tokens[token] = None
		tag_counts = emission_count.setdefault(tag, {})
		tag_counts[token] = tag_counts.get(token, 0) + 1

	return list(seen_tokens), emission_count


def est_emission_param(emission_count, token, tag):
	"""Return the unsmoothed estimate count(tag -> token) / count(tag).

	NOTE: a later definition of ``est_emission_param`` in this file (the one
	taking ``k``) rebinds this name, so this version is only reachable until
	that definition is executed.

	Args:
		emission_count: Nested dict ``{tag: {token: count}}``.
		token: Token whose emission estimate is wanted.
		tag: Tag to condition on; a missing tag raises ``KeyError``.

	Returns:
		The fraction of ``tag``'s emissions that are ``token`` (0 when the
		token was never seen with that tag).
	"""
	counts_for_tag = emission_count[tag]
	return counts_for_tag.get(token, 0) / sum(counts_for_tag.values())


def est_emission_param(emission_count, token, tag, k=1):
	"""Return the emission estimate for ``token`` given ``tag`` with #UNK# smoothing.

	The special token ``"#UNK#"`` is given a pseudo-count of ``k``; every
	other token uses its observed count (0 if unseen with this tag). The
	denominator is the tag's total emission count plus ``k``.

	Args:
		emission_count: Nested dict ``{tag: {token: count}}``.
		token: Token to score, or ``"#UNK#"`` for the unknown-word token.
		tag: Tag to condition on; a missing tag raises ``KeyError``.
		k: Pseudo-count assigned to ``"#UNK#"``.

	Returns:
		The estimated emission value as a float.
	"""
	counts_for_tag = emission_count[tag]
	numerator = k if token == "#UNK#" else counts_for_tag.get(token, 0)
	denominator = sum(counts_for_tag.values()) + k
	return numerator / denominator


def get_sentence_tag(sentence, tokens, emission_count, k=1):
	"""Predict one tag per word by picking the tag with the highest emission.

	Words that are not in ``tokens`` are scored as ``"#UNK#"``. For a known
	word only tags that actually emitted it are considered; for ``"#UNK#"``
	every tag is considered. Ties keep the first tag in ``emission_count``
	iteration order.

	Args:
		sentence: Iterable of words.
		tokens: Collection of tokens seen in training.
		emission_count: Nested dict ``{tag: {token: count}}``.
		k: Pseudo-count forwarded to ``est_emission_param``.

	Returns:
		List of predicted tags, one per word; an entry is ``""`` when no tag
		qualified (e.g. ``emission_count`` is empty).
	"""
	# Hash-based membership; the previous list scan was repeated for every
	# (word, tag) pair. Reuse the argument when it is already hash-based.
	if isinstance(tokens, (set, frozenset, dict)):
		known_tokens = tokens
	else:
		known_tokens = set(tokens)

	pred_tags = []

	for word in sentence:
		# Decide once per word (not once per tag) whether it is unknown.
		if word not in known_tokens:
			word = "#UNK#"
		is_unknown = word == "#UNK#"

		pred_tag = ""
		max_emission = float('-inf')

		for tag, tag_counts in emission_count.items():
			if not is_unknown and word not in tag_counts:
				continue
			emission = est_emission_param(emission_count, word, tag, k)
			if emission > max_emission:
				pred_tag = tag
				max_emission = emission

		pred_tags.append(pred_tag)

	return pred_tags


def evaluate(filepath, tokens, emission_count, k=1):
	"""Tag every sentence of an unlabelled file, one word per line.

	Sentences are separated by lines that are exactly ``"\\n"``. The returned
	tag list holds one entry per input line: the predicted tag for a word
	line and the marker ``"\\n"`` for a separator line, so it can be consumed
	index-by-index alongside ``lines``.

	Args:
		filepath: Path of the UTF-8 encoded input file.
		tokens: Collection of tokens seen in training.
		emission_count: Nested dict ``{tag: {token: count}}``.
		k: Pseudo-count forwarded to ``get_sentence_tag``.

	Returns:
		A ``(lines, all_pred_tags)`` tuple: the raw lines of the file and the
		aligned list of predicted tags / ``"\\n"`` markers.
	"""
	print("Evaluating..")
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()

	all_pred_tags = []

	sentence = []
	for line in lines:
		if line != "\n":
			sentence.append(line.strip())
			continue

		all_pred_tags += get_sentence_tag(sentence, tokens, emission_count, k)
		all_pred_tags.append("\n")
		sentence = []

	# A file that does not end with a blank line still has a pending
	# sentence here; tag it so the predictions stay aligned with `lines`.
	if sentence:
		all_pred_tags += get_sentence_tag(sentence, tokens, emission_count, k)

	return lines, all_pred_tags


def write_output(filepath, lines, all_pred_tags):
	"""Write ``<word> <tag>`` lines (and blank separators) to ``filepath``.

	``lines`` and ``all_pred_tags`` are consumed in lockstep. A tag equal to
	``"\\n"`` marks a sentence separator and produces an empty output line;
	any other tag is written after the stripped input line.

	Args:
		filepath: Destination path; the file is overwritten (UTF-8).
		lines: Raw input lines.
		all_pred_tags: One tag (or ``"\\n"`` marker) per entry of ``lines``;
			extra trailing entries are ignored.

	Raises:
		IndexError: If ``all_pred_tags`` has fewer entries than ``lines``.
	"""
	print("Writing output..")
	# Fail before touching the file instead of leaving a partial output.
	if len(all_pred_tags) < len(lines):
		raise IndexError("all_pred_tags has fewer entries than lines")

	with open(filepath, "w", encoding="utf8") as f:
		for line, tag in zip(lines, all_pred_tags):
			if tag != "\n":
				f.write(line.strip() + " " + tag)
			f.write("\n")

	print("Output successfully written!")


if __name__ == '__main__':
	# Part 1 driver: for each dataset, count emissions on the training file,
	# tag the dev input with the emission-only predictor and write the result.
	root_dir = "./"
	datasets = ["ES", "RU"]

	for dataset in datasets:
		print("For dataset {}:".format(dataset))

		# All three paths live inside the dataset's own directory.
		train_path = root_dir + "{}/train".format(dataset)
		evaluation_path = root_dir + "{}/dev.in".format(dataset)
		output_path = root_dir + "{}/dev.p1.out".format(dataset)

		# Train
		tokens, emission_count = train(train_path)

		# Disabled debug dump of the estimated emission parameters (MLE):
		#
		# for tag in emission_count:
		# 	for token in tokens:
		# 		#emission = est_emission_param(emission_count, token, tag)		# Without tackling special word token
		# 		emission = est_emission_param(emission_count, token, tag, k=1)	# With special word token
		# 		if emission != 0:
		# 			print("Emission parameters for {x} given {y}: {emission}".format(x=token, y=tag, emission=emission))

		# Evaluate
		lines, all_pred_tags = evaluate(evaluation_path, tokens, emission_count, k=1)

		# Write output file
		write_output(output_path, lines, all_pred_tags)

		print("Dataset {} done.".format(dataset))

	print("Done for all datasets!!")

# Part 2

In [None]:
#Part 2
def train(filepath):
	"""Collect the vocabulary and per-tag emission counts from a training file.

	A line contributes only if splitting it on its last space yields exactly
	two parts, read as ``<token> <tag>``; other lines are skipped.

	Args:
		filepath: Path of the UTF-8 encoded training file.

	Returns:
		A ``(tokens, emission_count)`` tuple. ``tokens`` lists each distinct
		token once, in order of first appearance; ``emission_count`` maps
		``tag -> {token: count}``.
	"""
	print("Training..")
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()

	# Ordered de-duplication through dict keys: constant-time membership,
	# unlike the linear `token not in list` scan it replaces.
	vocabulary = {}
	# {tag: {token: count} }
	emission_count = {}

	for line in lines:
		parts = line.strip().rsplit(" ", 1)
		if len(parts) == 2:
			token, tag = parts
			vocabulary.setdefault(token, True)

			per_tag = emission_count.setdefault(tag, {})
			per_tag[token] = per_tag.get(token, 0) + 1

	return list(vocabulary), emission_count


def est_emission_param(emission_count, token, tag, k=1):
	"""Estimate the emission value of ``token`` under ``tag``, reserving ``k`` for #UNK#.

	Args:
		emission_count: Nested dict ``{tag: {token: count}}``.
		token: Token to score; ``"#UNK#"`` is scored with the pseudo-count ``k``.
		tag: Tag to condition on; a missing tag raises ``KeyError``.
		k: Pseudo-count for ``"#UNK#"``, also added to the denominator.

	Returns:
		``count / (total emissions of tag + k)``, where ``count`` is ``k`` for
		``"#UNK#"`` and the observed count (0 if unseen) otherwise.
	"""
	per_tag = emission_count[tag]

	if token == "#UNK#":
		hits = k
	else:
		hits = per_tag.get(token, 0)

	return hits / (sum(per_tag.values()) + k)


def transition(filepath):
	"""Count tag-to-tag transitions, including START and STOP, in a training file.

	A line that splits on its last space into two parts is read as
	``<token> <tag>`` and records a transition from the previous tag (or
	``"START"`` at the beginning of a sentence) to this tag. Any other line
	ends the current sentence and records a transition to ``"STOP"``.

	Compared with a naive scan, empty sentences (leading or repeated blank
	lines) record nothing, and a sentence left open at end of file is still
	closed with a transition to ``"STOP"``.

	Args:
		filepath: Path of the UTF-8 encoded training file.

	Returns:
		Nested dict ``{previous_tag: {next_tag: count}}``.
	"""
	with open(filepath, "r", encoding="utf8") as f:
		lines = f.readlines()

	start = "START"
	stop = "STOP"

	# {previous_tag: {next_tag: count} }
	transition_count = {}

	def record(prev_tag, next_tag):
		# Increment the count of prev_tag -> next_tag.
		row = transition_count.setdefault(prev_tag, {})
		row[next_tag] = row.get(next_tag, 0) + 1

	u = start
	for line in lines:
		line_split = line.strip().rsplit(" ", 1)

		if len(line_split) == 2:
			# Inside a sentence: previous tag -> current tag.
			v = line_split[1]
			record(u, v)
			u = v
		elif u != start:
			# Sentence boundary; only close a sentence that has content.
			record(u, stop)
			u = start

	# No trailing blank line: close the last sentence explicitly.
	if u != start:
		record(u, stop)

	return transition_count

def transition_param(emission_count, u, v):
	"""Return the estimated probability of moving from tag ``u`` to tag ``v``.

	Args:
		emission_count: Nested dict ``{previous_tag: {next_tag: count}}``
			(transition counts, despite the parameter name).
		u: Tag being transitioned from.
		v: Tag being transitioned to.

	Returns:
		``count(u -> v) / count(u -> anything)`` as a float, or ``0.0`` when
		``u`` has no recorded outgoing transitions.
	"""
	u_dict = emission_count.get(u)
	# Unknown or empty source tag: previously this path left the denominator
	# undefined and crashed; there is simply no probability mass to report.
	if not u_dict:
		return 0.0

	total = sum(u_dict.values())
	if total == 0:
		return 0.0

	return u_dict.get(v, 0) / total

def viterbi_forward(emissions, transitions, words, labels):
	"""Start of a Viterbi forward pass (unfinished).

	Currently this only derives, for every state other than ``"START"``, the
	log of the ``START -> state`` transition estimate (``-inf`` when the
	estimate is 0). The value is not stored and nothing is returned.

	NOTE(review): ``emissions`` and ``words`` are not used yet and the
	recursion over the sentence is missing - the body still has to be
	completed before this function is usable.

	Args:
		emissions: Unused so far.
		transitions: Nested dict ``{previous_tag: {next_tag: count}}``; must
			contain the key ``"START"`` (``ValueError`` otherwise).
		words: Unused so far.
		labels: Sized collection; only its length is read.
	"""
	n = len(labels)  # not used yet
	smallest = float('-inf')

	states = list(transitions.keys())
	states.remove("START")

	for v in states:
		transition_fraction = transition_param(transitions, "START", v)
		if transition_fraction != 0:
			# Log-space score; log(0) is undefined, hence the -inf fallback.
			trans = math.log(transition_fraction)
		else:
			trans = smallest
	



			

In [None]:
#Part 2
def transition(data):
    """Estimate transition probabilities between tags from tokenised lines.

    ``data`` holds one list of fields per input line. A line that is empty or
    whose first field is empty ends the current sentence; otherwise the last
    field is taken as the tag (so tokens that themselves contain spaces do
    not shift the tag position). Lines with a single non-empty field carry no
    tag and are ignored. Empty sentences are skipped, and a sentence still
    open at the end of ``data`` is kept.

    Args:
        data: Iterable of field lists, e.g. ``line.split(" ")`` per line.

    Returns:
        Dict mapping ``(u, v)`` tag pairs - including ``"START"`` as ``u`` and
        ``"STOP"`` as ``v`` - to ``count(u -> v) / count(u)``.
    """
    START = "START"
    STOP = "STOP"

    # Group the tags into sentences.
    sentences = []
    sequence = []
    for line in data:
        if not line or not line[0]:  # sentence separator
            if sequence:
                sentences.append(sequence)
                sequence = []
        elif len(line) >= 2:
            sequence.append(line[-1])
    if sequence:  # no trailing separator after the last sentence
        sentences.append(sequence)

    # Every sentence contributes exactly one START and one STOP.
    count_y = {START: len(sentences), STOP: len(sentences)}
    count_u_to_v = {}

    for seq in sentences:
        prev_y = START
        for curr_y in seq + [STOP]:
            key = (prev_y, curr_y)
            count_u_to_v[key] = count_u_to_v.get(key, 0) + 1

            if curr_y != STOP:
                count_y[curr_y] = count_y.get(curr_y, 0) + 1
            prev_y = curr_y

    return {
        (u, v): count / count_y[u]
        for (u, v), count in count_u_to_v.items()
    }

# Load the ES training file as one list of space-separated fields per line
# (a blank line becomes [""]), then estimate the transition probabilities.
with open("ES/train", "r", encoding="utf-8") as f:
    training_data = [raw_line.strip("\n").split(" ") for raw_line in f]

c = transition(training_data)