In [1]:
import json
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Task 1

In [2]:
# file paths 
train_file = "./data/train"
dev_file = "./data/dev"
test_file = "./data/test"

def dataset_preparation(file_path, columnnames):
    dataset = pd.read_csv(file_path, sep="\t", header=None, names=columnnames)    
    return dataset

train_dataset = dataset_preparation(train_file, columnnames=["index", "word", "pos_tags" ])
dev_dataset = dataset_preparation(dev_file, columnnames=["index", "word", "pos_tags" ])
test_dataset = dataset_preparation(test_file, columnnames = ["index", "word"])

In [3]:
# Vocabulary Creation
threshold = 2
vocab_count = train_dataset.word.value_counts() 
handle_rare = pd.DataFrame({"word": vocab_count.index, "occurrences" : vocab_count.values})
handle_rare.loc[handle_rare.occurrences < threshold, "word"] = "<unk>"
count_unk = handle_rare.word.value_counts()
word = count_unk.index.to_list()
occurrence = [ vocab_count[w] if w != "<unk>" else count_unk[w] for w in word ]
vocabulary = pd.DataFrame({"word": word, "occurrences" : occurrence})
vocabulary = pd.concat([vocabulary.loc[:0],vocabulary.loc[1:].sort_values(by=["occurrences"], ascending=False)], ignore_index=True)
vocabulary["index"] = [ i for i in range( vocabulary.shape[0])]
vocabulary = vocabulary[["word", "index", "occurrences"]]

In [4]:
# What is the selected threshold for unknown words replacement? What is the total size of your vocabulary ?
print( "Total Size of my vocabulary (including <unk>) :", vocabulary.shape[0])

# what is the total occurrences of the special token ‘< unk >’ after replacement?
print( "Total occurances of the special token <unk> :", vocabulary.loc[0].occurrences)

# save vocabulary to vocab.txt file
vocabulary.to_csv("vocab.txt", sep="\t", index=None, header=False)

Total Size of my vocabulary (including <unk>) : 23182
Total occurances of the special token <unk> : 20011


# Task 2

In [5]:
# list of unique tags and their occurances
unique_tags_count = train_dataset.pos_tags.value_counts()
tags_set = unique_tags_count.index
tags = train_dataset.pos_tags
transition = {tag: {t: 0 for t in tags_set} for tag in tags_set}
initial_transition = train_dataset[train_dataset["index"] == 1].pos_tags.to_list()
transition["<start>"]= {t: 0 for t in tags_set}
for tag in initial_transition:
    transition["<start>"][tag] += 1
    
for index in range(1, len(tags)):
    transition[tags[index-1]][tags[index]] += 1
    
for tag in transition.keys():
    tag_count = unique_tags_count[tag] if tag != "<start>" else len(initial_transition)
    for t in transition[tag].keys():
        if  transition[tag][t] == 0:
            transition[tag][t] = 1e-10
        else:
            transition[tag][t] /= tag_count

transition_object = {}
for tag in transition.keys():
    for t in transition[tag].keys():
        if transition[tag][t] != 0:
            transition_object[f"({tag},{t})"] = transition[tag][t]

In [6]:
vocabulary 
vocab_with_pos = []
for index in range(train_dataset.shape[0]):
    data = train_dataset.loc[index]
    if data.word in vocabulary.word.to_list():
        vocab_with_pos.append([data.word, data.pos_tags])
    else:
        vocab_with_pos.append(["<unk>", data.pos_tags])

emission = {tag: {word: 0 for word in vocabulary.word.to_list()} for tag in tags_set}

for word, pos_tag in vocab_with_pos:
    emission[pos_tag][word] += 1

for tag in emission.keys():
    for word in emission[tag].keys():
        if emission[tag][word] == 0:
            emission[tag][word] = 1e-10
        else:
            tag_count = unique_tags_count[tag]
            emission[tag][word] /= tag_count
        
# creating emission object
emission_object = {}
for tag in emission.keys():
    for word in emission[tag].keys():
        if emission[tag][word]:
            emission_object[f"({tag},{word})"] = emission[tag][word]

In [7]:
print("Total number of Transition parameters :", len(transition_object))
print("Total number of emission parameters :", len(emission_object))

Total number of Transition parameters : 2070
Total number of emission parameters : 1043190


In [8]:
with open("hmm.json", 'w', encoding='utf-8') as file:
    json.dump( {"transition": transition_object, "emission": emission_object} , file, ensure_ascii=False)

## Task 3

In [9]:
def greedy_decoding(sentences, tags_set = tags_set, transition = transition, transition_object= transition_object,
                    emission_object = emission_object):
    tagPredictions = []
    tagScores = []
    for sent in sentences:
        previousTag = "<start>"
        sentPrediction = []
        sentScore = []
        for i in range(len(sent)):
            highestScore = -1
            
            for j in range(len(tags_set)):
                currScore = 1
                if i == 0:
                    currScore *= transition["<start>"][tags_set[j]]
                else:
                    if str("(" + previousTag + "," + tags_set[j] + ")") in transition_object:
                        currScore *= transition_object["(" + previousTag + "," + tags_set[j] + ")"]
                if str("(" + tags_set[j] + "," + str(sent[i]) + ")") in emission_object:
                    currScore *= emission_object["(" + tags_set[j] + "," + str(sent[i]) + ")"]
                else:
                    currScore *= emission_object["(" + tags_set[j] + "," + "<unk>" + ")"]
                if(currScore > highestScore):
                    highestScore = currScore
                    highestProbabilityTag = tags_set[j]
            
            previousTag = highestProbabilityTag
            sentPrediction.append(previousTag)
            sentScore.append(highestScore)

        tagPredictions.append(sentPrediction)
        tagScores.append(sentScore)
    
    return tagPredictions, tagScores

In [10]:
dev_sentences = []
dev_sentence = []
for index, row in dev_dataset.iterrows():
    if row["index"] == 1:
        dev_sentences.append(dev_sentence)
        dev_sentence = []
        
    dev_sentence.append(row["word"])
dev_sentences.append(dev_sentence)

prediction, scores  = greedy_decoding(dev_sentences)
flatten_prediction = [j for sub in prediction for j in sub]
dev_dataset["prediction"] = flatten_prediction
dev_dataset[["word","index","prediction"]].to_csv("dev_greedy.out", sep="\t", header=False, index=False)

In [11]:
! python eval.py -p ./dev_greedy.out -g ./data/dev

total: 131768, correct: 123218, accuracy: 93.51%


In [12]:
test_sentences = []
test_sentence = []
for index, row in test_dataset.iterrows():
    if row["index"] == 1:
        test_sentences.append(test_sentence)
        test_sentence = []        

        
    test_sentence.append(row["word"])
test_sentences.append(test_sentence)

prediction, scores  = greedy_decoding(test_sentences)
flatten_prediction = [j for sub in prediction for j in sub]
test_dataset["prediction"] = flatten_prediction
test_dataset[["word","index","prediction"]].to_csv("greedy.out", sep="\t", header=False, index=False)


In [13]:
# ! python eval.py -p ./greedy.out -g ./test

# Task 4

In [14]:
def viterbi_decoding(sentence, tags_set = tags_set, transition = transition, transition_object= transition_object,
                    emission_object = emission_object):
    n = len(tags_set)
    viterbi_list = []
    data = {}
    for t in tags_set:
        if "(" + t + "," + str(sentence[0]) + ")" in emission_object:
            viterbi_list.append(transition["<start>"][t] * emission_object["(" + t + "," + str(sentence[0]) + ")"])
        else:
            # print(" mein ja raha hu mujhe yeh nahi mill raha hai" , "(" + t + "," + str(sentence[0]) + ")" )
            viterbi_list.append(transition["<start>"][t] * emission_object["(" + t + "," + "<unk>" + ")"])

    for i,word in enumerate(sentence):
        if i == 0: continue
        temp_list = [None] * n
        for j,tag in enumerate(tags_set):
            
            score = -1
            val = 1
            for k, prob in enumerate(viterbi_list):
                if str("(" + tags_set[k] + "," + tag + ")") in transition_object and str("(" + tag + "," + str(word) + ")") in emission_object:
                    val = prob * transition_object["(" + tags_set[k] + "," + tag + ")"] * emission_object["(" + tag + "," + str(word) + ")"]
                else:
                    # print(" mein ja raha hu mujhe yeh nahi mill raha hai" , str("(" + tag + "," + str(word) + ")"))                  
                    val = prob * transition_object["(" + tags_set[k] + "," + tag + ")"] * emission_object["(" + tag + "," + "<unk>" + ")"]
                if(score < val):
                    score = val
                    data[str(i) + ", " + tag] = [tags_set[k], val]
            temp_list[j] = score
        viterbi_list = [x for x in temp_list]
    
    return data, viterbi_list

In [15]:
def viterbi_backward(tags_set, data, viterbi_list):
    num_states = len(tags_set)
    n = len(data) // num_states
    best_sequence = []
    best_sequence_breakdown = []
    x = tags_set[np.argmax(np.asarray(viterbi_list))]
    best_sequence.append(x)

    for i in range(n, 0, -1):
        val = data[str(i) + ', ' + x][1]
        x = data[str(i) + ', ' + x][0]
        best_sequence = [x] + best_sequence
        best_sequence_breakdown =  [val] + best_sequence_breakdown
    
    return best_sequence, best_sequence_breakdown

In [16]:
c = []
v = []
for sentence in dev_sentences[1:]:
    a, b = viterbi_decoding(sentence)
    c.append(a)
    v.append(b)

best_seq = []
best_seq_score = []
for data, viterbi_list in zip(c, v):
    a, b = viterbi_backward(tags_set, data, viterbi_list)
    best_seq.append(a)
    best_seq_score.append(b)

flatten_prediction = [j for sub in best_seq for j in sub]
len(flatten_prediction)
dev_dataset["viterbi_prediction"] = flatten_prediction
dev_dataset[["word","index","viterbi_prediction"]].to_csv("dev_viterbi.out", sep="\t", header=False, index=False)

In [17]:
! python eval.py -p ./dev_viterbi.out -g ./data/dev

total: 131768, correct: 124924, accuracy: 94.81%


In [28]:
c = []
v = []
for sentence in test_sentences[1:]:
    a, b = viterbi_decoding(sentence)
    c.append(a)
    v.append(b)

best_seq = []
best_seq_score = []
for data, viterbi_list in zip(c, v):
    a, b = viterbi_backward(tags_set, data, viterbi_list)
    best_seq.append(a)
    best_seq_score.append(b)

flatten_prediction = [j for sub in best_seq for j in sub]
test_dataset["viterbi_prediction"] = flatten_prediction
test_dataset[["word","index","viterbi_prediction"]].to_csv("viterbi.out", sep="\t", header=False, index=False)

In [None]:
# ! python eval.py -p ./viterbi.out -g ./data/dev