In [1]:
import numpy as np
import pandas as pd
import nltk
import random
import csv
import pickle
from collections import Counter

from nltk.data import load
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the NLTK data packages used by this notebook. The 'tagsets' package is
# presumably what supplies the help/tagsets/upenn_tagset.pickle resource loaded
# in the next cell -- TODO confirm. Nothing visible here reads 'treebank'.
nltk.download('treebank')
nltk.download('tagsets')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\jaehw\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\jaehw\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [3]:
# Load the Penn Treebank tagset help table through nltk.data.load; only its
# keys (the tag names) are used by the rest of the notebook.
tagdict = load('help/tagsets/upenn_tagset.pickle')

In [4]:
# Tag inventory: every tag name from the tagset table, followed by a catch-all
# "UNK" entry in the last position.
TAGS = [*tagdict.keys(), "UNK"]

In [5]:
# Size of the tag inventory, including the appended "UNK" entry.
len(TAGS)

46

In [6]:
def readFile(fileName):
    """Read a tab-separated corpus file from ``data/`` into a list of sentences.

    Every non-blank line is read as ``index<TAB>word`` when ``fileName`` is
    ``"test"`` and as ``index<TAB>word<TAB>tag`` otherwise. A blank line ends
    the current sentence.

    Parameters
    ----------
    fileName : str
        Name of the file inside the ``data/`` directory. The literal name
        ``"test"`` selects the untagged format.

    Returns
    -------
    list[list[tuple]]
        One list per sentence. Tokens are ``(word,)`` for ``"test"`` and
        ``(word, tag)`` otherwise. For tagged files, a sentence whose last tag
        is not ``"."`` gets a synthetic ``("", ".")`` token appended.

    Notes
    -----
    * The final sentence is returned even when the file does not end with a
      blank line.
    * Runs of blank lines do not produce empty sentences.
    * Quote characters are treated as ordinary text (``csv.QUOTE_NONE``), so a
      token such as ``"`` is read literally instead of opening a quoted field.
    """
    isTest = fileName == "test"
    finalList = []
    temp = []

    def closeSentence(tokens):
        # Nothing collected since the last separator: no sentence to emit.
        if not tokens:
            return
        if not isTest and tokens[-1][1] != ".":
            # Missing period at the end of sentence, add it in
            tokens.append(("", "."))
        finalList.append(tokens)

    # newline="" lets the csv module handle line endings itself.
    with open("data/" + fileName, newline="") as tsv:
        for line in csv.reader(tsv, dialect="excel-tab", quoting=csv.QUOTE_NONE):
            if len(line) == 0:  # blank line: sentence boundary
                closeSentence(temp)
                temp = []
            elif isTest:
                temp.append((line[1],))
            else:
                # Train or Dev
                temp.append((line[1], line[2]))

    # Flush a trailing sentence that is not followed by a blank line.
    closeSentence(temp)
    return finalList

In [7]:
# Parse the three corpus splits from data/, in the order train, test, dev.
trainList, testList, devList = [readFile(split) for split in ("train", "test", "dev")]

In [8]:
def combineSentences(l):
    """Flatten a list of sentences into one list of tokens, preserving order.

    Parameters
    ----------
    l : iterable of iterables
        Sentences, each an iterable of tokens.

    Returns
    -------
    list
        All tokens of all sentences, in their original order.
    """
    return [token for sentence in l for token in sentence]

In [9]:
# Flatten the dev sentences into a single list of (word, tag) tokens.
combinedDevData = combineSentences(devList)

In [10]:
# Split the flattened dev tokens into parallel tag and word lists.
sentence_dev_tags = [dev_tag for _dev_word, dev_tag in combinedDevData]
sentence_dev_words = [dev_word for dev_word, _dev_tag in combinedDevData]

In [11]:
# Flatten the training sentences, then split the tokens into parallel lists.
combinedData = combineSentences(trainList)

TRAINING_TAGS = [train_tag for _train_word, train_tag in combinedData]
TRAINING_WORDS = [train_word for train_word, _train_tag in combinedData]

In [12]:
# Relabel every training tag that is not in the tag inventory as "UNK"
# (TRAINING_TAGS is rewritten in place; combinedData keeps the original tags).
known_tags = set(TAGS)  # built once so each membership test is O(1)
unknown_count = 0
for ind, tag in enumerate(TRAINING_TAGS):
    if tag not in known_tags:
        unknown_count += 1
        TRAINING_TAGS[ind] = "UNK"
print("Unknown Counts", unknown_count, "out of", len(TRAINING_TAGS))

Unknown Counts 2753 out of 915031


In [13]:
# Frequency of every word form in the training data.
counter = Counter(TRAINING_WORDS)

In [14]:
# Words whose training frequency does not exceed the threshold.
threshold = 1
remove_set = {word for word, freq in counter.items() if freq <= threshold}

In [15]:
# Number of word forms at or below the frequency threshold.
print(len(remove_set))

20010


In [16]:
# Transition matrix:
#   tags_transition_prob[x][y] = count(TAGS[x] immediately followed by TAGS[y])
#                                / count(TAGS[x])
# Rows for tags that never occur in training stay all zero.
#
# Both counts are gathered in one pass over the corpus instead of rescanning it
# for every (tag_1, tag_2) pair. TRAINING_TAGS is the flattened corpus, so pairs
# that straddle a sentence boundary are counted as well.
tag_counts = Counter(TRAINING_TAGS)
bigram_counts = Counter(zip(TRAINING_TAGS, TRAINING_TAGS[1:]))

tags_transition_prob = [[0 for x in range(len(TAGS))] for y in range(len(TAGS))]

for x, tag_1 in enumerate(TAGS):
    tag_1_count = tag_counts[tag_1]
    if tag_1_count == 0:
        continue  # unseen tag: leave the row at 0
    for y, tag_2 in enumerate(TAGS):
        # Given tag_1, what is tag_2 (prev to curr)
        tags_transition_prob[x][y] = bigram_counts[(tag_1, tag_2)] / tag_1_count

# Persist the matrix; the context manager closes the file even if dump fails.
with open("tags_transition_prob_6.obj", "wb") as filehandler:
    pickle.dump(tags_transition_prob, filehandler)

In [17]:
# Write the transition matrix to disk; the context manager guarantees the file
# is closed even if pickling raises.
with open("tags_transition_prob_6.obj", "wb") as filehandler:
    pickle.dump(tags_transition_prob, filehandler)

In [18]:
filtered_tags = TRAINING_TAGS
tag_count = len(filtered_tags)

# Occurrences of each tag, counted in a single pass.
dictCount = dict(Counter(filtered_tags))

# Group the (word, tag) tokens by tag in one pass over the corpus. Every tag in
# TAGS gets an entry, possibly empty; tokens whose tag is not in TAGS are
# skipped. The tags compared here are the ones stored in combinedData.
filtered_tags_dict = {tag: [] for tag in TAGS}
for t in combinedData:
    bucket = filtered_tags_dict.get(t[1])
    if bucket is not None:
        bucket.append(t)

In [19]:
# NOTE(review): disabled earlier variant, kept for reference only. It keys the
# emission probabilities by (tag, word) and reads the word from index 0 of each
# item in filtered_tags_dict[tag].
# emission_dict = {}

# for tag in TAGS:
#     filtered_tags_raw = filtered_tags_dict[tag]
#     filtered_tags = [t[0] for t in filtered_tags_raw]
#     tag_count = len(filtered_tags)
    
#     tag_counter = Counter(filtered_tags)
#     for word,word_count in tag_counter.items():
#         emission_prob = 0
#         if tag_count != 0:
#             emission_prob = word_count/tag_count
#         emission_dict[(tag, word)] = emission_prob

In [20]:
# NOTE(review): disabled; would have pickled emission_dict to emission_prob_2.obj.
# filehandler = open("emission_prob_2.obj","wb")
# pickle.dump(emission_dict, filehandler)
# filehandler.close()

In [21]:
# For every tag, the training words carrying that tag (in corpus order).
# Built in one pass over the parallel word/tag lists rather than one full scan
# per tag. Every tag in TAGS gets an entry, possibly empty.
filtered_tags_dict = {tag: [] for tag in TAGS}
for word, t in zip(TRAINING_WORDS, TRAINING_TAGS):
    if t in filtered_tags_dict:
        filtered_tags_dict[t].append(word)

In [22]:
# Emission table: emission_dict[word][i] = count(word tagged TAGS[i]) / count(TAGS[i]),
# or 0 when TAGS[i] never occurs in training.
#
# The per-tag word frequencies are counted once up front, so each table entry
# is a dictionary lookup instead of a rescan of that tag's whole word list.
tag_totals = [len(filtered_tags_dict[tag]) for tag in TAGS]
tag_word_counts = [Counter(filtered_tags_dict[tag]) for tag in TAGS]

emission_dict = {}

for ind, word in enumerate(list(set(TRAINING_WORDS))):
    emission_dict[word] = [
        counts[word] / total if total != 0 else 0
        for counts, total in zip(tag_word_counts, tag_totals)
    ]
    # Progress marker every 1000 words.
    if ind % 1000 == 0:
        print(ind)

# Persist the table; the context manager closes the file even if dump fails.
with open("emission_prob_6.obj", "wb") as filehandler:
    pickle.dump(emission_dict, filehandler)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000


In [23]:
# How often each tag opens a training sentence. Despite its name, initial_prob
# holds raw counts; the decoders divide by initial_total to get a probability.
# Empty sentences are skipped so they cannot raise IndexError or inflate the
# denominator.
first_tags = [sent[0][1] for sent in trainList if sent]
initial_prob = dict(Counter(first_tags))
initial_total = len(first_tags)

In [24]:
# Flatten the dev set again and split it into parallel word and tag lists.
sentence = combinedDevData = combineSentences(devList)
sentence_words = [dev_word for dev_word, _dev_tag in sentence]
sentence_tags = [dev_tag for _dev_word, dev_tag in sentence]

In [25]:
# Training vocabulary: the distinct word forms of the flattened training data.
trainData = combinedData
words = [train_word for train_word, _train_tag in trainData]
unique_words = [*set(words)]

In [26]:
# Share of training tokens labelled "UNK".
default = TRAINING_TAGS.count('UNK') / len(TRAINING_TAGS)

In [27]:
# Reload the tables written by the cells above. The file names match the ones
# this notebook actually writes ("..._6.obj"), so a fresh Restart & Run All
# never picks up stale artifacts left over from an earlier run.
# SECURITY: pickle.load executes arbitrary code from the file; only load files
# produced by this notebook.
with open("emission_prob_6.obj", "rb") as filehandler:
    emission_dict = pickle.load(filehandler)

with open("tags_transition_prob_6.obj", "rb") as filehandler:
    tags_transition_prob = pickle.load(filehandler)

In [28]:
# Preview only the first two parsed test sentences (tokens are 1-tuples holding
# just the word); displaying the whole list floods the notebook output.
testList[:2]

[[('Influential',),
  ('members',),
  ('of',),
  ('the',),
  ('House',),
  ('Ways',),
  ('and',),
  ('Means',),
  ('Committee',),
  ('introduced',),
  ('legislation',),
  ('that',),
  ('would',),
  ('restrict',),
  ('how',),
  ('the',),
  ('new',),
  ('savings-and-loan',),
  ('bailout',),
  ('agency',),
  ('can',),
  ('raise',),
  ('capital',),
  (',',),
  ('creating',),
  ('another',),
  ('potential',),
  ('obstacle',),
  ('to',),
  ('the',),
  ('government',),
  ("'s",),
  ('sale',),
  ('of',),
  ('sick',),
  ('thrifts',),
  ('.',)],
 [('The',),
  ('bill',),
  (',',),
  ('whose',),
  ('backers',),
  ('include',),
  ('Chairman',),
  ('Dan',),
  ('Rostenkowski',),
  ('(',),
  ('D.',),
  (',',),
  ('Ill.',),
  (')',),
  (',',),
  ('would',),
  ('prevent',),
  ('the',),
  ('Resolution',),
  ('Trust',),
  ('Corp.',),
  ('from',),
  ('raising',),
  ('temporary',),
  ('working',),
  ('capital',),
  ('by',),
  ('having',),
  ('an',),
  ('RTC-owned',),
  ('bank',),
  ('or',),
  ('thrift',),
 

### Greedy Decoding

In [37]:
final_greedy_list = []

for s in devList:
    sentence_words = [t[0] for t in s]
    
    period_ind = TAGS.index(".")
    greedy_predicted_tags = []
    prev_ind = period_ind
    
    for word_ind, word in enumerate(sentence_words):
        final_prob = 0
        final_tag = ""
        final_tag_ind = -1
        for tag_ind, tag in enumerate(TAGS): # s2
            # How many tags go to the next tag
            transition_prob = 0
            if word_ind == 0:            
                if tag in initial_prob:
                    transition_prob = initial_prob[tag]/initial_total
            else:
                transition_prob = tags_transition_prob[prev_ind][tag_ind]
                
            # What tag is to what word
            emission_prob = 0.003008641237291414
            if word in emission_dict:
                emission_prob = emission_dict[word][tag_ind]

            curr_final_prob = transition_prob * emission_prob

            if final_prob < curr_final_prob:
                final_prob = curr_final_prob
                final_tag = tag
                final_tag_ind = tag_ind

        greedy_predicted_tags.append(final_tag)
        prev_ind = final_tag_ind
        
        final_greedy_list.append(str(word_ind+1)+"\t"+word+"\t"+final_tag)
    final_greedy_list.append("")

In [38]:
# Write the greedy predictions, one entry of final_greedy_list per line.
with open("greedy.out", "w") as outfile:
    outfile.writelines(line + "\n" for line in final_greedy_list)

### Viterbi

In [31]:
# Share of training tokens labelled "UNK"; shown for reference.
default = sum(1 for t in TRAINING_TAGS if t == 'UNK') / len(TRAINING_TAGS)
print(default)

0.003008641237291414


In [39]:
final_hmm_list = []

for s in devList:
    sentence_words = [t[0] for t in s]

    final_prob_matrix = [[0 for _ in range(len(TAGS))] for _ in range(len(sentence_words))]
    tag_ind_matrix = [[0 for _ in range(len(TAGS))] for _ in range(len(sentence_words)-1)]

    # just do initial state here
    period_ind = TAGS.index(".")
    for tag_ind, tag in enumerate(TAGS):
    #     transition_prob = tags_transition_prob[period_ind][tag_ind]
        word = sentence_words[0]
        
        transition_prob = 0
        if tag in initial_prob:
            transition_prob = initial_prob[tag]/initial_total
            
        emission_prob = 0.003008641237291414
        if word in emission_dict:
            emission_prob = emission_dict[word][tag_ind]

        final_prob_matrix[0][tag_ind] = emission_prob * transition_prob

    # Prob for not initial state
    for word_ind in range(1, len(sentence_words)):
        word = sentence_words[word_ind]
        for tag_ind, curr_tag in enumerate(TAGS): # s2 
            max_transition_prob = 0
            max_transition_ind = 45
            
            # How many words are in a certain tag
            emission_prob = 0.003008641237291414
            if word in emission_dict:
                emission_prob = emission_dict[word][tag_ind]

            for prev_ind, prev_tag in enumerate(TAGS):
                # How many tags go to the next tag
                transition_prob = tags_transition_prob[prev_ind][tag_ind]
                prev_prob = final_prob_matrix[word_ind-1][prev_ind]

                temp_prob = prev_prob * transition_prob
                if max_transition_prob < temp_prob:
                    max_transition_prob = temp_prob
                    max_transition_ind = prev_ind

            final_prob_matrix[word_ind][tag_ind] = emission_prob * max_transition_prob
    #         print(word_ind, tag_ind, emission_prob, max_transition_prob)
            tag_ind_matrix[word_ind-1][tag_ind] = max_transition_ind
    

    viterbi_predicted_tags = []

    max_value = -1
    max_ind = 45
    for ind, value in enumerate(final_prob_matrix[-1]):
        if value > max_value:
            max_value = value
            max_ind = ind
    viterbi_predicted_tags.append(TAGS[max_ind])

    prev_index = max_ind
    for n in range(len(sentence_words)-2, -1, -1):
        index = tag_ind_matrix[n][prev_index]
        viterbi_predicted_tags.append(TAGS[index])
        prev_index = index

    viterbi_predicted_tags.reverse()

    for word_ind, word in enumerate(sentence_words):
        final_tag = viterbi_predicted_tags[word_ind]
        final_hmm_list.append(str(word_ind+1)+"\t"+word+"\t"+final_tag)
    final_hmm_list.append("")

In [40]:
# Write the Viterbi predictions, one entry of final_hmm_list per line.
with open("viterbi.out", "w") as outfile:
    outfile.writelines(line + "\n" for line in final_hmm_list)

In [34]:
# Tag frequencies over the training data. NOTE: this rebinds `counter`, which
# earlier in the notebook held the word frequencies.
counter = Counter(TRAINING_TAGS)

In [44]:
# Work on a copy without "UNK". Aliasing TAGS here would let the sort/insert
# steps that follow reorder TAGS itself, which would break the index alignment
# between TAGS and the rows/columns of the transition and emission tables.
sorted_tags = [tag for tag in TAGS if tag != "UNK"]

In [45]:
# Sort the tag names in place (plain string ordering).
sorted_tags.sort()

In [50]:
# Put "UNK" in front of the sorted tags. Guarded so that re-running the cell
# does not insert a second copy.
if sorted_tags[:1] != ["UNK"]:
    sorted_tags.insert(0, "UNK")

In [52]:
# One "rank<TAB>tag<TAB>training count" line per tag, ranks starting at 1.
vocab_tags = [
    f"{rank}\t{tag_name}\t{counter[tag_name]}"
    for rank, tag_name in enumerate(sorted_tags, start=1)
]

In [54]:
# Write the tag table, one entry of vocab_tags per line.
with open("vocab.txt", "w") as outfile:
    outfile.writelines(line + "\n" for line in vocab_tags)