In [26]:
import pandas as pd 
import numpy as np
import json

## Task 1: Vocabulary Creation

### Loading dataset

In [27]:
dev = pd.read_csv("dev", sep = "\t", index_col=False, header=None)
dev.columns = ["index", "word", "tag"]

In [28]:
train = pd.read_csv("train", sep = "\t", index_col=False, header=None)
train.columns = ["index", "word", "tag"]

In [29]:
test = pd.read_csv("test", sep = "\t", index_col=False, header=None)
test.columns = ["index", "word"]

### Creating a vocabulary

In [31]:
# count the unique vocab
countVoc = train['word'].value_counts().rename_axis('word').reset_index(name='counts')
# less freq part
lessfreq = countVoc.loc[countVoc['counts'] < 3] 

In [32]:
# set the word as '<unk>' if the frequence less than 3 
countVoc.loc[countVoc['counts'] < 3, 'word'] = '<unk>'

In [33]:
# get the summation counts where the words are the same,
# the only case where words are different is '<unk>'
countVoc = countVoc.groupby(['word']).sum('counts')

In [34]:
# sort thhe vocabulary by its count
countVoc = countVoc.sort_values(by=['counts'], ascending = False).reset_index()

In [35]:
# move 'unk' to the first line
unk_word = countVoc[countVoc['word']=='<unk>']
countVoc = countVoc[countVoc['word']!='<unk>'].reset_index()
vocab = pd.concat([unk_word, countVoc], ignore_index = True, axis = 0)
vocab = vocab.drop('index', axis=1)

In [36]:
vocab = pd.concat([unk_word, countVoc], ignore_index = True, axis = 0)
vocab = vocab.drop('index', axis=1)
vocab

Unnamed: 0,word,counts
0,<unk>,32537
1,",",46476
2,the,39533
3,.,37452
4,of,22104
...,...,...
16915,overreacting,3
16916,Schuster,3
16917,overriding,3
16918,oversaw,3


The selected threshold for unknown vocabulary is 3, so the size of my 
vocabulary is 13751, the occurrences of token '< unk >' after replacement are 42044.

In [11]:
#vocab .to_csv('/Users/juliachen/Desktop/CSCI 544/homework/hw3/vocab.txt',
#              header = False,index=True, sep='\t')

In [51]:
train

Unnamed: 0,index,word,tag
0,1,pierre,NNP
1,2,<unk>,<unk>
2,3,",",","
3,4,61,CD
4,5,years,NNS
...,...,...,...
912090,22,to,TO
912091,23,san,NNP
912092,24,francisco,NNP
912093,25,instead,RB


## Task 2: Model Learning

In [52]:
train2 = train
train2.loc[train2['word'].isin(lessfreq['word']), 'word'] = '<unk>'
train2.loc[(train2['word'] == '<unk>'), 'tag'] = '<unk>'

In [53]:
# to lower case
train2
train2['word'] = train2['word'].str.lower()

### Transition parameters for HMM

1. Find unique state as a dictionary
2. Creat empty dictionary for count(s->s')
3. For each sentence, go over all tags 
4. Calculate the transition by count(s->s')/count(s)

In [54]:
# calculate s
count_tags= train2['tag'].value_counts().rename_axis('tag').reset_index(name='count')
tags = count_tags.set_index('tag').to_dict()['count']

In [55]:
# calculate s->s'
tags_tran = {}

for key, val in tags.items():
    tags_tran[key] = dict()

In [56]:
# calculate next state s' for each s
tag_list = train2['tag'].to_list()

for i in range(len(tag_list)-1):
    s = tag_list[i]
    s_prime = tag_list[i+1]
    tags_tran[s][s_prime] = tags_tran[s].get(s_prime, 0) + 1

In [57]:
# transition 
transition = {}

for key, val in tags_tran.items():
    count = tags[key]
    for key2, val2 in val.items():
        total = val2
        transition[(key, key2)] = total / count

In [58]:
#data = dict((','.join(k), v) for k,v in transition.items())
#with open('/Users/juliachen/Desktop/CSCI 544/homework/hw3/transition.json', 'w') as fp:
#    json.dump(data, fp, indent = 2)

### Emission parameters for HMM

1. Find unique state as a dictionary
2. Creat empty dictionary for count(s->x)
3. For each sentence, go over all word
4. Calculate the transition by count(s->x)/count(s)

In [59]:
# calculate s->x
emm_tran = {}

for key, val in tags.items():
    emm_tran[key] = dict()

In [60]:
# calculate next state x for each s
word_list = train2['word'].to_list()
tag_list = train2['tag'].to_list() 

for i in range(len(word_list)):
    x = word_list[i]
    s = tag_list[i]
    emm_tran[s][x] = emm_tran[s].get(x, 0) + 1

In [61]:
# emission  
emission = {}

for key, val in emm_tran.items():
    count = tags[key]
    for key2, val2 in val.items():
        total = val2
        #print(key2, ":", total)
        emission[(key, key2)] = total / count

In [62]:
#data = dict((','.join(k), v) for k,v in emission.items())
#with open('/Users/juliachen/Desktop/CSCI 544/homework/hw3/emission.json', 'w') as fp:
#    json.dump(data, fp, indent = 2)

In [69]:
print("There are", len(transition), "transition parameters, and", len(emission), "emission parameters.")

There are 1445 transition parameters, and 21754 emission parameters.


There are 1451 transition parameters and 23340 emission parameters in my HMM. 

## Task 3: Greedy Decoding withHMM

In [221]:
with open('/Users/juliachen/Desktop/CSCI 544/dev', 'r') as fp:
    sentence = []
    dev_sentences = []
    for line in fp:
        line = line.replace('\n', '').split('\t')
        #print(len(line))
        if len(line) == 1:
            dev_sentences.append(sentence)
            sentence = []
        else:
            sentence.append(line)

In [222]:
len_dev = len(dev_sentences)

In [223]:
def get_transition(previous_tag):
    tags_list = {}
    for tag, val in transition.items():
        if tag[0] == previous_tag:
            #tags_list.append((tag[1], val))
            tags_list.update({tag[1]: val})
            
    return tags_list

# get_transition('NNP')

In [225]:
def get_emission(word):
    tags_list ={}
    if word in vocab['word'].to_list():
        for tag, val in emission.items():
            if tag[1] == word:
                tags_list.update({tag[0]: val})
    else: 
        for tag, val in emission.items():
            if tag[1] == '<unk>':
                tags_list.update({tag[0]: val})
                
    return tags_list
# get_emission('like')

In [226]:
firstSent = train2[train2['index']==1]
ts1 = dict(firstSent['tag'].value_counts())
for key, val in ts1.items():
    ts1[key] = val/len(firstSent)

In [227]:
states = np.unique(dev['tag'])

In [269]:
# calculate s*1 = arg max t(s1)e(x1|s1)
def s1(state):
    score = {}
    for tag, val in state.items():
        if tag in ts1:
            score[tag] = val*ts1[tag]
        else: 
            continue
    print(score)
    return(max(score, key = score.get))

# def s1(word):
#     # prob_w = get_emission(word)
#     bestScore = 0 
#     for tag, prob in word:
#         if tag in ts1:
#             score = ts1[tag] * prob
#         else: 
#             score = ts1['<unk>'] * prob
#         if score > bestScore:
#             bestScore = score
#             ps1 = tag
            
#     return ps1

In [267]:
s1({'DT': 22, 'NN': 11})

{'DT': 4.8198754513579996, 'NN': 0.36351980742058715}


'DT'

In [270]:
# s*2 = arg max t(s2|s*1)e(x2|s2)
def s2(word, state):
    score = {}
    for key in state:
        if key in word:
            score[key] = word[key]*state[key]
    return(max(score, key = score.get))

In [271]:
s2("like", {'DT': 22, 'NN': 11})

ValueError: max() arg is an empty sequence

In [272]:
def acc(pred, label):
    # assert(len(pred)==len(label))
    return sum(x[0]==x[1] for x in zip(pred, label))/len(pred)

In [273]:
def greed_model(sentence):
    labels = []
    predictions = []
    for word in sentence:
        index = word[0]
        token = word[1]
        tag = word[2]
        labels.append(tag)
        if index == '1':
            tag_by_word = get_emission(token)
            pred_tag = s1(tag_by_word)
            predictions.append(pred_tag)
            prev_tag = pred_tag
        else:
            try:
                tag_by_word = get_emission(token)
                tag_by_tag = get_transition(prev_tag)
                pred_tag = s2(tag_by_word, tag_by_tag)
                predictions.append(pred_tag)
                prev_tag = pred_tag
            except ValueError:
                pred_tag = max(tag_by_word, key = tag_by_word.get)
                predictions.append(pred_tag)
                prev_tag = pred_tag
    return labels, predictions

In [274]:
for sentence in dev_sentences:
    predictions, labels = greed_model(sentence)

{}


ValueError: max() arg is an empty sequence

In [211]:
acc(predictions,labels)

NameError: name 'predictions' is not defined

## -----------------------------------------------------------------------

### Transition parameters for HMM

1. Find unique state
2. Build empty matrix
3. Go over the index of each sentence, find count(s ->s')
4. Count the dominodator, count(s)

In [None]:
state = np.unique(train['tag'])

In [None]:
count_S = train['tag'].value_counts().rename_axis('word').reset_index(name='count')

In [None]:
trans_mat = np.zeros([len(state), len(state)])

In [None]:
last_index = train['index'][0]
last_tag = train['tag'][0]
for i,j in zip(train['index'][1:], train['tag'][1:]):  
    ni=np.where(state == last_tag)[0][0]
    nj=np.where(state == j)[0][0]
    if i == last_index + 1:
        trans_mat[ni,nj] += 1
    last_index = i
    last_tag = j

In [None]:
trans_mat

### Emission parameters for HMM

1. Find unique state and unique vocab 
2. Build empty matrix
3. Go over the index of each sentence, find count(s ->x)
4. Count the dominodator, count(s)

In [None]:
vocab = np.unique(train['word'])
vocab

In [None]:
emiss_mat = np.zeros([len(state), len(vocab)])

In [None]:
last_index = train['index'][0]
last_tag = train['tag'][0]
n_sentence = 0
for i,j,k in zip(train['index'][1:], train['tag'][1:], train['word'][1:]):
    ni=np.where(state == last_tag)[0][0]
    nj=np.where(vocab == k)[0][0]
    if i == last_index + 1:
        emiss_mat[ni,nj] += 1
    else:
        n_sentence += 1
        print(f'Complete one sentence {n_sentence}') 
    last_index = i
    last_tag = j

In [None]:
def greed_model(sentence):
    labels = []
    predictions = []
    # sentence_len = 0
    for item in sentence:
        sentence_len += 1
        idx = item[0]
        word = item[1]
        label_tag = item[2]
        labels.append(label_tag)

        if idx == '1':
            initial_word = word
            potential_words = get_emission(initial_word)
            ps1 = s1(potential_words)
            previous_tag = ps1
            predictions.append(ps1)
            continue

        potential_words2 = get_emission(word)
        potential_tags = get_transition(previous_tag)
        try: 
            print(word, tag)
            ps2 = s2(word,tag)
        except:
            ps2 = sorted(potential_words2, key = lambda x: x[1], reverse = True)[0][0]
        previous_tag = ps2
        predictions.append(ps2)
    return predictions, labels


In [None]:
greed_out(sentences)

In [None]:
def greed_out(sentence):
    labels = []
    predictions = []
    sentence_len = 0
    for i in sentence:
        sentence_len += 1
        idx = i[0]
        word = i[1]
        label_tag = i[2]
        labels.append(label_tag)
        
        if idx =='1':
            initial_word = word
            potential_words = get_potential_words(initial_word)
            ps1 = get_ps1(potential_words)
            previous_tag = ps1
            prediction.append(ps1)
            continue
            
        potential_words2 = get_potential_words(word)
        potential_tags = get_potential_tags(previous_tag)
        try: 
            ps2 = get_ps2(potential_words2, potential_tag)
        except:
            ps2 = sorted(potential_words2, key = lambda x: x[1], reverse = True)[0][0]
        previous_tag = ps2
        predictions.append(ps2)
    return predictions
    