In [3]:
import pandas as pd
import ast
from collections import Counter
import math
import re

In [870]:
# Training split, decoded as latin-1. Judging by how the tagger indexes it,
# the first column is the sentence and the third the label sequence.
train_data = pd.read_csv(
    './data_release/train.csv',
    encoding='latin-1',
)
train_data

Unnamed: 0,sentence,pos_seq,label_seq
0,Ca n't fail to be entertaining .,"['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...","[0, 0, 0, 0, 0, 0, 0]"
1,How much was he going to tell her ?,"['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Up until that news hit the Committee , Don had...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Could go on to the rugby and go with them coul...,"['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Finally , we went to the office and they gave ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
...,...,...,...
6318,"In a voice of soft persuasion , she said , Wil...","['ADP', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', '...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6319,It is a symptom of public anxiety about urban ...,"['PRON', 'VERB', 'DET', 'NOUN', 'ADP', 'ADJ', ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
6320,I do n't like Miss Fitch .,"['PRON', 'VERB', 'ADV', 'VERB', 'PROPN', 'PROP...","[0, 0, 0, 0, 0, 0, 0]"
6321,"A fern-like plant , beautifully preserved in a...","['DET', 'ADJ', 'NOUN', 'PUNCT', 'ADV', 'VERB',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# HMM Model

In [553]:
# unused code, probably OK to get rid of at some point

# def preprocess(sentence):
#     word_pattern = re.compile("(\w+|<s> |[,.!?;\(\)])")
#     return word_pattern.findall(sentence)

#     def prob_tagged_sequence(self, sequence):
#         '''
#         sequence: tuple where first element is sentence as a string, second element is a 
#             list of tags associated with each word of sentence 
            
#         tags are binary (0 or 1)
#         '''
#         sentence = sequence[0].lower().split()
#         tags = sequence[1]
        
#         #initialize log_prob_acc with initial probability
#         initial_transition_prob = self.tag_bigrams[('<START>', tags[0])] / self.tag_counts['<START>']
#         if sentence[0] in self.emissions:
#             initial_emission_prob = self.emissions[sentence[0]].get(tags[0], 1) / self.tag_counts[tags[0]]
#         else:
#             initial_emission_prob = 1 / self.tag_counts[tags[0]]
#         log_prob_acc = math.log(initial_transition_prob) + math.log(initial_emission_prob)
        
#         #sum log transition and emission probabilities
#         for t in range(1, len(tags)):
#             tag_bigram = (tags[t-1], tags[t])
#             transition_prob = self.tag_bigrams[tag_bigram] / self.tag_counts[tags[t-1]]
#             if sentence[t] in self.emissions:
#                 emission_prob = self.emissions[sentence[t]].get(tags[t], 1) / self.tag_counts[tags[t]]
#             else:
#                 emission_prob = 1 / self.tag_counts[tags[t]]
#             log_prob_acc = math.log(transition_prob) + math.log(emission_prob)
        
#         return math.exp(log_prob_acc)

In [858]:
class HMM_Metaphor_Tagger():
    '''
    Bigram hidden Markov model that assigns one tag to every token of a sentence.

    All counts are collected once in __init__; viterbi() then decodes the
    highest-scoring tag sequence for a new sentence.

    Attributes:
        tag_counts: Counter of how often each tag occurs, including the artificial
            '<START>' tag that is prepended to every training sequence
        tag_bigrams: dict mapping (previous_tag, tag) to the number of times it was seen
        emissions: dict mapping a lower-cased token to {tag: count}
        k: pseudo-count substituted for any count that was never observed
        weights: dict mapping (previous_tag, tag) to the multiplier applied to that
            transition's log-probability while decoding
    '''

    def __init__(self, training_data, k=0.5, weights={(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}):
        '''
        training_data: DataFrame whose first column holds the sentence (tokens separated
            by spaces) and whose third column holds the string form of the tag list,
            e.g. "[0, 0, 1]"
        k: pseudo-count used in place of a count that was never observed
        weights: multiplier for the log-probability of each (previous_tag, tag) transition;
            a pair missing from the dict is treated as weight 1
        '''
        self.tag_counts = Counter()
        self.tag_bigrams = {}
        self.emissions = {}

        #pseudo-count used wherever a count was never observed
        self.k = k

        #weights for transition log-probabilities (applied in viterbi);
        # copied so the shared default dict can never be changed through an instance
        self.weights = dict(weights)

        #select the columns by position with .iloc; integer keys on a label-indexed
        # row Series (row[1][0]) rely on a deprecated pandas fallback
        sentences = training_data.iloc[:, 0]
        tag_strings = training_data.iloc[:, 2]

        for sentence_text, tags_string in zip(sentences, tag_strings):

            #preprocess: add start symbols for computing initial probabilities,
            # convert the tag string to a list and downcase the sentence
            tags = ['<START>'] + list(ast.literal_eval(tags_string))
            tokens = ['<s>'] + sentence_text.lower().split()

            #tag bigram counts -- (0,0), (0,1), (1,0), (1,1), ('<START>',0), ('<START>',1)
            for tag_bigram in zip(tags, tags[1:]):
                self.tag_bigrams[tag_bigram] = self.tag_bigrams.get(tag_bigram, 0) + 1

            #individual tag counts
            self.tag_counts.update(tags)

            #emission counts; tags[i] raises IndexError if a row has more tokens than tags
            for i, word in enumerate(tokens):
                word_counts = self.emissions.setdefault(word, {})
                word_counts[tags[i]] = word_counts.get(tags[i], 0) + 1


    def _transition_prob(self, previous_tag, tag):
        '''Count of (previous_tag, tag), or k if never seen, over the count of previous_tag.'''
        return self.tag_bigrams.get((previous_tag, tag), self.k) / self.tag_counts[previous_tag]


    def _emission_prob(self, word, tag):
        '''Count of word emitted under tag, or k if never seen, over the count of tag.'''
        return self.emissions.get(word, {}).get(tag, self.k) / self.tag_counts[tag]


    def viterbi(self, sentence):
        '''
        sentence: string where each token (punctuation included) is separated by a space

        returns: list with one tag per token, the highest-scoring tag sequence;
            an empty list if the sentence contains no tokens

        raises: ValueError if the model saw no tags during training, or if k == 0
            and an unseen count makes a probability zero (log of zero)
        '''
        tokens = sentence.lower().split()
        if not tokens:
            return []

        #real tags only, in first-seen order; '<START>' never labels a token
        states = [tag for tag in self.tag_counts if tag != '<START>']
        if not states:
            raise ValueError('cannot decode: the model has no tags (empty training data?)')
        state_ids = range(len(states))

        #initialization: transition out of '<START>' (unweighted) plus first emission
        scores = [
            math.log(self._transition_prob('<START>', tag)) + math.log(self._emission_prob(tokens[0], tag))
            for tag in states
        ]

        #iteration: backpointers[n][c] is the index of the best previous state
        # for state c at token n+1
        backpointers = []
        for word in tokens[1:]:
            next_scores = []
            word_backpointers = []

            for current in states:
                emission_log = math.log(self._emission_prob(word, current))
                candidates = [
                    scores[p]
                    + self.weights.get((previous, current), 1) * math.log(self._transition_prob(previous, current))
                    + emission_log
                    for p, previous in enumerate(states)
                ]
                #max() keeps the first index on ties
                best_previous = max(state_ids, key=candidates.__getitem__)
                next_scores.append(candidates[best_previous])
                word_backpointers.append(best_previous)

            scores = next_scores
            backpointers.append(word_backpointers)

        #backtracking: start from the best FINAL state and follow each token's
        # backpointer for the state just chosen
        best = max(state_ids, key=scores.__getitem__)
        path = [states[best]]
        for word_backpointers in reversed(backpointers):
            best = word_backpointers[best]
            path.append(states[best])

        path.reverse()
        return path
            


In [859]:
# Validation split, decoded as latin-1 like the training file.
val_data = pd.read_csv(
    './data_release/val.csv',
    encoding='latin-1',
)

In [860]:
def validate(model, val_data):
    '''
    Tag every sentence of val_data and flatten the predictions into one table.

    model: object exposing viterbi(sentence) that returns a list of tags
    val_data: DataFrame whose first column holds the sentences

    returns: DataFrame with columns 'idx' (running token index, starting at 0)
        and 'label' (the predicted tag for that token)
    '''
    labels = []
    #.iloc picks the first column by position; row[1][0] on a label-indexed row
    # relied on pandas' deprecated positional fallback for integer keys
    for sentence in val_data.iloc[:, 0]:
        labels.extend(model.viterbi(sentence))
    ids = list(range(len(labels)))
    return pd.DataFrame({'idx': ids, 'label': labels}, columns=['idx', 'label'])

In [861]:
# Fit the tagger with its default k and transition weights.
model = HMM_Metaphor_Tagger(training_data=train_data)

In [862]:
# print(model.viterbi('he continued , hackles rising . he is drowning in debt .'))

In [863]:
#1: k = 0.25, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, comparable accuracy, comparable F1
#2: k = 0.5, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, treating as baseline
#3: k = 1, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1
#4: k = 2, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1

#5: k = 10, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1
#6: k = 100, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1
#7: k = 1000, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1

#max F1 at k = 0.5

#8: k = 0.5, weights = {(0,0): 1, (0,1): 2, (1,0): 2, (1,1): 4}, comparable accuracy, worse F1
#9: k = 0.5, weights = {(0,0): 4, (0,1): 2, (1,0): 2, (1,1): 1}, comparable accuracy, worse F1
#10: k = 0.5, weights = {(0,0): 2, (0,1): 1, (1,0): 1, (1,1): 1}, slightly worse accuracy, slightly better F1
#11: k = 0.5, weights = {(0,0): 4, (0,1): 1, (1,0): 1, (1,1): 1}, slightly worse accuracy, slightly better F1
#12: k = 0.5, weights = {(0,0): 10, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1

#13: k = 0.5, weights = {(0,0): 5, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, highest F1

#14: k = 0.5, weights = {(0,0): 6, (0,1): 1, (1,0): 1, (1,1): 1}, worse accuracy, worse F1
#15: k = 0.5, weights = {(0,0): 5, (0,1): 2, (1,0): 2, (1,1): 1}, comparable accuracy, worse F1
#16: k = 0.5, weights = {(0,0): 5, (0,1): 0.5, (1,0): 0.5, (1,1): 0.5}, comparable accuracy, worse F1

#max F1 at k = 0.5 and weights = {(0,0): 5, (0,1): 1, (1,0): 1, (1,1): 1} (model 13)

#analysis: the max-F1 parameters above (model 13) give only a very slightly higher F1 score than
# the baseline, and they do so with low precision and high recall. I therefore believe the
# baseline (model 2, AKA unweighted) is in fact the most robust model: its precision, recall and
# F1 are all roughly equal, and it has the highest accuracy across all tests

#final model parameters: k = 0.5, weights = {(0,0): 1, (0,1): 1, (1,0): 1, (1,1): 1} (model 2)

# Predict a tag for every validation token and write the idx/label table to disk.
df = validate(model=model, val_data=val_data)
df.to_csv(
    'model_1_validation_2.csv',
    index=False,
)

In [864]:
# Unlabelled test split, decoded as latin-1 like the other files.
test_data = pd.read_csv(
    './data_release/test_no_label.csv',
    encoding='latin-1',
)

In [868]:
def test(model, test_data):
    labels = []
    for row in test_data.iterrows():
        sentence = row[1][0]
        labels += model.viterbi(sentence)
    ids = [i for i in range(1, len(labels)+1)]
    df = pd.DataFrame({'idx': ids, 'label': labels}, columns = ['idx', 'label'])
    return df

In [869]:
# Predict a tag for every test token and write the idx/label table to disk.
df = test(model=model, test_data=test_data)
df.to_csv(
    'model_1_test.csv',
    index=False,
)