In [1]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple, OrderedDict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import os
from io import BytesIO
from itertools import chain
import random

In [3]:
Test_Sentence = namedtuple("Sentence", "words")

def getMeTestSentences(data):
    sentences = []
    for key in data:
        sentence = []
        for val in zip(data[key].words):
            sentence.append(val)
        sentences.append(sentence)
    return sentences

def read_test_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1
        a = OrderedDict()
        for s in sentence_lines:
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])   
            temp2 = []
            for val in temp:
                if len(val) == 1:
                    temp2.append(val[0])
                          
            a[index] = Test_Sentence(tuple(temp2))
            index += 1
        return a
        
class TestDataset(namedtuple("_TDataset", "sentences keys vocab X N")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        sentences = read_test_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences,N)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [5]:
Sentence = namedtuple("Sentence", "words tags")

def read_data(filename):
    """Read tagged sentence data"""
    with open("S21-gene-train.txt", 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1
        a = OrderedDict()
        for s in sentence_lines:
            # print(s)
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])
            
            temp2 = []
            temp3 = []
            for val in temp:
                if len(val) == 2:
                    temp2.append(val[0])
                    temp3.append(val[1])
                          
            a[index] = Sentence(tuple(temp2),tuple(temp3))
            index += 1
        return a

def read_tags(filename):
    """Read a list of word tag classes"""
    with open(filename, 'r') as f:
        tags = f.read().split("\n")
    return frozenset(tags)

class Subset(namedtuple("BaseSet", "sentences keys vocab X tagset Y N stream")):
    def __new__(cls, sentences, keys):
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        wordset = frozenset(chain(*word_sequences))
        tagset = frozenset(chain(*tag_sequences))
        N = sum(1 for _ in chain(*(sentences[k].words for k in keys)))
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, {k: sentences[k] for k in keys}, keys, wordset, word_sequences,
                               tagset, tag_sequences, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        # split data into train/test sets
        _keys = list(keys)
        if seed is not None: random.seed(seed)
        random.shuffle(_keys)
        split = int(train_test_split * len(_keys))
        training_data = Subset(sentences, _keys[:split])
        testing_data = Subset(sentences, _keys[split:])
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences, tagset,
                               tag_sequences, training_data, testing_data, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [6]:
data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

There are 13796 sentences in the corpus.
There are 11036 sentences in the training set.
There are 2760 sentences in the testing set.


In [7]:
key = 10
print("Sentence: {}".format(key))
print("words:\n\t{!s}".format(data.sentences[key].words))
print("tags:\n\t{!s}".format(data.sentences[key].tags))

Sentence: 10
words:
	('The', 'variable', 'HMG', 'dosage', 'regimen', 'was', 'found', 'to', 'offer', 'no', 'advantages', 'when', 'compared', 'with', 'our', 'standard', 'daily', 'dosage', 'regimen', '.')
tags:
	('O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O')


In [8]:
print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
print("There are {} samples of {} unique words in the testing set."
      .format(data.testing_set.N, len(data.testing_set.vocab)))
print("There are {} words in the test set that are missing in the training set."
      .format(len(data.testing_set.vocab - data.training_set.vocab)))

assert data.N == data.training_set.N + data.testing_set.N, \
       "The number of training + test samples should sum to the total number of samples"

There are a total of 386201 samples of 31328 unique words in the corpus.
There are 309830 samples of 27563 unique words in the training set.
There are 76371 samples of 11825 unique words in the testing set.
There are 3765 words in the test set that are missing in the training set.


In [9]:
# accessing words with Dataset.X and tags with Dataset.Y 
for i in range(2):    
    print("Sentence {}:".format(i + 1), data.X[i])
    print()
    print("Labels {}:".format(i + 1), data.Y[i])
    print()

Sentence 1: ('Comparison', 'with', 'alkaline', 'phosphatases', 'and', '5', '-', 'nucleotidase', '.')

Labels 1: ('O', 'O', 'B', 'I', 'O', 'B', 'I', 'I', 'O')

Sentence 2: ('Pharmacologic', 'aspects', 'of', 'neonatal', 'hyperbilirubinemia', '.')

Labels 2: ('O', 'O', 'O', 'O', 'O', 'O')



In [10]:
# use Dataset.stream() (word, tag) samples for the entire corpus
print("\nStream (word, tag) pairs:\n")
for i, pair in enumerate(data.stream()):
    print("\t", pair)
    if i > 3: break


Stream (word, tag) pairs:

	 ('Comparison', 'O')
	 ('with', 'O')
	 ('alkaline', 'B')
	 ('phosphatases', 'I')
	 ('and', 'O')


### Making Predictions with a Model

In [59]:
tags = (tag for i, (word, tag) in enumerate(data.training_set.stream()))
words = (word for i, (word, tag) in enumerate(data.training_set.stream()))

In [73]:
def replace_unknown(sequence):
    
    return [w if w in data.training_set.vocab else 'nan' for w in sequence]

def simplify_decoding(X, model):
    _, state_path = model.viterbi(replace_unknown(X))
    if not state_path:
        print('Hello',X,replace_unknown(X))
    return [state[1].name for state in state_path[1:-1]] 

In [62]:
tags = [tag for i, (word, tag) in enumerate(data.training_set.stream())]
dict_tag = {}

def unigram_counts(sequences):
    """Return a dictionary keyed to each unique value in the input sequence list that
    counts the number of occurrences of the value in the sequences list. The sequences
    collection should be a 2-dimensional array.
    
    For example, if the tag NOUN appears 275558 times over all the input sequences,
    then you should return a dictionary such that your_unigram_counts[NOUN] == 275558.
    """
    for tag in sequences:
        if tag in dict_tag.keys():
            dict_tag[tag] += 1
        else: dict_tag[tag] = 1
    return dict_tag


tag_unigrams = unigram_counts(tags)

In [63]:
tags = [tag for i, (word, tag) in enumerate(data.training_set.stream())]
sq = list(zip(tags[:-1],tags[1:]))
dict_sq = {}

def bigram_counts(sequences):
    """Return a dictionary keyed to each unique PAIR of values in the input sequences
    list that counts the number of occurrences of pair in the sequences list. The input
    should be a 2-dimensional array.
    
    For example, if the pair of tags (NOUN, VERB) appear 61582 times, then you should
    return a dictionary such that your_bigram_counts[(NOUN, VERB)] == 61582
    """

    for tag_pair in sequences:
        if tag_pair in dict_sq.keys():
            dict_sq[tag_pair] += 1
        else: dict_sq[tag_pair] = 1
    return dict_sq

tag_bigrams = bigram_counts(sq)



In [64]:
dict_starting = defaultdict(list)
def starting_counts(sequences):
    """Return a dictionary keyed to each unique value in the input sequences list
    that counts the number of occurrences where that value is at the beginning of
    a sequence.
    
    For example, if 8093 sequences start with NOUN, then you should return a
    dictionary such that your_starting_counts[NOUN] == 8093
    """
    for tag in data.training_set.tagset:
        dict_starting[tag] = len([seq[0] for seq in sequences if seq[0]==tag])
    return dict_starting

tag_starts = starting_counts(data.training_set.Y)

In [65]:
dict_ending = defaultdict(list)

def ending_counts(sequences):
    """Return a dictionary keyed to each unique value in the input sequences list
    that counts the number of occurrences where that value is at the end of
    a sequence.
    
    For example, if 18 sequences end with DET, then you should return a
    dictionary such that your_starting_counts[DET] == 18
    """
    for tag in data.training_set.tagset:
        dict_ending[tag] = len([seq[-1] for seq in sequences if seq[-1]==tag])
    return dict_ending

tag_ends = ending_counts(data.training_set.Y)

In [66]:
basic_model = HiddenMarkovModel(name="base-hmm-tagger")

count_tag_and_word = pair_counts(tags, words)
states = {}
for tag, word_dict in count_tag_and_word.items(): #data.training_set.tagset
    p_words_given_tag_state = defaultdict(float)
    # for each tag/word, calculate P(word|tag)
    for word in word_dict.keys(): # data.training_set.vocab
        p_words_given_tag_state[word] = count_tag_and_word[tag][word] / tag_unigrams[tag] 
    # create a new state for each tag from the dict of words that represents P(word|tag)
    emission = DiscreteDistribution(dict(p_words_given_tag_state))
    states[tag] = State(emission, name=tag)
    
basic_model.add_states(list(states.values()))

# Adding start states
for tag in data.training_set.tagset:
    state = states[tag]
    basic_model.add_transition(basic_model.start, state, tag_starts[tag]/len(data.training_set))

# Adding end states
for tag in data.training_set.tagset:
    state = states[tag]
    basic_model.add_transition(state, basic_model.end, tag_ends[tag]/tag_unigrams[tag])

# Adding pairs
for tag1 in data.training_set.tagset:
    state1 = states[tag1]
    for tag2 in data.training_set.tagset:
        if (tag1, tag2) in tag_bigrams:
            state2 = states[tag2]
            basic_model.add_transition(state1, state2, tag_bigrams[(tag1, tag2)]/tag_unigrams[tag1])

# finalize the model
basic_model.bake()

### Example Decoding Sequences with the HMM Tagger

In [67]:
for key in data.testing_set.keys[:2]:
    print("Sentence Key: {}\n".format(key))
    print("Predicted labels:\n-----------------")
    print(simplify_decoding(data.sentences[key].words, basic_model))
    print()
    print("Actual labels:\n--------------")
    print(data.sentences[key].tags)
    print("\n")

Sentence Key: 10080

Predicted labels:
-----------------
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Actual labels:
--------------
('O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O')


Sentence Key: 9941

Predicted labels:
-----------------
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Actual labels:
--------------
('B', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O',

In [None]:
def replace_unknown(sequence):
    
    return [w if w in data.training_set.vocab else 'nan' for w in sequence]

def simplify_decoding(X, model):
    _, state_path = model.viterbi(replace_unknown(X))
    if not state_path:
        print('Hello',X,replace_unknown(X))
    return [state[1].name for state in state_path[1:-1]] 

In [72]:
with open('yoursystemoutput.txt', 'w') as f:
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,simplify_decoding(data.sentences[key].words, basic_model))):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        f.write("\n")
        
with open('goldstandardfile.txt', 'w') as f:
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,data.testing_set.sentences[key].tags)):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        f.write("\n")

Hello ('Characterization', 'and', 'hormonal', 'regulation', 'of', 'the', 'promoter', 'of', 'the', 'rat', 'prostaglandin', 'endoperoxide', 'synthase', '2', 'gene', 'in', 'granulosa', 'cells', '.')


TypeError: 'NoneType' object is not subscriptable

In [25]:
test_data = TestDataset("tags-universal.txt", "S21-gene-test.txt")

In [28]:

with open('testFinal.txt', 'w') as f:
    k = 0
    for key in test_data.sentences:
        for i,val in enumerate(zip(test_data.sentences[key].words,simplify_decoding(test_data.sentences[key].words, basic_model))):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        f.write("\n")