# Sequence Labeling in python

### Initialise the transition, start and emission matrix . The states stand for high and low.  The HMM model is given in the assignment itself. 

In [1]:
import numpy as np

P= np.array([[0.6, 0.4],[0.5,0.5]])

S= np.array([0.5, 0.5])

O= np.array([[0.3,0.2,0.2,0.3],[0.2,0.3,0.3,0.2]])

state={}
state[0]='L'
state[1]='H'

DNA={}
DNA['A']=0
DNA['C']=1
DNA['G']=2
DNA['T']=3


### A stupid attempt to show you why the exhaustive search is a bad, bad option for HMM modelling. 

In [2]:
from itertools import product

import time 
def exhaustive_search(sequence):
    
    M= len(sequence)
    state_len= len(S)
    
    # track the best sequence and its score
    best=(None,float('-inf'))
    
    # basically loop will run for |states|^M 
    for ss in product(range(state_len),repeat=M):
        
        score= S[ss[0]]*O[ss[0],DNA[sequence[0]]]
        
        for i in range(1,M):
            score*= P[ss[i-1],ss[i]]*O[ss[i],DNA[sequence[i]]]
            
        
        #print(','.join([state[k] for k in ss]),score)
    
        if score > best[1]:
            best= (ss,score)
    
    return best


In [7]:
sequences=['GGC','GGCAAGATCAT','GAGAGGAGAGAGAGAGAGA']

import time
for sequence in sequences:
    
    t=time.time()
    best=exhaustive_search(sequence)
    t2=time.time()-t
    
    print('For the sequence '+ sequence+ ' of length '+ str(len(sequence))+' time taken was '+ str(round(t2,3))+'s' )
    print('The sequence '+ ','.join([state[k] for k in best[0]])+ ' gave the best score of '+ str(best[1]))
    print('\n')

For the sequence GGC of length 3 time taken was 0.105s
The sequence H,H,H gave the best score of 0.003375


For the sequence GGCAAGATCAT of length 11 time taken was 0.052s
The sequence H,H,H,L,L,L,L,L,L,L,L gave the best score of 1.377495072e-09


For the sequence GAGAGGAGAGAGAGAGAGA of length 19 time taken was 21.034s
The sequence H,L,L,L,H,H,L,L,L,L,L,L,L,L,L,L,L,L,L gave the best score of 1.3326697514e-16




# Dataset for this assignment: Brown corpus tagged with the Universal Tagset.

## This will be your training set. The remaining 100 sentences will be used as your test data.

In [61]:
from nltk.corpus import brown

corpus = brown.tagged_sents(tagset='universal')[:-100]

start_mat = {}
transmission_mat={}
emission_mat={}

for sent in corpus:
    if sent[0][1] not in start_mat:
        start_mat[sent[0][1]] = 0
    start_mat[sent[0][1]] += 1
    for i in range(len(sent)):
        elem = sent[i]
        w = elem[0].lower()
        tag= elem[1]

        if tag not in emission_mat:
            emission_mat[tag]= {w:1}
        elif w not in emission_mat[tag]:
            emission_mat[tag][w] = 1
        else:
            emission_mat[tag][w] += 1

        if i == len(sent)-1:
            next_state = 'stop'
        else:
            next_state = sent[i+1][1]
            
        if tag not in transmission_mat:
            transmission_mat[tag] = {next_state : 1}
        elif next_state not in transmission_mat[tag]:
            transmission_mat[tag][next_state] = 1
        else:
            transmission_mat[tag][next_state] += 1



In [75]:
start_mat

{'.': 0.08901118099231309,
 'ADJ': 0.03434661076170511,
 'ADP': 0.12283368273934313,
 'ADV': 0.09117749825296996,
 'CONJ': 0.049161425576519924,
 'DET': 0.2133997204751922,
 'NOUN': 0.14129979035639414,
 'NUM': 0.01678895877009085,
 'PRON': 0.15971348707197766,
 'PRT': 0.036652690426275336,
 'VERB': 0.04509084556254368,
 'X': 0.0005241090146750525}

In [63]:
total = sum(start_mat.values())            
for state in start_mat:
    start_mat[state] /= total
    
for state in emission_mat:
    total = sum(emission_mat[state].values())
    for w in emission_mat[state]:
        emission_mat[state][w] = emission_mat[state][w]/total
        
for state in transmission_mat:
    total = sum(transmission_mat[state].values())
    for next_state in transmission_mat[state]:
        transmission_mat[state][next_state] /= total
        
        
test_data= brown.tagged_sents(tagset='universal')[-100:]

print(len(test_data))

100


In [88]:
def update_viterbi(V,new_state,observation):
    best = (0,new_state)
    for old_state in V:
        if observation not in emission_mat[new_state]:
            if new_state not in transmission_mat[old_state]:
                prob = V[old_state]
            else:
                prob = V[old_state]*transmission_mat[old_state][new_state]
        else:
            if new_state not in transmission_mat[old_state]:
                prob = V[old_state]*emission_mat[new_state][observation]
            else:
                prob = V[old_state]*transmission_mat[old_state][new_state]*emission_mat[new_state][observation]
        if prob > best[0]:
            best = (prob,old_state)
    return best            

In [84]:
def tag_sent(sent):
    tokens = [i.lower() for i in sent.split()]
    
    # initialize
    V = dict()
    B = list()
    #B.append({})
    for state in emission_mat:
        try:
            V[state] = (start_mat[state]*emission_mat[state][tokens[0]])
        except:
            V[state] = start_mat[state]
        #B[0][state] = 'start'
    
    # recurse
    for token in tokens[1:]:
        V_updated = dict()
        B.append({})
        for state in emission_mat:
            V_updated[state], B[-1][state] = update_viterbi(V,state,token)
        V = V_updated
    
    # terminate
    best = (0,'.')
    for state in V:
        prob = V[state]*transmission_mat[state]['stop']
        if prob>best[0]:
            best = (prob,state)
    
    # back-track
    current_state = best[1]
    pos = [current_state]
    B.reverse()
    for b in B:
        current_state = b[current_state]
        pos.append(current_state)
    pos.reverse()
    
    pos_seq = list(zip(tokens,pos))
    return pos_seq

In [73]:
def test():
    correct_tags = 0
    total_tags = 0
    diff_tokens = 0
    for sent in test_data:
        actual_sent = ' '.join([t[0] for t in sent])
        #print(actual_sent)
        pos_seq = tag_sent(actual_sent)
        #print(pos_seq)
        for model_tag, actual_tag in zip(pos_seq,sent):
            #print(model_tag, actual_tag)
            if(model_tag[0] == actual_tag[0].lower()):
                if (model_tag[1]==actual_tag[1]):
                    correct_tags += 1
            else:
                diff_tokens += 1
        total_tags += len(sent)
    acc = correct_tags/total_tags
    return acc,diff_tokens

In [89]:
print(test())

(0.054598908021839566, 0)


In [72]:
tag_sent('From what I was able to gauge in a swift , greedy glance , the figure inside the coral-colored boucle dress was stupefying .')

[('from', 'ADP'),
 ('what', 'DET'),
 ('i', 'PRON'),
 ('was', 'VERB'),
 ('able', 'ADJ'),
 ('to', 'ADP'),
 ('gauge', 'NOUN'),
 ('in', 'ADP'),
 ('a', 'DET'),
 ('swift', 'NOUN'),
 (',', '.'),
 ('greedy', 'ADJ'),
 ('glance', 'NOUN'),
 (',', '.'),
 ('the', '.'),
 ('figure', '.'),
 ('inside', '.'),
 ('the', '.'),
 ('coral-colored', '.'),
 ('boucle', '.'),
 ('dress', '.'),
 ('was', '.'),
 ('stupefying', '.'),
 ('.', '.')]

In [85]:
from nltk.corpus import brown

corpus = brown.tagged_sents(tagset='universal')[:-100]

tag_dict={}
word_dict={}

for sent in corpus:
    for elem in sent:
        w = elem[0]
        tag= elem[1]

        if w not in word_dict:
            word_dict[w]=0

        if tag not in tag_dict:
            tag_dict[tag]=0

        word_dict[w]+=1
        tag_dict[tag]+=1

print(len(word_dict))
print(len(tag_dict))
        
test_data= brown.tagged_sents(tagset='universal')[-10:]

print(len(test_data))

55907
12
[[('you', 'PRON'), ("can't", 'VERB'), ('very', 'ADV'), ('well', 'ADV'), ('sidle', 'VERB'), ('up', 'ADP'), ('to', 'ADP'), ('people', 'NOUN'), ('on', 'ADP'), ('the', 'DET'), ('street', 'NOUN'), ('and', 'CONJ'), ('ask', 'VERB'), ('if', 'ADP'), ('they', 'PRON'), ('want', 'VERB'), ('to', 'PRT'), ('buy', 'VERB'), ('a', 'DET'), ('hot', 'ADJ'), ('Bodhisattva', 'NOUN'), ('.', '.')], [('Additionally', 'ADV'), (',', '.'), ('since', 'ADP'), ("you're", 'PRT'), ('going', 'VERB'), ('to', 'PRT'), ('be', 'VERB'), ('hors', 'X'), ('de', 'X'), ('combat', 'X'), ('pretty', 'ADV'), ('soon', 'ADV'), ('with', 'ADP'), ('sprue', 'NOUN'), (',', '.'), ('yaws', 'NOUN'), (',', '.'), ('Delhi', 'NOUN'), ('boil', 'NOUN'), (',', '.'), ('the', 'DET'), ('Granville', 'NOUN'), ('wilt', 'NOUN'), (',', '.'), ('liver', 'NOUN'), ('fluke', 'NOUN'), (',', '.'), ('bilharziasis', 'NOUN'), (',', '.'), ('and', 'CONJ'), ('a', 'DET'), ('host', 'NOUN'), ('of', 'ADP'), ('other', 'ADJ'), ('complications', 'NOUN'), ('of', 'ADP'), 

## Module to implement CRF. 

In [86]:
# pip3 install sklearn-crfsuite # install this please

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

train_sents= corpus

def word2features(sent,i):
    word = sent[i][0]
    
    features ={
    'bias': 1.0,
    }
                
    return features

def sent2features(sent):
    return [word2features(sent,i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for i,label in sent]



In [87]:
X_train=[sent2features(s) for s in train_sents]
y_train=[sent2labels(s) for s in train_sents]

X_test=[sent2features(s) for s in test_data]
y_test=[sent2labels(s) for s in test_data]



In [88]:

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [89]:
y_pred = crf.predict(X_test)
labels=list(crf.classes_)

metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.12985271687027178

In [90]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

          .      0.800     0.242     0.372        33
          X      0.023     1.000     0.044         3
        ADJ      0.000     0.000     0.000        18
        ADP      0.179     0.185     0.182        27
        ADV      0.000     0.000     0.000         9
       VERB      0.000     0.000     0.000        35
        DET      0.121     0.121     0.121        33
       CONJ      0.000     0.000     0.000         7
       NOUN      0.242     0.157     0.190        51
       PRON      0.000     0.000     0.000        12
        PRT      0.000     0.000     0.000        11
        NUM      0.000     0.000     0.000         0

avg / total      0.199     0.117     0.130       239



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
