In [1]:
from nltk.corpus import treebank
# Encode the Treebank sample as integer pairs. Ids are handed out in
# first-seen order; words are lower-cased before lookup, tags are used as-is.
corpus = treebank.tagged_sents()
tree_corpus = []
word_numbers = {}
tag_numbers = {}
for tagged_sentence in corpus:
    encoded_sentence = []
    for token, pos_tag in tagged_sentence:
        key = token.lower()
        if key not in word_numbers:
            word_numbers[key] = len(word_numbers)
        if pos_tag not in tag_numbers:
            tag_numbers[pos_tag] = len(tag_numbers)
        encoded_sentence.append((word_numbers[key], tag_numbers[pos_tag]))
    tree_corpus.append(encoded_sentence)

# Sanity checks: first encoded sentence, one word id, and the tagset size.
print(tree_corpus[0])
print(word_numbers.get("electricity"))
print(len(tag_numbers))


[(0, 0), (1, 0), (2, 1), (3, 2), (4, 3), (5, 4), (2, 1), (6, 5), (7, 6), (8, 7), (9, 8), (10, 9), (11, 7), (12, 4), (13, 8), (14, 0), (15, 2), (16, 10)]
1095
46


In [None]:
import nltk
def normalize_tweet_token(token):
    """Lower-case a token and collapse Twitter-specific tokens to placeholders.

    "rt" becomes RETWEET_TOKEN, tokens starting with "@" become USER_TOKEN,
    tokens starting with "#" become HASHTAG_TOKEN, and tokens starting with
    "http://" or "https://" become URL_TOKEN. Any other token is returned
    lower-cased. Uses startswith() so an empty token is returned unchanged
    instead of raising IndexError.
    """
    token = token.lower()
    if token == "rt":
        return "RETWEET_TOKEN"
    if token.startswith("@"):
        return "USER_TOKEN"
    if token.startswith("#"):
        return "HASHTAG_TOKEN"
    if token.startswith(("https://", "http://")):
        return "URL_TOKEN"
    return token


# Extend the shared vocabulary with every (normalised) token seen in the
# tweet samples, and keep each tweet as a list of word ids.
tweets = nltk.corpus.twitter_samples.tokenized()
tweets_corpus = []
for tweet in tweets:
    tweet_sent = []
    for word in tweet:
        word = normalize_tweet_token(word)
        wi = word_numbers.setdefault(word, len(word_numbers))
        tweet_sent.append(wi)
    tweets_corpus.append(tweet_sent)

print(tweets_corpus[0])
print(word_numbers.get("electricity"))
print(word_numbers.get("HASHTAG_TOKEN"))
print(len(word_numbers))


In [None]:
def _names_by_index(index_of):
    """Invert a {name: index} dict into a list where result[index] == name."""
    names = [None] * len(index_of)
    for name, position in index_of.items():
        names[position] = name
    return names


# Tags that are registered here in addition to those collected earlier.
new_tags = ["USR","HT","RT","URL","VPP","TD","O"]
for extra_tag in new_tags:
    tag_numbers.setdefault(extra_tag, len(tag_numbers))

# Reserve an id for out-of-vocabulary words.
word_numbers.setdefault('<unk>', len(word_numbers))

# Reverse lookups: id -> word and id -> tag.
word_names = _names_by_index(word_numbers)
tag_names = _names_by_index(tag_numbers)

print(word_numbers.get('<unk>'))
print(len(tag_numbers))

In [None]:
import urllib
# Pick the right urlretrieve for the running interpreter. Catching only
# ImportError means a genuine download failure is no longer swallowed.
try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve

# Annotated Twitter POS file from the aritter/twitter_nlp repository.
urlretrieve("https://github.com/aritter/twitter_nlp/raw/master/data/annotated/pos.txt", "pos.txt")

# Tag spellings in pos.txt that are renamed before lookup in tag_numbers.
_TAG_ALIASES = {"(": "-LRB-", ")": "-RRB-", "NONE": "-NONE-"}


def _read_tagged_sentences(path):
    """Parse a "word tag" per-line file with blank lines between sentences.

    Returns a list of sentences, each a list of (word, tag) tuples with the
    tag passed through _TAG_ALIASES and the word left exactly as written.
    Runs of blank lines do not produce empty sentences, and a final sentence
    that is not followed by a blank line is still returned.

    Raises ValueError if a non-blank line does not split into two fields.
    """
    sentences = []
    current = []
    with open(path) as f:
        for line in f:
            stripped = line.strip()
            if stripped == '':
                if current:
                    sentences.append(current)
                    current = []
            else:
                word, tag = stripped.split()
                current.append((word, _TAG_ALIASES.get(tag, tag)))
    if current:
        # The file did not end with a blank line; keep the last sentence.
        sentences.append(current)
    return sentences


# Gold data as (raw word, tag name) pairs. Parsed once so that test_outputs
# and test_corpora are guaranteed to have the same sentence/token layout.
test_outputs = _read_tagged_sentences('pos.txt')

# Model input as (word id, tag id) pairs. Words get the same normalisation
# as the tweet vocabulary; anything unseen maps to the '<unk>' id.
unk_index = word_numbers.get('<unk>')
test_corpora = []
for sentence in test_outputs:
    words = []
    for word, tag in sentence:
        word = word.lower()
        if word == "rt":
            word = "RETWEET_TOKEN"
        elif word.startswith("@"):
            word = "USER_TOKEN"
        elif word.startswith("#"):
            word = "HASHTAG_TOKEN"
        elif word.startswith(("https://", "http://")):
            word = "URL_TOKEN"
        # tag_numbers.get(tag) is None for a tag that was never registered.
        words.append((word_numbers.get(word, unk_index), tag_numbers.get(tag)))
    test_corpora.append(words)

print(test_corpora[0])
print(len(tag_numbers))

In [6]:
import numpy as np

def count(corpus, vocabulary, tagset, eps=0.1):
    """Estimate smoothed HMM parameters by counting over a tagged corpus.

    Parameters
    ----------
    corpus : iterable of sentences, each an iterable of (word_id, tag_id)
        integer pairs.
    vocabulary : sized collection; only len(vocabulary) is used (number of
        emission columns).
    tagset : sized collection; only len(tagset) is used (number of states).
    eps : additive smoothing value every count starts from (default 0.1,
        the value previously hard-coded). With eps=0 a tag that never occurs
        yields a 0/0 row.

    Returns
    -------
    (pi, transition, emission) : numpy arrays of shape (S,), (S, S) and
        (S, V). pi sums to 1 and every row of transition and emission sums
        to 1.
    """
    S = len(tagset)
    V = len(vocabulary)
    pi = np.full(S, eps, dtype=float)
    transition = np.full((S, S), eps, dtype=float)
    emission = np.full((S, V), eps, dtype=float)
    for sent in corpus:
        last_tag = None
        for word, tag in sent:
            emission[tag, word] += 1
            if last_tag is None:
                # First token of the sentence contributes to the start
                # distribution rather than to a transition.
                pi[tag] += 1
            else:
                transition[last_tag, tag] += 1
            last_tag = tag
    # Turn counts into probabilities (row-wise for the two matrices).
    pi /= pi.sum()
    emission /= emission.sum(axis=1, keepdims=True)
    transition /= transition.sum(axis=1, keepdims=True)
    return pi, transition, emission

# Fit start, transition and emission probabilities on the encoded Treebank
# sentences, sized by the current vocabulary and tagset.
pi, transition, emission = count(tree_corpus, word_numbers, tag_numbers)

    

In [7]:
def viterbi(params, observations):
    """Return the most probable tag sequence for a sequence of word ids.

    Parameters
    ----------
    params : tuple (pi, A, O) of numpy arrays: start probabilities (S,),
        transition probabilities (S, S) indexed [previous, next], and
        emission probabilities (S, V) indexed [state, word id].
    observations : sequence of integer word ids.

    Returns
    -------
    list of (word, tag) tuples, one per observation, where the strings are
    looked up in the module-level ``word_names`` and ``tag_names`` lists.
    An empty observation sequence returns an empty list.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so long sequences do not underflow to zero. Ties are
    broken in favour of the lowest state index.
    """
    pi, A, O = params
    observations = list(observations)
    M = len(observations)
    if M == 0:
        return []
    S = pi.shape[0]

    # log(0) = -inf is a legitimate "impossible" score here, so silence the
    # divide-by-zero warning. Only the emission columns that are actually
    # observed are converted.
    with np.errstate(divide='ignore'):
        log_pi = np.log(pi)
        log_A = np.log(A)
        log_emit = np.log(O[:, observations])  # shape (S, M)

    alpha = np.empty((M, S))
    backpointers = np.zeros((M, S), 'int')
    states = np.arange(S)

    # base case
    alpha[0, :] = log_pi + log_emit[:, 0]

    # recursive case: scores[s1, s2] is the best score ending in s1 at t-1
    # followed by the transition s1 -> s2.
    for t in range(1, M):
        scores = alpha[t - 1][:, None] + log_A
        backpointers[t, :] = np.argmax(scores, axis=0)
        alpha[t, :] = scores[backpointers[t, :], states] + log_emit[:, t]

    # now follow backpointers to resolve the state sequence
    ss = [np.argmax(alpha[M - 1, :])]
    for t in range(M - 1, 0, -1):
        ss.append(backpointers[t, ss[-1]])
    ss.reverse()

    return [(word_names[observations[i]], tag_names[ss[i]]) for i in range(M)]


# Decode every test sentence: strip the gold tag id from each pair and tag
# the remaining word ids with the current model parameters.
predictions = [
    viterbi((pi, transition, emission), [word_id for word_id, gold_tag in sent])
    for sent in test_corpora
]

print(predictions[0])


[('USER_TOKEN', u'``'), (u'it', u'PRP'), (u"'s", u'VBZ'), (u'the', u'DT'), (u'view', u'NN'), (u'from', u'IN'), (u'where', u'WRB'), (u'i', u'PRP'), (u"'m", u'VBP'), (u'living', u'VBG'), (u'for', u'IN'), (u'two', u'CD'), (u'weeks', u'NNS'), (u'.', u'.'), (u'empire', u"''"), (u'state', u'NN'), (u'building', u'NN'), (u'=', u'.'), ('<unk>', u'-RRB-'), (u'.', u'.'), (u'pretty', u"''"), (u'bad', u'JJ'), (u'storm', u'NN'), (u'here', u'RB'), (u'last', u'JJ'), (u'evening', u'NN'), (u'.', u'.')]


In [8]:
from sklearn.metrics import accuracy_score as acc
def evaluate(input):
    """Convert sentences of (word, tag) pairs into lists of tag ids.

    Each tag is looked up in the module-level ``tag_numbers`` dict; a tag
    that is not present there becomes None. The words are ignored. The
    nesting of the result mirrors the nesting of ``input``.
    """
    lookup = tag_numbers.get
    return [[lookup(tag) for _word, tag in sentence] for sentence in input]



# Map predicted and gold tag names to ids, sentence by sentence.
pre_output = evaluate(predictions)
tweet_output = evaluate(test_outputs)

# Flatten to one label per token so the two lists line up position-wise.
all_test_tags = []
for gold_sentence in tweet_output:
    all_test_tags.extend(gold_sentence)
all_pred_tags = []
for predicted_sentence in pre_output:
    all_pred_tags.extend(predicted_sentence)

# Token-level accuracy over the whole test set.
print(acc(all_test_tags, all_pred_tags))

0.6371419163648337


In [9]:
# Column ids of the four placeholder words.
index_user_word = word_numbers.get('USER_TOKEN')
index_hashtags_word = word_numbers.get('HASHTAG_TOKEN')
index_retweet_word = word_numbers.get('RETWEET_TOKEN')
index_URL_word = word_numbers.get('URL_TOKEN')

# Row ids of the four matching tags.
index_user_tag = tag_numbers.get('USR')
index_hashtags_tag = tag_numbers.get('HT')
index_retweet_tag = tag_numbers.get('RT')
index_URL_tag = tag_numbers.get('URL')

# For each special tag: wipe its emission row over the whole vocabulary,
# then give its placeholder word an emission value of 1. This edits
# `emission` in place.
special_pairs = (
    (index_hashtags_tag, index_hashtags_word),
    (index_user_tag, index_user_word),
    (index_retweet_tag, index_retweet_word),
    (index_URL_tag, index_URL_word),
)
for tag_row, word_col in special_pairs:
    emission[tag_row, :len(word_numbers)] = 0
    emission[tag_row, word_col] = 1


print(emission)

    

[[9.15369893e-05 1.74752434e-04 8.32154448e-06 ... 8.32154448e-06
  8.32154448e-06 8.32154448e-06]
 [1.33457894e-05 1.33457894e-05 6.51955158e-01 ... 1.33457894e-05
  1.33457894e-05 1.33457894e-05]
 [1.62522347e-05 1.62522347e-05 1.62522347e-05 ... 1.62522347e-05
  1.62522347e-05 1.62522347e-05]
 ...
 [3.83582662e-05 3.83582662e-05 3.83582662e-05 ... 3.83582662e-05
  3.83582662e-05 3.83582662e-05]
 [3.83582662e-05 3.83582662e-05 3.83582662e-05 ... 3.83582662e-05
  3.83582662e-05 3.83582662e-05]
 [3.83582662e-05 3.83582662e-05 3.83582662e-05 ... 3.83582662e-05
  3.83582662e-05 3.83582662e-05]]


In [35]:
from sklearn.metrics import classification_report,precision_recall_fscore_support
# Re-decode the test set with the current emission matrix.
predictions = []
for sent in test_corpora:
    encoded_sent = [word for word, tag in sent]
    pred = viterbi((pi, transition, emission), encoded_sent)
    predictions.append(pred)

pre_output = evaluate(predictions)
tweet_output = evaluate(test_outputs)
all_test_tags = [tag for tags in tweet_output for tag in tags]
all_pred_tags = [tag for tags in pre_output for tag in tags]

# Only the tag ids that actually occur (gold or predicted) get a report row.
# Passing them explicitly, with names in the same order, keeps every row and
# every f-score aligned with the right tag even when some ids in tag_names
# never appear in the data.
present_labels = sorted(set(all_test_tags) | set(all_pred_tags))
present_names = [tag_names[label] for label in present_labels]

result = classification_report(all_test_tags, all_pred_tags,
                               labels=present_labels, target_names=present_names)
precision, recall, fscore, support = precision_recall_fscore_support(
    all_test_tags, all_pred_tags, labels=present_labels)

# Best tag: first one reaching the maximum f-score.
# Worst tags: every tag tied at the minimum f-score.
listf = fscore.tolist()
best = present_names[listf.index(max(listf))]
worse = min(listf)
worselist = [name for name, score in zip(present_names, listf) if score == worse]

print(result)
print(best)
print("tag performe best ")
print(worselist)
print("performe worse")

             precision    recall  f1-score   support

        NNP       0.60      0.27      0.37      1159
          ,       0.85      1.00      0.92       303
         CD       0.59      0.59      0.59       268
        NNS       0.43      0.54      0.48       393
         JJ       0.64      0.59      0.61       670
         MD       0.53      0.97      0.69       181
         VB       0.65      0.70      0.68       660
         DT       0.74      0.93      0.82       825
         NN       0.79      0.63      0.70      1931
         IN       0.81      0.88      0.85      1091
          .       0.72      0.83      0.77       875
        VBZ       0.69      0.78      0.73       342
        VBG       0.88      0.50      0.64       303
         CC       0.96      0.88      0.92       305
        VBD       0.77      0.74      0.75       306
        VBN       0.43      0.63      0.51       140
     -NONE-       0.00      0.00      0.00         2
         RB       0.71      0.76      0.73   