### POS Tagging: Agenda
* <a href="#section1">What are parts of speech? Why are they useful?</a>
* <a href="#section2">How do you use them with SpaCy?</a>
* <a href="#section3">How do we infer them?</a>
* <a href="#section4"> How do we learn them with SpaCy?</a>


In [34]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.attrs import POS
from spacy.en import English
import matplotlib.pyplot as plt
from functools import partial
import nltk
from operator import itemgetter
from itertools import groupby
from nltk.corpus import brown
from collections import defaultdict, Counter
import numpy as np
from spacy.tokens import Doc
from IPython.display import HTML
import warnings
import pandas as pd



warnings.filterwarnings('ignore')
%matplotlib inline

def rep_sentences(texts):
    html = []
    for text in texts:
        html.append(rep_sentence(text))
    return HTML("".join(html))

def rep_sentence(text, display_pos = True):
    html_colors = ['SkyBlue'
               ,'red'
               ,'YellowGreen'
               ,'yellow'
               ,'orange'
               ,'pink'
               ,'brown'
               ,'purple'
               , 'CadetBlue'
                ,'DarkKhaki'
                ,'DarkSalmon'
                ,'Gold'    
              ]
    doc = nlp(text)
    n_words = len(doc)
    unique_pos = list(set(map(lambda x: x.pos_, doc)))
    pos_to_color = {i:html_colors[unique_pos.index(i)] for i in unique_pos}
    css = ["<style>.word{font-weight:bold;}</style>"]
    for pos in unique_pos:
        css.append('<style>.{}{{background-color:{};}}</style>'.format(*[pos, pos_to_color[pos]]))
    css = "".join(css)

    html = ["<table width=100%>"]
    html.append(css)
    html.append("<tr>")            
    for i in range(n_words):
        word_string= doc[i].orth_
        html.append("<td><span class='word'>{0}</span></td>".format(word_string))
    html.append("</tr>")
    if display_pos:
        html.append("<tr>")            
        for i in range(n_words):
            pos = doc[i].pos_
            color = pos_to_color[pos]
            html.append("<td><span class='{0}'>{0}</span></td>".format(pos))
        html.append("</tr>")
    html = "".join(html)
    return html



def custom_tag_table(list_of_word_tag_tuples):
    html_colors = ['SkyBlue'
               ,'red'
               ,'YellowGreen'
               ,'yellow'
               ,'orange'
               ,'pink'
               ,'brown'
               ,'MediumPurple'
               , 'CadetBlue'
                ,'DarkKhaki'
                ,'DarkSalmon'
                ,'Gold'    
              ]
    
    n_words = len(list_of_word_tag_tuples)
    words, pos_list = zip(*list_of_word_tag_tuples)
    unique_pos = list(set([pos for pair in pos_list for pos in pair]))
    pos_to_color = {i:html_colors[unique_pos.index(i)] for i in unique_pos}
    css = ["<style>.word{font-weight:bold;}</style>"]
    for pos in unique_pos:
        css.append('<style>.{}{{background-color:{};}}</style>'.format(*[pos, pos_to_color[pos]]))
    css = "".join(css)

    html = ["<table width=100%>"]
    html.append(css)
    for i in range(n_words):
        html.append("<tr>")            
        word_string= words[i]
        html.append("<td><span class='word'>{0}</span></td>".format(word_string))
        row = []
        pos_sublist = pos_list[i]
        for pos in pos_sublist:
            entry = "<span class='{0}'>{0}</span> ".format(pos)
            #print entry
            row.append(entry)
        row = "".join(row)
        html.append("<td>{}</td>".format(row))
        html.append("</tr>")
    return "".join(html)
        
    

def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except:
        nltk.download(corpus_name)
    return corpus

#read nltk corpora
def nltk_reader(corpus_name, limit = None):
    corpus = nltk_corpus(corpus_name)
    fileids = corpus.fileids()
    
    if limit:
        doc_iter = (" ".join([" ".join(j) for j in corpus.sents(fileid)]) for fileid in fileids[:limit])
    else:
        doc_iter = (" ".join([" ".join(j) for j in corpus.sents(fileid)]) for fileid in fileids)
    return doc_iter

universal_tags = [
     ['Open Class Words','ADJ','adjective']
    ,['Open Class Words','ADV','adverb']
    ,['Open Class Words','INTJ','interjection']
    ,['Open Class Words','NOUN','noun']
    ,['Open Class Words','PROPN','proper noun']
    ,['Open Class Words','VERB','verb']
    ,['Closed Class Words','ADP','adposition']
    ,['Closed Class Words','AUX','auxiliary']
    ,['Closed Class Words','CCONJ','coordination conjunction']
    ,['Closed Class Words','DET','determiner']
    ,['Closed Class Words','NUM','numeral']
    ,['Closed Class Words','PART','particle']
    ,['Closed Class Words','PRON','pronoun']
    ,['Closed Class Words','SCONJ','subordinating conjection']
    ,['Other','PUNCT','punctuation']
    ,['Other','SYM','symbol']
    ,['Other','X','other']
]
tag_table = pd.DataFrame(universal_tags, columns = ['Category','Abbrev','Part of Speech'])
tag_table = tag_table.set_index(['Category','Abbrev'])

nltk.download('tagsets')
nltk.download('universal_tagset')
nlp = spacy.load('en')

<a name="section1"></a>

### What are Parts of Speech?

In [3]:
tag_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Part of Speech
Category,Abbrev,Unnamed: 2_level_1
Open Class Words,ADJ,adjective
Open Class Words,ADV,adverb
Open Class Words,INTJ,interjection
Open Class Words,NOUN,noun
Open Class Words,PROPN,proper noun
Open Class Words,VERB,verb
Closed Class Words,ADP,adposition
Closed Class Words,AUX,auxiliary
Closed Class Words,CCONJ,coordination conjunction
Closed Class Words,DET,determiner


In [8]:
sentence1 = 'I get a discount on newspapers.'
sentence2 = 'I discount that newspaper.'

rep_sentences([sentence1, sentence2])

0,1,2,3,4,5,6
I,get,a,discount,on,newspapers,.
PRON,VERB,DET,NOUN,ADP,NOUN,PUNCT

0,1,2,3,4
I,discount,that,newspaper,.
PRON,VERB,ADP,NOUN,PUNCT


<a name='applications'></a>
### Applications
* Rule based systems:
    * <a href="#qacode">Example of rule based question answering component</a>
* Feature engineering for statistical models
    * <a href="#wordsense">Feature for word disambiguation</a>

<a name="section2"></a>
### Parts of Speech with SpaCy

In [112]:
### Accessing
doc = nlp('I get a discount on newspapers')
tags = {}

for word in doc:
    tags[word.orth_] = {'lemma': word.lemma_, 
                        'pos (coarse)': word.pos_, 
                        'pos (fine)':word.tag_}
pd.DataFrame(tags).T

Unnamed: 0,lemma,pos (coarse),pos (fine)
I,-PRON-,PRON,PRP
a,a,DET,DT
discount,discount,NOUN,NN
get,get,VERB,VBP
newspapers,newspaper,NOUN,NNS
on,on,ADP,IN


### Exercise: Building word vectors that are Part of Speech specific
Steps:
* get documents
* tokenize the documents, and append the part of speech to each token, e.g. dog|NOUN
* train a word2vec model with gensim
* compare the most similar words of 'back||||VERB' vs 'back||||NOUN' (or other combo)

* Hints:
    * model.wv.vocab contains the vocabulary.
    * using a completely unique join character will make it easier to split later.

In [191]:
from gensim.models import Word2Vec

def return_documents():
    from sklearn.datasets import fetch_20newsgroups
    dataset = fetch_20newsgroups()
    corpus = dataset.data
    return corpus

def tokenize_and_tag_documents(documents, nlp, sep_char="|"):
    pass

def build_model(tokenized_docs):
    pass

def compare_most_similar_words_across_pos(word):
    pass

documents = return_documents()
tokenized_and_tagged_documents = tokenize_and_tag_documents(documents, nlp)
model = build_model(tokenized_and_tagged_documents)
compare_most_similar_words_across_pos('back')

<a name="section3"></a>
### How do we infer parts of speech?

In [52]:
from IPython.display import clear_output, display
from ipywidgets import Button
class reveal(object):
    def __init__(self):
        self.text = 'I was loble to find the effix by klepping the Dongle search engine.'
        self.toggle = Button(description='Toggle POS', )
        self.toggle.on_click(self.toggle_pos)
        self.state = False
        display(self.toggle)
        self.display()
        
    def toggle_pos(self, b):
        self.state = not self.state
        self.display()
        
    def display(self):
        clear_output()
        display(HTML(rep_sentence(text, display_pos = self.state)))
        
r = reveal()

0,1,2,3,4,5,6,7,8,9,10,11,12,13
I,was,loble,to,find,the,effix,by,klepping,the,Dongle,search,engine,.


### Determinants of Part of Speech:
* Word: some words can only be used in a single way; we can memorize these.
* Word shape: if the first letter is capitalized, its likely a proper noun.
* Neighboring part of speech: there are common patterns, such as noun phrases commonly following a determiner. to the beach



| Feature | Notes | Example|
|------|------|------|
|   Word Identity  | Some words can only be used in a single way; we can memorize these.| "the" -> determiner| 
| Word Shape|Capitalization, dashes,  |"I stayed at the Park Hotel."|
|Neighboring parts of speech|There are common patterns what tags can neighbor others|"to the beach" (noun following determiner)|
|Morphological Structures|Word prefixes and suffixes can rule out certain tag types|"-ly" -> adverb|
|Syntactic Dependencies|Syntax may establish expectations that only certain tags can logically fill|"I was told __" -> adpositional phrase or object entity|
|?|?|?|


<a name="section4"></a>
### Training your own tagger

In [67]:
from sklearn.cross_validation import train_test_split
corpus = nltk_corpus('brown')
all_data = np.array(corpus.tagged_sents(tagset='brown'))
all_data = [list(zip(*i)) for i in all_data]
train, test = train_test_split(all_data, test_size = .1)

In [94]:
def filter_(f, iterable):
    return list(filter(f,iterable))

def map_(f, iterable):
    return list(map(f,iterable))
def zip_(*args):
    return list(zip(*args))

In [69]:
#EXPERIMENTAL
from spacy.symbols import *
from spacy.language_data import TAG_MAP
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse
from nltk.tag import tagset_mapping
from spacy.en import English
from spacy.tagger import W_cluster, W_lemma, W_pos, W_prefix, W_suffix, W_shape\
                         ,P2_cluster, P2_lemma, P2_pos, P2_prefix, P2_suffix, P2_shape\
                         ,P1_cluster, P1_lemma, P1_pos, P1_prefix, P1_suffix, P1_shape\
                         ,N1_cluster, N1_lemma, N1_pos, N1_prefix, N1_suffix, N1_shape\
                         ,N2_cluster, N2_lemma, N2_pos, N2_prefix, N2_suffix, N2_shape




def validate(test_data, tagger):
    correct = 0
    total = 0
    tagmap = tagger.vocab.morphology.tag_map
    for words, tags in test_data:
        doc = Doc(vocab, words=words)
        tagger(doc)
        predictions = map_(lambda token: tagmap[token.tag_], doc)
        actual = map_(lambda tag: tagmap[tag], tags)
        
        correct_predictions = filter_(lambda x: x[0] == x[1], zip(predictions, actual))
        n_correct = len(correct_predictions)
        correct += n_correct
        total += len(words)
        
    result = {'correct':correct, 'words':total, 'accuracy':correct / float(total)}
    return result

def generate_tagmap():
    def adjust_value(x):
        if x == '.':
            val = PUNCT
        elif x=='PRT':
            val = PART
        else:
            val = getattr(spacy.symbols, x)
        return {POS:val}
    nltk_map = tagset_mapping('en-brown','universal')
    adj_map = {key:adjust_value(value) for key, value in nltk_map.items()}
    return adj_map

features = [(W_cluster,), (W_lemma,),    (W_pos,), (W_prefix,),  (W_suffix,), (W_shape,)
,(P2_cluster,), (P2_lemma,), (P2_pos,), (P2_prefix,), (P2_suffix,), (P2_shape,)
,(P1_cluster,), (P1_lemma,), (P1_pos,), (P1_prefix,), (P1_suffix,), (P1_shape,)
,(N1_cluster,), (N1_lemma,), (N1_pos,), (N1_prefix,), (N1_suffix,), (N1_shape,)
,(N2_cluster,), (N2_lemma,), (N2_pos,), (N2_prefix,), (N2_suffix,), (N2_shape,)]

tagmap = generate_tagmap()
gold_tagmap = nlp.vocab.morphology.tag_map

for key in tagmap:
    gold_tagmap[key] = tagmap[key]

vocab = nlp.vocab
tagger = nlp.tagger


pretraining_accuracy = validate(test, tagger)
print(pretraining_accuracy)

for i in range(10):
    train_fold, test_fold = train_test_split(train, test_size = .2)
    for words, tags in train_fold:
        doc = Doc(vocab, words=words)
        gold = GoldParse(doc, tags=tags)   
        tagger.update(doc, gold)
    current_accuracy = validate(test_fold, tagger)
    print(current_accuracy)
    np.random.shuffle(train)
tagger.model.end_training()

posttraining_accuracy = validate(test, tagger)
print(posttraining_accuracy)

{'words': 116884, 'correct': 97184, 'accuracy': 0.8314568289928476}


ValueError: Unrecognized gold tag: JJ-TL. tag_map.json must contain all gold tags, to maintain coarse-grained mapping.

In [97]:
from spacy.symbols import *
from spacy.language_data import TAG_MAP
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse
from nltk.tag import tagset_mapping
from spacy.en import English
from spacy.tagger import W_cluster, W_lemma, W_pos, W_prefix, W_suffix, W_shape\
                         ,P2_cluster, P2_lemma, P2_pos, P2_prefix, P2_suffix, P2_shape\
                         ,P1_cluster, P1_lemma, P1_pos, P1_prefix, P1_suffix, P1_shape\
                         ,N1_cluster, N1_lemma, N1_pos, N1_prefix, N1_suffix, N1_shape\
                         ,N2_cluster, N2_lemma, N2_pos, N2_prefix, N2_suffix, N2_shape

def validate(test_data, tagger):
    correct = 0
    total = 0
    
    for words, tags in test_data:
        doc = Doc(vocab, words=words)
        tagger(doc)
        predictions = map_(lambda token: tagmap[token.tag_], doc)
        actual = map_(lambda tag: tagmap[tag], tags)
        
        correct_predictions = filter_(lambda x: x[0] == x[1], zip_(predictions, actual))
        n_correct = len(correct_predictions)
        correct += n_correct
        total += len(words)
        
    result = {'correct':correct, 'words':total, 'accuracy':correct / float(total)}
    return result

def generate_tagmap():
    def adjust_value(x):
        if x == '.':
            val = PUNCT
        elif x=='PRT':
            val = PART
        else:
            val = getattr(spacy.symbols, x)
        return {POS:val}
    nltk_map = tagset_mapping('en-brown','universal')
    adj_map = {key:adjust_value(value) for key, value in nltk_map.items()}
    return adj_map

features = [(W_cluster,), (W_lemma,),    (W_pos,), (W_prefix,),  (W_suffix,), (W_shape,)
,(P2_cluster,), (P2_lemma,), (P2_pos,), (P2_prefix,), (P2_suffix,), (P2_shape,)
,(P1_cluster,), (P1_lemma,), (P1_pos,), (P1_prefix,), (P1_suffix,), (P1_shape,)
,(N1_cluster,), (N1_lemma,), (N1_pos,), (N1_prefix,), (N1_suffix,), (N1_shape,)
,(N2_cluster,), (N2_lemma,), (N2_pos,), (N2_prefix,), (N2_suffix,), (N2_shape,)]

tagmap = generate_tagmap()
vocab = Vocab(tag_map = tagmap)
tagger = Tagger(vocab)


pretraining_accuracy = validate(test, tagger)
print(pretraining_accuracy)

for i in range(10):
    train_fold, test_fold = train_test_split(train, test_size = .2)
    for words, tags in train_fold:
        doc = Doc(vocab, words=words)
        gold = GoldParse(doc, tags=tags)   
        tagger.update(doc, gold)
    current_accuracy = validate(test_fold, tagger)
    print(current_accuracy)
    np.random.shuffle(train)
tagger.model.end_training()

posttraining_accuracy = validate(test, tagger)
print(posttraining_accuracy)

{'words': 116884, 'correct': 14981, 'accuracy': 0.12816980938366243}
{'words': 208437, 'correct': 52799, 'accuracy': 0.2533091533652854}
{'words': 210067, 'correct': 66090, 'accuracy': 0.31461390889573326}
{'words': 209335, 'correct': 56648, 'accuracy': 0.27060931043542646}
{'words': 211223, 'correct': 56042, 'accuracy': 0.2653214848761735}
{'words': 207431, 'correct': 57799, 'accuracy': 0.27864205446630447}
{'words': 207494, 'correct': 68565, 'accuracy': 0.3304432899264557}
{'words': 207906, 'correct': 65606, 'accuracy': 0.31555606860792856}
{'words': 208654, 'correct': 64619, 'accuracy': 0.30969451819759025}
{'words': 210271, 'correct': 63388, 'accuracy': 0.30145859390976404}
{'words': 210742, 'correct': 72974, 'accuracy': 0.3462717445976597}
{'words': 116884, 'correct': 48794, 'accuracy': 0.4174566236610657}


In [72]:
tagger.vocab.morphology.tag_map['JJ-TL']

{74: 82}

In [77]:
gold_tagmap['JJ-TL']

{74: 82}

### Appendix

<a name='qacode'></a>
### Example Rule Based QA Component Code

In [None]:
def get_answer_requirements(token):
    if token.tag_ == 'WRB':
        if token.lower_ == 'where':
            #Where was Star Wars Filmed
            return ['LOCATION']
        elif token.lower_ == 'when':
            #When was Star Wars Filmed
            return ['DATE']
        elif token.lower_ == 'how':
            #How much did Star Wars make?
            if token.nbor().lower_ in ('much', 'many'):
                return ['QUANTITY']

            #How old is star wars?
            elif token.nbor().lower_ in ('long', 'old'):
                return ['DURATION']
            else:
                return False
        elif token.lower() == 'whom':
            #Whom did you see?
            return ['PERSON','ORG']      
        else:
            return False
    elif token.tag_ == 'WP':
        #Asking for Identity
        if token.lower_ in ('who', 'whose'):
            #Who directed Star Wars?
            return ['PERSON','ORG']
        if token.lower_ in ('which','what'):
            #What is Star Wars
            return False 
        else: 
            return False
    elif token.tag_ == 'WDT':
        #asking for a choice among options
        if token.lower_ in ('which','what'):
            #which Star Wars did you like best?
            return [token.nbor().lower_] #return neighbor
        else:
            return False
    else:
        return False

<a href='#applications'>back</a>
<a name="wordsense"></a>
##### Word sense disambiguation

In [131]:
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
for syn in wn.synsets('shower'):
    print(syn, syn.definition())

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Synset('shower.n.01') a plumbing fixture that sprays water over you
Synset('shower.n.02') washing yourself by standing upright under water sprayed from a nozzle
Synset('shower.n.03') a brief period of precipitation
Synset('shower.n.04') a sudden downpour (as of tears or sparks etc) likened to a rain shower
Synset('exhibitor.n.01') someone who organizes an exhibit for others to see
Synset('shower.n.06') a party of friends assembled to present gifts (usually of a specified kind) to a person
Synset('lavish.v.01') expend profusely; also used with abstract nouns
Synset('shower.v.02') spray or sprinkle with
Synset('shower.v.03') take a shower; wash one's body in the shower
Synset('shower.v.04') rain abundantly
Synset('shower.v.05') provide abundantly with


<a href='#applications'>back</a>