In [11]:
# Notebook-wide CSS tweaks. HTML is imported from IPython.display, the
# documented public location of the display classes. The HTML object must
# remain the final expression of the cell so that Jupyter renders it.
from IPython.display import HTML
HTML("""
<style>

div.cell { /* Tunes the space between cells */
margin-top:1em;
margin-bottom:1em;
}

div.text_cell_render h1 { /* Main titles bigger, centered */
font-size: 2.0em;
line-height:1.6em;
text-align:center;
}

div.text_cell_render h2 { /*  Parts names nearer from text */
margin-bottom: -0.2em;
}


div.text_cell_render { /* Customize text cells */
font-family: 'Times New Roman';
font-size:1.2em;
line-height:1.2em;
padding-left:1em;
padding-right:3em;
}
</style>
""")

<h1 align="center">Generate Part-of-Speech tags using various Python libraries</h1>

```
2.1 Generating POS tags using Polyglot library

2.2 Generating POS tags using Stanford CoreNLP 

2.3 Generating POS tags using spaCy library

2.4 Why do we need to develop our own POS tagger?

```

<h2 align="center"> Generating POS tags using Polyglot library </h2>

```

2.1.1 Download polyglot POS model for English language

2.1.2 Load POS model

2.1.3 Import dependencies

2.1.4 Detect the language

2.1.5 Tokenization of the sentences

2.1.6 Generate POS tags for given sentence


```

### 2.1.1 Download polyglot POS model for English language


In [1]:
from polyglot.downloader import downloader

# Ask the downloader which languages are available for the "pos2" task,
# then print the table it returns.
pos_languages = downloader.supported_languages_table("pos2")
print(pos_languages)

  1. Slovene                    2. French                     3. Hungarian                
  4. Swedish                    5. Spanish; Castilian         6. Portuguese               
  7. Indonesian                 8. English                    9. German                   
 10. Danish                    11. Czech                     12. Bulgarian                
 13. Italian                   14. Irish                     15. Dutch                    
 16. Finnish                  


### 2.1.2 Load POS model

In [2]:
from polyglot.downloader import downloader
# English embeddings package (presumably a prerequisite of the POS tagger - confirm).
downloader.download("embeddings2.en")
# English POS package. This call is the cell's final expression, so its
# return value is what Jupyter shows as the cell output.
downloader.download("pos2.en")

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /home/jalaj/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!
[polyglot_data] Downloading package pos2.en to
[polyglot_data]     /home/jalaj/polyglot_data...
[polyglot_data]   Package pos2.en is already up-to-date!


True

### 2.1.3 Import dependencies

In [3]:
import polyglot
from polyglot.text import Text, Word

### 2.1.4 Detect the language

In [4]:
# No language hint is supplied; the code and name reported below come from
# the `language` attribute of the Text object.
text = Text("Bonjour, Mesdames.")
detected = text.language
print("Language Detected: Code={}, Name={}\n".format(detected.code, detected.name))


Language Detected: Code=fr, Name=French



### 2.1.5 Tokenization of the sentences

In [5]:
# Three short sentences joined into one string via adjacent-literal concatenation.
zen_excerpt = (
    "Beautiful is better than ugly. "
    "Explicit is better than implicit. "
    "Simple is better than complex."
)
words_list = Text(zen_excerpt)

# `.words` is the flat token list for the whole text.
print(words_list.words)

['Beautiful', 'is', 'better', 'than', 'ugly', '.', 'Explicit', 'is', 'better', 'than', 'implicit', '.', 'Simple', 'is', 'better', 'than', 'complex', '.']


### 2.1.6 Generate POS tags for given sentence


In [6]:
# Sample sentence for the tagging cells that follow; `text` wraps it in a
# polyglot Text object.
sent = "We will meet at eight o'clock on Thursday morning."
text = Text(sent)

In [7]:
# Bare final expression: Jupyter displays the list of (word, tag) pairs as the cell output.
text.pos_tags

[('We', 'PRON'),
 ('will', 'AUX'),
 ('meet', 'VERB'),
 ('at', 'ADP'),
 ('eight', 'NUM'),
 ("o'clock", 'NOUN'),
 ('on', 'ADP'),
 ('Thursday', 'PROPN'),
 ('morning', 'NOUN'),
 ('.', 'PUNCT')]

In [8]:
# Two-column table: header line, a 30-character rule, then one row per
# (word, tag) pair with the word left-aligned in a 16-character column.
print("{:<16}{}".format("Word", "POS Tag"))
print("-"*30)
for word, tag in text.pos_tags:
    row = u"{:<16}{:>2}".format(word, tag)
    print(row)

Word            POS Tag
------------------------------
We              PRON
will            AUX
meet            VERB
at              ADP
eight           NUM
o'clock         NOUN
on              ADP
Thursday        PROPN
morning         NOUN
.               PUNCT


In [9]:
# Tag a short sentence and print the result as a two-column table.
# `text.pos_tags` is read only once, in the loop: a bare `text.pos_tags`
# statement in the middle of a cell is not displayed by Jupyter and would
# only repeat the tagging work.
text = Text("This is a car")
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    print(u"{:<16}{:>2}".format(word, tag))

Word            POS Tag
------------------------------
This            DET
is              VERB
a               DET
car             NOUN


In [10]:
# Tag a fragment ending in repeated punctuation and print the result as a
# two-column table. `text.pos_tags` is read only once, in the loop: a bare
# `text.pos_tags` statement in the middle of a cell is not displayed by
# Jupyter and would only repeat the tagging work.
text = Text("Alexander the Great...!")
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    print(u"{:<16}{:>2}".format(word, tag))

Word            POS Tag
------------------------------
Alexander       PROPN
the             DET
Great           PROPN
.               PUNCT
.               PUNCT
.               PUNCT
!               PUNCT


In [11]:
# Tag a full sentence and print the result as a two-column table.
# `text.pos_tags` is read only once, in the loop: a bare `text.pos_tags`
# statement in the middle of a cell is not displayed by Jupyter and would
# only repeat the tagging work.
text = Text("Alexander the Great, was a king of the ancient Greek kingdom of Macedon.")
print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
for word, tag in text.pos_tags:
    print(u"{:<16}{:>2}".format(word, tag))

Word            POS Tag
------------------------------
Alexander       PROPN
the             DET
Great           PROPN
,               PUNCT
was             VERB
a               DET
king            NOUN
of              ADP
the             DET
ancient         ADJ
Greek           ADJ
kingdom         NOUN
of              ADP
Macedon         PROPN
.               PUNCT


```

```

<h2 align="center"> Generating POS tags using Stanford CoreNLP </h2>

```

2.2.1 Run Stanford CoreNLP in server mode

2.2.2 Define a function

2.2.3 Generate POS tags for given sentence


```

### 2.2.1 Run Stanford CoreNLP in server mode

In [12]:
## Start the CoreNLP server first, using all jars in the current directory (e.g., the CoreNLP home directory):
# java --add-modules java.se.ee -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# NOTE(review): `--add-modules java.se.ee` is a JVM option, so it is written before the main class here;
# placed after the class name it would be handed to the server as a program argument instead.
# The java.se.ee module only exists on Java 9/10 - drop the flag on other Java versions (confirm for your JDK).
from pycorenlp import StanfordCoreNLP
# Client for the server started above. NOTE: the name `nlp` is re-bound to a spaCy
# pipeline in section 2.3.2, so the CoreNLP cells must run before that cell.
nlp = StanfordCoreNLP('http://localhost:9000')

### 2.2.2 Define a function

In [13]:
def stnfordpostagdemofunction(text, client=None):
    """Print a Word / POS Tag table for each sentence CoreNLP finds in ``text``.

    Parameters
    ----------
    text : str
        Raw text sent to the server for annotation.
    client : object, optional
        Any object exposing ``annotate(text, properties=...)``. When omitted,
        the notebook-level ``nlp`` is used, which is the original behaviour.
        Passing the client explicitly keeps the function usable after ``nlp``
        has been re-bound to something else later in the notebook.

    Raises
    ------
    RuntimeError
        If the reply is not a parsed JSON object (for example a plain-text
        error message from the server), instead of failing later with an
        opaque indexing error.
    """
    if client is None:
        client = nlp
    output = client.annotate(text, properties={
        'annotators': 'pos',
        'outputFormat': 'json'
    })
    if not isinstance(output, dict):
        raise RuntimeError(
            "CoreNLP did not return parsed JSON; reply was: {!r}".format(output))
    for s in output["sentences"]:
        print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
        for t in s["tokens"]:
            # Values are formatted directly (no str() coercion) so non-ASCII
            # tokens are handled by the unicode format string.
            print(u"{:<16}{:>2}".format(t["word"], t["pos"]))

### 2.2.3 Generate POS tags for given sentence


In [14]:
if __name__ == "__main__":
    # Smallest example: a four-word sentence plus the final period.
    car_sentence = "This is a car."
    stnfordpostagdemofunction(car_sentence)
   

Word            POS Tag
------------------------------
This            DT
is              VBZ
a               DT
car             NN
.                .


In [15]:
if __name__ == "__main__":
    # Same sentence that was tagged with polyglot earlier in the notebook.
    alexander_sentence = "Alexander the Great, was a king of the ancient Greek kingdom of Macedon."
    stnfordpostagdemofunction(alexander_sentence)

Word            POS Tag
------------------------------
Alexander       NNP
the             DT
Great           NNP
,                ,
was             VBD
a               DT
king            NN
of              IN
the             DT
ancient         JJ
Greek           JJ
kingdom         NN
of              IN
Macedon         NNP
.                .


In [16]:
if __name__ == "__main__":
    # Same sentence that was tagged with polyglot earlier in the notebook.
    meeting_sentence = "We will meet at eight o'clock on Thursday morning."
    stnfordpostagdemofunction(meeting_sentence)

Word            POS Tag
------------------------------
We              PRP
will            MD
meet            VB
at              IN
eight           CD
o'clock         RB
on              IN
Thursday        NNP
morning         NN
.                .


In [17]:
if __name__ == "__main__":
    # Sentence containing a drug name, a dosage and a parenthetical.
    leaflet_sentence = "The name of your medicine is Paracetamol 500mg Tablets (called paracetamol throughout this leaflet)."
    stnfordpostagdemofunction(leaflet_sentence)
    

Word            POS Tag
------------------------------
The             DT
name            NN
of              IN
your            PRP$
medicine        NN
is              VBZ
Paracetamol     NNP
500             CD
mg              NN
Tablets         NNS
-LRB-           -LRB-
called          VBN
paracetamol     NNP
throughout      IN
this            DT
leaflet         NN
-RRB-           -RRB-
.                .


<h2 align="center"> Generating POS tags using spaCy library </h2>

```
2.3.1 Import dependencies

2.3.2 Load model

2.3.3 Generate POS tag for given sentence

```

### 2.3.1 Import dependencies

In [18]:
import spacy

### 2.3.2 Load model

In [19]:
# NOTE: this re-binds `nlp`, which until now held the StanfordCoreNLP client that
# stnfordpostagdemofunction() reads as a global. Re-run the cell in section 2.2.1
# before calling that function again.
nlp = spacy.load('en_core_web_sm')

### 2.3.3 Generate POS tag for given sentence

In [20]:
# Run the pipeline, then print three 32-character-wide columns per token:
# the token text, `pos_` ("Word Category") and `tag_` ("POS Tag").
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
header = "{:<32}{:<32}{}".format("Word", "Word Category", "POS Tag")
print(header + "\n" + "-"*80)
for token in doc:
    row = u"{:<32}{:<32}{}".format(token.text, token.pos_, token.tag_)
    print(row)

Word                            Word Category                   POS Tag
--------------------------------------------------------------------------------
Apple                           PROPN                           NNP
is                              VERB                            VBZ
looking                         VERB                            VBG
at                              ADP                             IN
buying                          VERB                            VBG
U.K.                            PROPN                           NNP
startup                         NOUN                            NN
for                             ADP                             IN
$                               SYM                             $
1                               NUM                             CD
billion                         NUM                             CD


```


```

<h2 align="center"> Why do we need to develop our own POS tagger? </h2>


* Dealing with domain-specific terminology (for example, the drug name and dosage in "Paracetamol 500mg Tablets" in the cell below)

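* Dealing with ambiguity (the taggers above already disagree: polyglot tags "o'clock" as NOUN, while Stanford CoreNLP tags it as RB)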



In [21]:
# Run the pipeline on a sentence with domain-specific terms, then print three
# 32-character-wide columns per token: text, `pos_` and `tag_`.
doc = nlp(u'The name of your medicine is Paracetamol 500mg Tablets (called paracetamol throughout this leaflet). ')
header = "{:<32}{:<32}{}".format("Word", "Word Category", "POS Tag")
print(header + "\n" + "-"*80)
for token in doc:
    row = u"{:<32}{:<32}{}".format(token.text, token.pos_, token.tag_)
    print(row)

Word                            Word Category                   POS Tag
--------------------------------------------------------------------------------
The                             DET                             DT
name                            NOUN                            NN
of                              ADP                             IN
your                            ADJ                             PRP$
medicine                        NOUN                            NN
is                              VERB                            VBZ
Paracetamol                     PROPN                           NNP
500                             NUM                             CD
mg                              ADJ                             JJ
Tablets                         NOUN                            NNS
(                               PUNCT                           -LRB-
called                          VERB                            VBN
paracetamol                     NO