In [1]:
import spacy

In [2]:
# Load the "en_core_web_lg" pipeline once; the helper functions in this notebook reuse this `nlp` object.
nlp = spacy.load("en_core_web_lg")

In [18]:
# Sample inputs for the tagging and parsing demos; "Apple" appears in both sentences.
text_1 = "Apple isn't trying to buy U.K. startup company in $1 billion"
text_2 = "She eat Apple"

In [63]:
def print_doc(text):
    """Print one line per token of `text`: text, lemma, tag, POS and stop-word flag.

    The first four fields are left-aligned in 10-character columns separated by
    a single space (longer values are not truncated); the stop-word flag is
    appended unpadded.

    Args:
        text: String passed to the module-level `nlp` pipeline.

    Returns:
        None. The table is written to stdout.
    """
    column_width = 10
    for tok in nlp(text):
        fields = (tok.text, tok.lemma_, tok.tag_, tok.pos_)
        padded = " ".join(f"{value:{column_width}}" for value in fields)
        print(f"{padded} {tok.is_stop}")

In [42]:
print_doc(text_1)

Apple      Apple      NNP        PROPN      False
is         be         VBZ        AUX        True
n't        not        RB         PART       True
trying     try        VBG        VERB       False
to         to         TO         PART       True
buy        buy        VB         VERB       False
U.K.       U.K.       NNP        PROPN      False
startup    startup    NN         NOUN       False
company    company    NN         NOUN       False
in         in         IN         ADP        True
$          $          $          SYM        False
1          1          CD         NUM        False
billion    billion    CD         NUM        False


In [43]:
print_doc(text_2)

She        -PRON-     PRP        PRON       True
eat        eat        VBP        VERB       False
Apple      Apple      NNP        PROPN      False


In [44]:
# In the examples below, "looks" gets a different POS tag depending on context (VERB vs. NOUN).

In [45]:
print_doc("she looks like an angel")

she        -PRON-     PRP        PRON       True
looks      look       VBZ        VERB       False
like       like       IN         SCONJ      False
an         an         DT         DET        True
angel      angel      NN         NOUN       False


In [48]:
print_doc("her looks are very bad")

her        -PRON-     PRP$       DET        True
looks      look       NNS        NOUN       False
are        be         VBP        AUX        True
very       very       RB         ADV        True
bad        bad        JJ         ADJ        False


**Dependency parsing**
> https://spacy.io/api/annotation

In [64]:
def print_dp(text):
    """Print each noun chunk of `text` with its root token and the root's dependency label.

    Each line holds the chunk text (left-aligned, width 30), the root token's
    text (left-aligned, width 10) and the root's `dep_` value, separated by
    single spaces.

    Args:
        text: String passed to the module-level `nlp` pipeline.

    Returns:
        None. The table is written to stdout.
    """
    for noun_chunk in nlp(text).noun_chunks:
        head = noun_chunk.root
        line = "{:30} {:10} {}".format(noun_chunk.text, head.text, head.dep_)
        print(line)

In [51]:
print_dp(text_1)

Apple                          Apple      nsubj
U.K. startup company           company    dobj


### Sentence Segmentation

In [53]:
def print_sents(text):
    """Print every sentence the pipeline finds in `text`, one per line.

    Args:
        text: String passed to the module-level `nlp` pipeline.

    Returns:
        None. Sentences are written to stdout.
    """
    for sentence in nlp(text).sents:
        print(sentence)

In [55]:
print_sents("Hi, My name is Himanshu Teotia. I am a programmer. Currently, I am learning NLP.")

Hi, My name is Himanshu Teotia.
I am a programmer.
Currently, I am learning NLP.


In [56]:
# Out of the box, this splitting only works on these end-of-sentence marks: ! . ?

In [70]:
# add custom rule to segment the sentence

def sent_rule(doc):
    """Flag the token that follows each ".*." token as the start of a sentence.

    The last token is never inspected, because no token follows it.

    NOTE(review): only a token whose text is exactly ".*." matches. The sample
    output in this notebook shows un-spaced input tokenised as e.g.
    "Himanshu.*.how", which this comparison does not catch.

    Args:
        doc: The document to annotate; its tokens expose `.text` and `.i`.

    Returns:
        The same `doc` object, modified in place.
    """
    delimiter = ".*."
    # Collect the positions first, then mark the token after each one.
    hits = [tok.i for tok in doc[:-1] if tok.text == delimiter]
    for position in hits:
        doc[position + 1].is_sent_start = True
    return doc

In [73]:
# Register the custom sentence rule so that it runs before the "parser" component.
# NOTE(review): passing a bare function to add_pipe looks like the spaCy 2.x API — confirm the installed version.
nlp.add_pipe(sent_rule,before="parser")

In [66]:
# Sample input that uses ".*." as a custom separator, with no surrounding whitespace.
text_3 = "Hi, My name is Himanshu.*.how are you.*.I am doing great"

In [67]:
print_sents(text_3)

Hi, My name is Himanshu.*.how are
you.*.I am doing great


In [68]:
print_doc(text_3)

Hi         hi         UH         INTJ       False
,          ,          ,          PUNCT      False
My         -PRON-     PRP$       DET        True
name       name       NN         NOUN       True
is         be         VBZ        AUX        True
Himanshu.*.how Himanshu.*.how NNP        PROPN      False
are        be         VBP        AUX        True
you.*.I    you.*.i    PRP        PRON       False
am         be         VBP        AUX        True
doing      do         VBG        VERB       True
great      great      JJ         ADJ        False


In [72]:
# To add or update a pipe under a name that is already registered,
# the previously added pipe has to be removed first.

nlp.remove_pipe('sent_rule')

('sent_rule', <function __main__.sent_rule(doc)>)

### Visualization

In [75]:
from spacy import displacy

In [77]:
# Run text_3 through the pipeline and render the result with displaCy's "dep" style.
doc = nlp(text_3)
displacy.render(doc,style="dep")