# spaCy: Language Processing Pipeline

In [1]:
import spacy

In [2]:
nlp = spacy.blank("en") # blank English language processing pipeline (no trained components)
# "en" selects English; the resulting nlp object only tokenizes text until components are added to it

In [3]:
# With a blank pipeline only the tokenizer runs: the text is split into tokens,
# but the POS and lemma columns print empty because no component fills them in.
doc = nlp("Mr. and Mrs. sharma are doctors.They bought dosa,idli,juice etc. for 5$, they just love eating.")
for token in doc:
    print(token, token.pos_, token.lemma_, sep="  |  ")

Mr.  |    |  
and  |    |  
Mrs.  |    |  
sharma  |    |  
are  |    |  
doctors  |    |  
.  |    |  
They  |    |  
bought  |    |  
dosa  |    |  
,  |    |  
idli  |    |  
,  |    |  
juice  |    |  
etc  |    |  
.  |    |  
for  |    |  
5  |    |  
$  |    |  
,  |    |  
they  |    |  
just  |    |  
love  |    |  
eating  |    |  
.  |    |  


In [4]:
nlp.pipe_names # empty list: a blank pipeline has no components

[]

# Using a Pre-trained Pipeline:

The spaCy documentation lists pre-trained pipelines for various languages, such as English, French, Chinese, and German.
Each language has its own install command; run the one for the language you need.

For English: `python -m spacy download en_core_web_sm` -- this command downloads and installs the pre-trained pipeline for the English language.

In [5]:
nlp = spacy.load('en_core_web_sm') # load the pre-trained English pipeline with all its components

In [6]:
nlp.pipe_names # names of all components in the loaded pipeline, in processing order

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
# Run the sentence through the trained pipeline and print, for every token:
#   token.pos_   -> its part-of-speech (POS) tag
#   token.lemma_ -> its base (dictionary) form
doc = nlp("Mr. and Mrs. sharma are doctors.They bought dosa,idli,juice etc. for 5$, they just love eating.")
for token in doc:
    print(token, token.pos_, token.lemma_, sep="  |  ")

Mr.  |  PROPN  |  Mr.
and  |  CCONJ  |  and
Mrs.  |  PROPN  |  Mrs.
sharma  |  NOUN  |  sharma
are  |  AUX  |  be
doctors  |  NOUN  |  doctor
.  |  PUNCT  |  .
They  |  PRON  |  they
bought  |  VERB  |  buy
dosa  |  PROPN  |  dosa
,  |  PUNCT  |  ,
idli  |  PROPN  |  idli
,  |  PUNCT  |  ,
juice  |  NOUN  |  juice
etc  |  X  |  etc
.  |  X  |  .
for  |  ADP  |  for
5  |  NUM  |  5
$  |  NUM  |  $
,  |  PUNCT  |  ,
they  |  PRON  |  they
just  |  ADV  |  just
love  |  VERB  |  love
eating  |  VERB  |  eat
.  |  PUNCT  |  .


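1. The tagger component assigns POS (part-of-speech) tags: proper noun, adverb, adjective, verb, punctuation, number, etc.
2. The lemmatizer gives the base form (lemma) of each word.
3. NER (Named Entity Recognition) identifies named entities such as people, monetary values, and places: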

In [9]:
# doc.ents holds the named entities found in the text; for each one print its text,
# its label, and spaCy's human-readable explanation of that label.
doc = nlp("Mr. and Mrs. sharma are doctors.They had dosa,idli,juice etc .for 5$ rupees.They visit london in summers.")
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

sharma  |  PERSON  |  People, including fictional
5$  |  MONEY  |  Monetary values, including unit
london  |  GPE  |  Countries, cities, states


# For better visualization:

In [10]:
# displacy is spaCy's visualizer; style="ent" renders the named entities of `doc` highlighted in the text
from spacy import displacy
displacy.render(doc,style = "ent")

# Using a Pipeline for the French Language:

nlp = spacy.load("fr_core_news_sm")

You need to install the processing pipeline for the French language by running this command in a terminal (e.g. Git Bash):

python -m spacy download fr_core_news_sm

# Customisation / Adding a component to a blank pipeline:

In [11]:
source_nlp = spacy.load("en_core_web_sm") # first load the trained English pipeline to use as the component source

In [12]:
nlp = spacy.blank("en") # then create a blank English pipeline
nlp.add_pipe("ner", source=source_nlp) # copy the trained 'ner' component from source_nlp into the blank pipeline as a custom component
nlp.pipe_names # shows the names of the components added to the pipeline

['ner']

In [14]:
# The pipeline now contains only the sourced 'ner' component, so entities are still recognised:
# print each entity's text, its label, and spaCy's explanation of that label.
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit
