# NLP Pipeline with spaCy

### Loading Libraries

In [3]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Sys
import sys

# Warnings
import warnings

# Path
from pathlib import Path

# SpaCy
import spacy
from spacy import displacy
from textacy.extract import ngrams, entities

In [4]:
%matplotlib inline

In [5]:
warnings.filterwarnings('ignore')

### SpaCy Language Model Installation

#### English

In [12]:
# %%bash
# python -m spacy download en_core_web_sm

In [13]:
# %%bash
# python -m spacy download es_core_news_sm

#### Validating Installation

In [17]:
!{sys.executable} -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/opt/anaconda3/envs/MLAT/lib/python3.12/site-packages/spacy[0m

NAME              SPACY            VERSION                            
es_core_news_sm   >=3.8.0,<3.9.0   [38;5;2m3.8.0[0m   [38;5;2m✔[0m
en_core_web_sm    >=3.8.0,<3.9.0   [38;5;2m3.8.0[0m   [38;5;2m✔[0m



### Getting Data

In [18]:
DATA_DIR = Path('..', 'data')

### SpaCy in Action

#### Create & Explore The Language Object

In [23]:
# Older Version
# nlp = spacy.load('en') 

# Current One
nlp = spacy.load("en_core_web_sm")

# Spanish One
nlp_es = spacy.load("es_core_news_sm")

In [24]:
type(nlp)

spacy.lang.en.English

In [25]:
nlp.lang

'en'

In [28]:
spacy.info()

{'spacy_version': '3.8.5',
 'location': '/opt/anaconda3/envs/MLAT/lib/python3.12/site-packages/spacy',
 'platform': 'macOS-15.4.1-arm64-arm-64bit',
 'python_version': '3.12.9',
 'pipelines': {}}

In [30]:
# Oldest Version
# spacy.info('en')

spacy.info('en_core_web_sm')

{'lang': 'en',
 'name': 'core_web_sm',
 'version': '3.8.0',
 'description': 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
 'author': 'Explosion',
 'email': 'contact@explosion.ai',
 'url': 'https://explosion.ai',
 'license': 'MIT',
 'spacy_version': '>=3.8.0,<3.9.0',
 'spacy_git_version': '5010fcbd3',
 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None},
 'labels': {'tok2vec': [],
  'tagger': ['$',
   "''",
   ',',
   '-LRB-',
   '-RRB-',
   '.',
   ':',
   'ADD',
   'AFX',
   'CC',
   'CD',
   'DT',
   'EX',
   'FW',
   'HYPH',
   'IN',
   'JJ',
   'JJR',
   'JJS',
   'LS',
   'MD',
   'NFP',
   'NN',
   'NNP',
   'NNPS',
   'NNS',
   'PDT',
   'POS',
   'PRP',
   'PRP$',
   'RB',
   'RBR',
   'RBS',
   'RP',
   'SYM',
   'TO',
   'UH',
   'VB',
   'VBD',
   'VBG',
   'VBN',
   'VBP',
   'VBZ',
   'WDT',
   'WP',
   'WP$',
   'WRB',
   'XX',
   '_SP',
   '``'],
  'parser': ['ROOT',
   'acl',
   'acomp',


In [31]:
def get_attributes(f):
    print([a for a in dir(f) if not a.startswith('_')], end=' ')

In [32]:
get_attributes(nlp)

['Defaults', 'add_pipe', 'analyze_pipes', 'batch_size', 'begin_training', 'component', 'component_names', 'components', 'config', 'create_optimizer', 'create_pipe', 'create_pipe_from_source', 'default_config', 'default_error_handler', 'disable_pipe', 'disable_pipes', 'disabled', 'enable_pipe', 'evaluate', 'factories', 'factory', 'factory_names', 'from_bytes', 'from_config', 'from_disk', 'get_factory_meta', 'get_factory_name', 'get_pipe', 'get_pipe_config', 'get_pipe_meta', 'has_factory', 'has_pipe', 'initialize', 'lang', 'make_doc', 'max_length', 'memory_zone', 'meta', 'path', 'pipe', 'pipe_factories', 'pipe_labels', 'pipe_names', 'pipeline', 'rehearse', 'remove_pipe', 'rename_pipe', 'replace_listeners', 'replace_pipe', 'resume_training', 'select_pipes', 'set_error_handler', 'set_factory_meta', 'to_bytes', 'to_disk', 'tokenizer', 'update', 'use_params', 'vocab'] 

### Exploring The Pipeline

In [33]:
sample_text = 'Apple is looking at buying U.K. startup for $1 billion'

doc = nlp(sample_text)

In [34]:
get_attributes(doc)

['cats', 'char_span', 'copy', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_dict', 'from_disk', 'from_docs', 'from_json', 'get_extension', 'get_lca_matrix', 'has_annotation', 'has_extension', 'has_unknown_spaces', 'has_vector', 'is_nered', 'is_parsed', 'is_sentenced', 'is_tagged', 'lang', 'lang_', 'mem', 'noun_chunks', 'noun_chunks_iterator', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_ents', 'set_extension', 'similarity', 'spans', 'tensor', 'text', 'text_with_ws', 'to_array', 'to_bytes', 'to_dict', 'to_disk', 'to_json', 'to_utf8_array', 'user_data', 'user_hooks', 'user_span_hooks', 'user_token_hooks', 'vector', 'vector_norm', 'vocab'] 

In [35]:
doc.is_parsed

True

In [36]:
doc.is_sentenced

True

In [37]:
doc.is_tagged

True

In [38]:
doc.text

'Apple is looking at buying U.K. startup for $1 billion'

In [39]:
get_attributes(doc.vocab)

['add_flag', 'cfg', 'deduplicate_vectors', 'from_bytes', 'from_disk', 'get_noun_chunks', 'get_vector', 'has_vector', 'in_memory_zone', 'lang', 'length', 'lex_attr_getters', 'lookups', 'memory_zone', 'morphology', 'prune_vectors', 'reset_vectors', 'set_vector', 'strings', 'to_bytes', 'to_disk', 'vectors', 'vectors_length', 'writing_system'] 

In [40]:
doc.vocab.length

773

#### Exploring `Token` Annotations

In [41]:
pd.Series([token.text for token in doc])

0       Apple
1          is
2     looking
3          at
4      buying
5        U.K.
6     startup
7         for
8           $
9           1
10    billion
dtype: object

In [42]:
pd.DataFrame([[t.text, t.lemma_, t.pos_, t.tag_, t.dep_, t.shape_, t.is_alpha, t.is_stop]
              for t in doc],
             columns=['text', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop'])

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,nsubj,X.X.,False,False
6,startup,startup,VERB,VBD,ccomp,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


#### Visualizing POS Dependencies

In [43]:
options = {'compact': True, 'bg': 'white',
           'color': 'black', 'font': 'Source Sans Pro', 'notebook': True}

In [44]:
displacy.render(doc, style='dep', options=options)

#### Visualizing Named Entities

In [45]:
displacy.render(doc, style='ent', jupyter=True)

### Reading BBC Data

In [46]:
files = (DATA_DIR / 'bbc').glob('**/*.txt')

bbc_articles = []

for i, file in enumerate(sorted(list(files))):
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
        body = ' '.join([l.strip() for l in lines[1:]]).strip()
        bbc_articles.append(body)

In [47]:
len(bbc_articles)

0

In [51]:
# bbc_articles[0]

#### Parsing 1st Article through Pipeline

In [52]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [54]:
# doc = nlp(bbc_articles[0])
# type(doc)

### Sentence Boundary Detection

In [59]:
# sentences = [s for s in doc.sents]

# sentences[:3]

In [56]:
get_attributes(sentences[0])

['as_doc', 'char_span', 'conjuncts', 'doc', 'end', 'end_char', 'ent_id', 'ent_id_', 'ents', 'get_extension', 'get_lca_matrix', 'has_extension', 'has_vector', 'id', 'id_', 'kb_id', 'kb_id_', 'label', 'label_', 'lefts', 'lemma_', 'n_lefts', 'n_rights', 'noun_chunks', 'orth_', 'remove_extension', 'rights', 'root', 'sent', 'sentiment', 'sents', 'set_extension', 'similarity', 'start', 'start_char', 'subtree', 'tensor', 'text', 'text_with_ws', 'to_array', 'vector', 'vector_norm', 'vocab'] 

In [57]:
pd.DataFrame([[t.text, t.pos_, spacy.explain(t.pos_)] for t in sentences[0]], 
             columns=['Token', 'POS Tag', 'Meaning']).head(15)

Unnamed: 0,Token,POS Tag,Meaning
0,Apple,PROPN,proper noun
1,is,AUX,auxiliary
2,looking,VERB,verb
3,at,ADP,adposition
4,buying,VERB,verb
5,U.K.,PROPN,proper noun
6,startup,VERB,verb
7,for,ADP,adposition
8,$,SYM,symbol
9,1,NUM,numeral


In [60]:
options = {'compact': True, 'bg': '#09a3d5',
           'color': 'white', 'font': 'Source Sans Pro'}
displacy.render(sentences[0].as_doc(), style='dep', jupyter=True, options=options)

In [61]:
for t in sentences[0]:
    if t.ent_type_:
        print('{} | {} | {}'.format(t.text, t.ent_type_, spacy.explain(t.ent_type_)))

Apple | ORG | Companies, agencies, institutions, etc.
U.K. | GPE | Countries, cities, states
$ | MONEY | Monetary values, including unit
1 | MONEY | Monetary values, including unit
billion | MONEY | Monetary values, including unit


In [62]:
displacy.render(sentences[0].as_doc(), style='ent', jupyter=True)

### Named Entity-Recognition with `textacy`

In [64]:
entities = [e.text for e in entities(doc)]

pd.Series(entities).value_counts().head()

#### N-Grams with textacy

In [65]:
pd.Series([n.text for n in ngrams(doc, n=2, min_freq=2)]).value_counts()

#### The `spaCy` Streaming Pipeline API

In [67]:
iter_texts = (bbc_articles[i] for i in range(len(bbc_articles)))

for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=8)):
    if i % 100 == 0:
        print(i, end = ' ')
    assert doc.is_parsed

### Multi-language Features

#### Creating a Spanish Language Object

In [69]:
model = {}

for language in ['en_core_web_sm', 'es_core_news_sm']:
    model[language] = spacy.load(language) 

#### Reading Bilingual TED2013 Samples

In [71]:
text = {}

path = Path('data', 'TED')

for language in ['en_core_web_sm', 'es_core_news_sm']:
    file_name = path /  'TED2013_sample.{}'.format(language)
    text[language] = file_name.read_text()

#### Sentencing Boundaries English vs Spanish

In [73]:
parsed, sentences = {}, {}

for language in ['en_core_web_sm', 'es_core_news_sm']:
    parsed[language] = model[language](text[language])
    sentences[language] = list(parsed[language].sents)
    print('Sentences:', language, len(sentences[language]))

In [75]:
for i, (en, es) in enumerate(zip(sentences['en'], sentences['es']), 1):
    print('\n', i)
    print('English:\t', en)
    print('Spanish:\t', es)
    if i > 5: 
        break

#### POS Tagging English vs Spanish

In [76]:
pos = {}

for language in ['en', 'es']:
    pos[language] = pd.DataFrame([[t.text, t.pos_, spacy.explain(t.pos_)] for t in sentences[language][0]],
                                 columns=['Token', 'POS Tag', 'Meaning'])

In [77]:
bilingual_parsed = pd.concat([pos['en'], pos['es']], axis=1)

bilingual_parsed.head(15)