# Linguistic Features

In [10]:
import spacy
import pandas as pd

nlp = spacy.load('en_core_web_sm')

text = "Apple is looking at buying U.K. startup for $1 billion"

doc = nlp(text)

data = []
for token in doc:
    data.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep, token.shape_, token.is_alpha, token.is_stop])
    
pd.DataFrame(data = data, columns = ['Text', 'Lemma', 'Part of Speech', 'Tag', 'Dependency', 'Shape', 'Is Alpha', 'Is Stop'])

Unnamed: 0,Text,Lemma,Part of Speech,Tag,Dependency,Shape,Is Alpha,Is Stop
0,Apple,Apple,PROPN,NNP,429,Xxxxx,True,False
1,is,be,AUX,VBZ,405,xx,True,True
2,looking,look,VERB,VBG,8206900633647566924,xxxx,True,False
3,at,at,ADP,IN,443,xx,True,True
4,buying,buy,VERB,VBG,438,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,416,X.X.,False,False
6,startup,startup,NOUN,NN,399,xxxx,True,False
7,for,for,ADP,IN,443,xxx,True,True
8,$,$,SYM,$,446,$,False,False
9,1,1,NUM,CD,7037928807040764755,d,False,False


In [16]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
displacy.render(doc, style = "dep")

In [20]:
import spacy

nlp = spacy.load('en_core_web_sm')

print('Pipelines:', nlp.pipe_names)
doc = nlp('I was reading the papers.')
token = doc[0]
print(token.morph)

Pipelines: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
Case=Nom|Number=Sing|Person=1|PronType=Prs


In [25]:
import spacy

nlp = spacy.load('en_core_web_sm')

lemmatizer = nlp.get_pipe('lemmatizer')

print(lemmatizer.mode)

doc = nlp('I was reading a paper.')
print([token.lemma_ for token in doc])

rule
['I', 'be', 'read', 'a', 'paper', '.']


In [32]:
import spacy

nlp = spacy.blank('sv')

nlp.add_pipe('lemmatizer', config = {'mode': 'lookup'})

<spacy.pipeline.lemmatizer.Lemmatizer at 0x15cb3d02600>

In [38]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text, sep = '\t\t')

Autonomous cars		cars		nsubj		shift
insurance liability		liability		dobj		shift
manufacturers		manufacturers		pobj		toward


In [41]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children], sep = '\t')

Autonomous	amod	cars	NOUN	[]
cars	nsubj	shift	VERB	[Autonomous]
shift	ROOT	shift	VERB	[cars, liability, toward]
insurance	compound	liability	NOUN	[]
liability	dobj	shift	VERB	[insurance]
toward	prep	shift	VERB	[manufacturers]
manufacturers	pobj	toward	ADP	[]


In [45]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

{shift}


In [48]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Finding a verb with a subject from above — less good
verbs = []
for possible_verb in doc:
    if possible_verb.pos == VERB:
        for possible_subject in possible_verb.children:
            if possible_subject.dep == nsubj:
                verbs.append(possible_verb)
                break
print(verbs)

[shift]


In [51]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("bright red apples on the tree")
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

['bright', 'red']
['on']
2
1


In [61]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Credit and mortgage account holders must submit their requests")

roots = [token for token in doc if token.head == token]
print('roots:', roots)
root = roots[0]
print('root:', root)

subject = list(root.lefts)[0]
for descendant in subject.subtree:
#     assert subject is descendant or subject.is_ancestor(descendant)
    print(descendant.text, descendant.dep_, descendant.n_lefts,
            descendant.n_rights,
            [ancestor.text for ancestor in descendant.ancestors], sep ='\t')

roots: [submit]
root: submit
Credit	nmod	0	2	['holders', 'submit']
and	cc	0	0	['Credit', 'holders', 'submit']
mortgage	compound	0	0	['account', 'Credit', 'holders', 'submit']
account	conj	1	0	['Credit', 'holders', 'submit']
holders	nsubj	1	0	['submit']


In [65]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("Credit and mortgage account holders must submit their requests")
span = doc[doc[4].left_edge.i : doc[4].right_edge.i + 1]

with doc.retokenize() as retokenizer:
    retokenizer.merge(span)

for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

Credit and mortgage account holders NOUN nsubj submit
must AUX aux submit
submit VERB ROOT submit
their PRON poss requests
requests NOUN dobj submit


In [76]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')

terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]

matcher = PhraseMatcher(nlp.vocab, attr = 'LOWER')
matcher.add("TerminologyList", patterns)

text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.") 
matches = matcher(text_doc)
print(matches)
for match in matches:
    print(text_doc[match[1]:match[2]], end = ', ')

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]
iPhone 11, Galaxy Note, iPhone XS, Google Pixel, 

In [81]:
import spacy

nlp = spacy.blank('en') # Blank model


nlp.add_pipe('textcat',
             config = {
                    "exclusive_classes": True,
                    "architecture": "bow"
            }
)

ConfigValidationError: 

Config validation error

textcat -> architecture        extra fields not permitted
textcat -> exclusive_classes   extra fields not permitted

{'nlp': <spacy.lang.en.English object at 0x0000015CB66725E0>, 'name': 'textcat', 'architecture': 'bow', 'exclusive_classes': True, 'model': {'@architectures': 'spacy.TextCatEnsemble.v2', 'linear_model': {'@architectures': 'spacy.TextCatBOW.v1', 'exclusive_classes': True, 'ngram_size': 1, 'no_output_layer': False}, 'tok2vec': {'@architectures': 'spacy.Tok2Vec.v2', 'embed': {'@architectures': 'spacy.MultiHashEmbed.v1', 'width': 64, 'rows': [2000, 2000, 1000, 1000, 1000, 1000], 'attrs': ['ORTH', 'LOWER', 'PREFIX', 'SUFFIX', 'SHAPE', 'ID'], 'include_static_vectors': False}, 'encode': {'@architectures': 'spacy.MaxoutWindowEncoder.v2', 'width': 64, 'window_size': 1, 'maxout_pieces': 3, 'depth': 2}}}, 'threshold': 0.5, '@factories': 'textcat'}