# spaCy start

In [1]:
# import spacy library after 
import spacy

txt = "The tallest living man is 37-year-old Sultan Kosen, from Turkey, who is 8 feet, 2.8 inches, who set the record in 2009."

# Create the Language object
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x181173e2250>

In [2]:
# create the Doc object
# filled with extra information about the given text's sentence nad words
# is an iterator
doc = nlp(txt)
doc

The tallest living man is 37-year-old Sultan Kosen, from Turkey, who is 8 feet, 2.8 inches, who set the record in 2009.

In [3]:
for token in doc[:5]:
    print(token)

The
tallest
living
man
is


In [4]:
# able to slice or index notations to extract individual tokens
# tokenization == splitting sentences into words and punctuation
# single token can be a word, punctuation or noun chunk
print(type(token))
len(doc)

<class 'spacy.tokens.token.Token'>


31

In [5]:
# extracting >= 1 token = span object
span = doc[:5]
print(type(span))
# .text to get words in span
span.text
# doc and span are views of an object 
# meaning they are different ways of representing the same text data from
# different perspectives

<class 'spacy.tokens.span.Span'>


'The tallest living man is'

In [6]:
# pre-trained English pipeline have language-specific rules for tokenization
# and extracting their lexical attributes
print("Index:    ", [token.i for token in doc[3:10]])
print("Text:     ", [token.text for token in doc[3:10]])
print("is_alpha: ", [token.is_alpha for token in doc[3:10]])
print("is_punct: ", [token.is_punct for token in doc[3:10]])
# recognize both literal and lettered numbers
print("like_num: ", [token.like_num for token in doc[3:10]])
# get base word stripped from any suffixes, prefixes, tense or grammatical attributes
print("Base word:", [token.lemma_ for token in doc[3:10]])

Index:     [3, 4, 5, 6, 7, 8, 9]
Text:      ['man', 'is', '37', '-', 'year', '-', 'old']
is_alpha:  [True, True, False, False, True, False, True]
is_punct:  [False, False, False, True, False, True, False]
like_num:  [False, False, True, False, False, False, False]
Base word: ['man', 'be', '37', '-', 'year', '-', 'old']


# Architecture and core data structures

In [7]:
nlp = spacy.load("en_core_web_md")
type(nlp)

spacy.lang.en.English

In [8]:
# other languages
# nlp = spacy.load("es_core_news_sm")  # Spanish
# nlp = spacy.load("ru_core_news_sm")  # Russian
# nlp = spacy.load("zh_core_web_sm")  # Chinese
# nlp = spacy.load("de_core_news_sm")  # German

In [9]:
# loading language models from lang sub-module
from spacy.lang.en import English

nlp = English()
type(nlp)

spacy.lang.en.English

In [10]:
txt = """The original name for the search engine Google was Backrub. 
         It was renamed Google after the googol, 
         which is the number one followed by 100 zeros."""
doc = nlp(txt)
doc

The original name for the search engine Google was Backrub. 
         It was renamed Google after the googol, 
         which is the number one followed by 100 zeros.

In [11]:
# after processing text, words and punctuations are stored in the vocab object
# of nlp
len(nlp.vocab)

785

In [12]:
# the vocab is shared between documents for the nlp object
# doc object's vocab only contains words from txt
len(doc.vocab)

785

In [13]:
# communicates with hashses and has a 2-way lookup table called StringStore
type(nlp.vocab.strings)

spacy.strings.StringStore

In [14]:
nlp.vocab.strings['google']

1988622737398120358

In [15]:
nlp.vocab.strings[1988622737398120358]

'google'

In [16]:
# when tokens go into the vocab, these lose all context-specific information
# you are actually looking at lexemes when you see words from vocab
lexeme = nlp.vocab['google']
type(lexeme)

spacy.lexeme.Lexeme

In [17]:
# lexemes don't contain context-specific info like part-of-speech tags,
# morphological dependencies, etc
# may still offer many lexical attributes of a word
print(lexeme.text, lexeme.orth, lexeme.is_digit)

google 1988622737398120358 False


In [18]:
# orth refers to the hash of the lexeme
# looking at word in doc object, it is a token
# looking at word in vocab object, it is a lexeme
txt = """Mosquitoes are the deadliest animal in the world: 
         They kill more people than any other creature, 
         due to the diseases they carry."""

doc = nlp(txt)

print(type(doc))
print(len(doc))

<class 'spacy.tokens.doc.Doc'>
27


In [19]:
# can manually create docs by importing doc class
from spacy.tokens import Doc
nlp = English()

words = ['I','love','Habitica','!']
spaces = [True,True, False,False]

#creating the doc manually
doc = Doc(nlp.vocab, words=words,spaces=spaces)
doc.text

'I love Habitica!'

In [20]:
# spans are also a class of their own
txt = """The hardest working muscle in your body is your heart: 
         It pumps more than 2,000 gallons of blood a day
         and beats more than 2.5 billion times in a 70-year life span."""

doc = nlp(txt)
span = doc[:10]
type(span)

spacy.tokens.span.Span

In [21]:
print(span.text)
print(span.start, span.end)

The hardest working muscle in your body is your heart
0 10


In [22]:
# creating span manually
from spacy.tokens import Span

span = Span(doc, 0 ,10)
span.text

'The hardest working muscle in your body is your heart'

# Named Entity Recognition (NER)

In [23]:
txt = """Cleopatra wasn't actually Egyptian! 
         As far as historians can tell, Egypt's 
         famous femme fatal was actually Greek!. 
         She was a descendant of Alexander the Great's
         Macedonian general Ptolemy"""

nlp = spacy.load("en_core_web_lg")
doc = nlp(txt)
for ent in doc.ents:
    print(f"{ent.text:<20}{ent.label_:<20}")

Cleopatra           PERSON              
Egyptian            NORP                
Egypt               GPE                 
Greek               NORP                
Alexander the Great'sORG                 
Macedonian          NORP                
Ptolemy             NORP                


In [24]:
print(spacy.explain('ORG'))
print(spacy.explain('GPE'))
print(spacy.explain('PERSON'))
spacy.explain('NORP')

Companies, agencies, institutions, etc.
Countries, cities, states
People, including fictional


'Nationalities or religious or political groups'

In [25]:
from spacy.tokens import Span

alexander = Span(doc, 31, 34, label="PERSON")
alexander

Alexander the Great

In [26]:
# Leaves the rest of ents untouched
doc.set_ents([alexander], default="unmodified")

In [27]:
for ent in doc.ents:
    print(f"{ent.text:<20}{ent.label_:<20}")

Cleopatra           PERSON              
Egyptian            NORP                
Egypt               GPE                 
Greek               NORP                
Alexander the Great PERSON              
's                  ORG                 
Macedonian          NORP                
Ptolemy             NORP                


In [28]:
import spacy
from spacy import displacy


options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
displacy.render(doc, style="dep", options=options)

In [29]:
# visualising by sentence
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep")

In [30]:
# visualizing the entity recognizer
doc.user_data["title"] = "Cleopatra lolsies"
displacy.render(doc, style="ent")

In [31]:
# specifiying a list of ents shows only those ents during visualization
doc.user_data["title"] = "Cleopatra lolsies"
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["NORP"], "colors": colors}
displacy.render(doc, style="ent", options=options)

In [32]:
txt3 = 'faez is a budding customer manager'
doc2 = nlp(txt3)
displacy.render(doc2, style="ent")

# POS tags and syntactic dependencies

In [33]:
txt = "The first footprints on the moon will remain there for a million years"

doc = nlp(txt)
# setting headers
print(
    f"{'Text':<20} {'Part-of-speech':<20} "
    f"{'Dependency':<20} {'Dependency text':<20}\n"
)
# printing actual dependencies
for token in doc:
    print(f"{token.text:<20} {token.pos_:<20} "
    f"{token.dep_:<20} {token.head.text:<20}\n")

Text                 Part-of-speech       Dependency           Dependency text     

The                  DET                  det                  footprints          

first                ADJ                  amod                 footprints          

footprints           NOUN                 nsubj                remain              

on                   ADP                  prep                 footprints          

the                  DET                  det                  moon                

moon                 NOUN                 pobj                 on                  

will                 AUX                  aux                  remain              

remain               VERB                 ROOT                 remain              

there                ADV                  advmod               remain              

for                  ADP                  prep                 remain              

a                    DET                  det                  ye

In [34]:
# understanding tag meanings
pos_tags = ["DET", "AUX", "ADP",'ADV']
dep_tags = ["amod", "nsubj", "nummod",'det','prep','pobj','aux','advmod']

for pos in pos_tags:
    print(pos, "-->", spacy.explain(pos))

for dep in dep_tags:
    print(dep, "-->", spacy.explain(dep))

DET --> determiner
AUX --> auxiliary
ADP --> adposition
ADV --> adverb
amod --> adjectival modifier
nsubj --> nominal subject
nummod --> numeric modifier
det --> determiner
prep --> prepositional modifier
pobj --> object of preposition
aux --> auxiliary
advmod --> adverbial modifier


In [35]:
# https://spacy.io/usage/linguistic-features
# extracting noun chunks

txt = """The teddy bear is named after President Theodore Roosevelt. 
         After he refused to shoot a captured black bear on a hunt, 
         a stuffed-animal maker decided to create
         a bear and name it after the president."""

doc = nlp(txt)

for chunk in doc.noun_chunks:
    print(chunk.text)

The teddy bear
President Theodore Roosevelt
he
a captured black bear
a hunt
a stuffed-animal maker
a bear
it
the president


# Custom rule-based tokenization

In [36]:
txt = """Cleopatra wasn't actually Egyptian! 
         As far as historians can tell, Egypt's 
         famous femme fatal was actually Greek!. 
         She was a descendant of Alexander the Great's
         Macedonian general Ptolemy and also, Shaun the Sheep"""

# Create a patten
pattern = [
    {"IS_ALPHA": True, "IS_TITLE": True},
    {"IS_STOP": True},
    {"IS_ALPHA": True, "IS_TITLE": True},
]

from spacy.matcher import Matcher

# Init the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
matcher.add("TITLED_PERSON", [pattern])

In [37]:
# process the text
doc = nlp(txt)

# finding all matches
matches = matcher(doc)

# iterate over matches
for match_id, start, end in matches:
    # Get span
    span = doc[start:end]
    print(span.text)

Alexander the Great
Shaun the Sheep


In [38]:
matches

[(8382984582166938613, 31, 34), (8382984582166938613, 42, 45)]

# Word vectors and semantic similarity

In [39]:
# spaCy is able to calculate semantic similarity using word vectors
nlp = spacy.load("en_core_web_md")

doc1 = nlp("What a lukeworm sentiment.")
doc2 = nlp("What a short sentence.")
doc1.similarity(doc2)

0.9331312038844666

In [40]:
array = doc1[0].vector
array.shape

(300,)

In [41]:
array[:10]

array([-3.0935,  1.3209, -3.9328, -2.3205, -4.4307, -2.8526,  2.1242,
        1.2495, -5.3112,  1.0106], dtype=float32)

# other ways of calculating

In [42]:
import gensim

# Define a list of sentences to train the Word2Vec model
sentences = [["I", "love", "chocolate"], 
             ["I", "hate", "spinach"], 
             ["I", "like", "ice", "cream"], 
             ["I", "dislike", "onions"]]

# Train the Word2Vec model with the sentences
model = gensim.models.Word2Vec(sentences, min_count=1)

# Get the word vector for a specific word
word_vector = model.wv["chocolate"]

print(word_vector)


[-9.5785474e-03  8.9431144e-03  4.1650678e-03  9.2347339e-03
  6.6435025e-03  2.9247357e-03  9.8040197e-03 -4.4246409e-03
 -6.8033123e-03  4.2273807e-03  3.7290000e-03 -5.6646108e-03
  9.7047593e-03 -3.5583067e-03  9.5494054e-03  8.3472492e-04
 -6.3384580e-03 -1.9771170e-03 -7.3770545e-03 -2.9795242e-03
  1.0416961e-03  9.4826864e-03  9.3558477e-03 -6.5958784e-03
  3.4751510e-03  2.2755694e-03 -2.4893521e-03 -9.2291739e-03
  1.0271263e-03 -8.1657078e-03  6.3201878e-03 -5.8000805e-03
  5.5354382e-03  9.8337224e-03 -1.6000033e-04  4.5284913e-03
 -1.8094016e-03  7.3607611e-03  3.9400961e-03 -9.0103243e-03
 -2.3985051e-03  3.6287690e-03 -9.9568366e-05 -1.2012720e-03
 -1.0554385e-03 -1.6716027e-03  6.0495140e-04  4.1650939e-03
 -4.2527914e-03 -3.8336229e-03 -5.2816868e-05  2.6935578e-04
 -1.6880751e-04 -4.7855065e-03  4.3134023e-03 -2.1719194e-03
  2.1035385e-03  6.6652300e-04  5.9696771e-03 -6.8423818e-03
 -6.8157101e-03 -4.4762585e-03  9.4358278e-03 -1.5918827e-03
 -9.4292425e-03 -5.45041

In [43]:
# word similarity
similarity = model.wv.similarity('chocolate','ice')
print(similarity)

-0.069003314


# Custom pipelines

In [44]:
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")

@Language.component("titled_person")
def titled_person(doc):
    pattern = [
        {"IS_ALPHA": True, "IS_TITLE": True},
        {"IS_STOP": True},
        {"IS_ALPHA": True, "IS_TITLE": True},
    ]
    # Create the matcher
    matcher = Matcher(nlp.vocab)
    # Add the pattern
    matcher.add("TITLED_PERSON", [pattern])

    matches = matcher(doc)
    matched_spans = [Span(doc, start, end, label="PERSON") for _, start, end in matches]

    # Filter the entities for potential overlap
    filtered_matches = filter_spans(list(doc.ents) + matched_spans)
    # Add the matched spans to doc's entities
    doc.ents = filtered_matches

    return doc

nlp.add_pipe("titled_person")

<function __main__.titled_person(doc)>

In [45]:
from spacy.tokens import Span
txt = """Cleopatra wasn't actually Egyptian! 
         As far as historians can tell, Egypt's 
         famous femme fatal was actually Greek!. 
         She was a descendant of Alexander the Great's
         Macedonian general Ptolemy"""

doc = nlp(txt)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'titled_person']

In [46]:
doc.ents

(Cleopatra, Egyptian, Egypt, Greek, Alexander the Great's, Ptolemy)

In [47]:
# # other ways of adding to pipeline
# nlp.add_pipe("titled_person", first=True)  # Beginning
# nlp.add_pipe("titled_person", after="parser")  # After parser
# nlp.add_pipe("titled_person", before="tagger") # Before POS tagger

In [48]:
string = 'I want to eat food, I am hungry.'
string.split()

['I', 'want', 'to', 'eat', 'food,', 'I', 'am', 'hungry.']

# spaCy practice

In [49]:
import spacy
from spacy.lang.en import English
from spacy.tokens import Span
from spacy import displacy
import pandas as pd

nlp = English()
df = pd.read_csv('arxiv_data.csv')
df['orig_titles'] = df['titles']
df['orig_summaries'] = df['summaries']
df

Unnamed: 0,titles,summaries,terms,orig_titles,orig_summaries
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']",Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']",FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']",Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons..."
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV'],Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']",Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor..."
...,...,...,...,...,...
51769,Hierarchically-coupled hidden Markov models fo...,We address the problem of analyzing sets of no...,"['stat.ML', 'physics.bio-ph', 'q-bio.QM']",Hierarchically-coupled hidden Markov models fo...,We address the problem of analyzing sets of no...
51770,Blinking Molecule Tracking,We discuss a method for tracking individual mo...,"['cs.CV', 'cs.DM']",Blinking Molecule Tracking,We discuss a method for tracking individual mo...
51771,Towards a Mathematical Foundation of Immunolog...,We attempt to set a mathematical foundation of...,"['stat.ML', 'cs.LG', 'q-bio.GN']",Towards a Mathematical Foundation of Immunolog...,We attempt to set a mathematical foundation of...
51772,A Semi-Automatic Graph-Based Approach for Dete...,Diffusion Tensor Imaging (DTI) allows estimati...,['cs.CV'],A Semi-Automatic Graph-Based Approach for Dete...,Diffusion Tensor Imaging (DTI) allows estimati...


In [50]:
# clean the data
import nltk
import re
from nltk.corpus import stopwords

# lowercase everything
df.summaries = df.summaries.apply(str.lower)
df.titles = df.titles.apply(str.lower)

# remove stop words
def remove_stop(word):
    removed = " ".join([i for i in word.split(" ") if i not in stopword])
    return removed

stopword = stopwords.words('english')
df.summaries = df.summaries.apply(remove_stop)
df.titles = df.titles.apply(remove_stop)


# lemmatize the values
def lemmatize_text(text):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

def stem_porter(text):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    ps = nltk.PorterStemmer()
    return [ps.stem(w) for w in w_tokenizer.tokenize(text)]


# nltk.download()
df.summaries = df.summaries.apply(stem_porter)
df.titles = df.titles.apply(stem_porter)
df

Unnamed: 0,titles,summaries,terms,orig_titles,orig_summaries
0,"[survey, semant, stereo, match, /, semant, dep...","[stereo, match, one, wide, use, techniqu, infe...","['cs.CV', 'cs.LG']",Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...
1,"[future-ai:, guid, principl, consensu, recomme...","[recent, advanc, artifici, intellig, (ai), com...","['cs.CV', 'cs.AI', 'cs.LG']",FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...
2,"[enforc, mutual, consist, hard, region, semi-s...","[paper,, propos, novel, mutual, consist, netwo...","['cs.CV', 'cs.AI']",Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons..."
3,"[paramet, decoupl, strategi, semi-supervis, 3d...","[consist, train, proven, advanc, semi-supervis...",['cs.CV'],Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...
4,"[background-foreground, segment, interior, sen...","[ensur, safeti, autom, driving,, correct, perc...","['cs.CV', 'cs.LG']",Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor..."
...,...,...,...,...,...
51769,"[hierarchically-coupl, hidden, markov, model, ...","[address, problem, analyz, set, noisi, time-va...","['stat.ML', 'physics.bio-ph', 'q-bio.QM']",Hierarchically-coupled hidden Markov models fo...,We address the problem of analyzing sets of no...
51770,"[blink, molecul, track]","[discuss, method, track, individu, molecul, gl...","['cs.CV', 'cs.DM']",Blinking Molecule Tracking,We discuss a method for tracking individual mo...
51771,"[toward, mathemat, foundat, immunolog, amino, ...","[attempt, set, mathemat, foundat, immunolog, a...","['stat.ML', 'cs.LG', 'q-bio.GN']",Towards a Mathematical Foundation of Immunolog...,We attempt to set a mathematical foundation of...
51772,"[semi-automat, graph-bas, approach, determin, ...","[diffus, tensor, imag, (dti), allow, estim, po...",['cs.CV'],A Semi-Automatic Graph-Based Approach for Dete...,Diffusion Tensor Imaging (DTI) allows estimati...


In [51]:
df

Unnamed: 0,titles,summaries,terms,orig_titles,orig_summaries
0,"[survey, semant, stereo, match, /, semant, dep...","[stereo, match, one, wide, use, techniqu, infe...","['cs.CV', 'cs.LG']",Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...
1,"[future-ai:, guid, principl, consensu, recomme...","[recent, advanc, artifici, intellig, (ai), com...","['cs.CV', 'cs.AI', 'cs.LG']",FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...
2,"[enforc, mutual, consist, hard, region, semi-s...","[paper,, propos, novel, mutual, consist, netwo...","['cs.CV', 'cs.AI']",Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons..."
3,"[paramet, decoupl, strategi, semi-supervis, 3d...","[consist, train, proven, advanc, semi-supervis...",['cs.CV'],Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...
4,"[background-foreground, segment, interior, sen...","[ensur, safeti, autom, driving,, correct, perc...","['cs.CV', 'cs.LG']",Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor..."
...,...,...,...,...,...
51769,"[hierarchically-coupl, hidden, markov, model, ...","[address, problem, analyz, set, noisi, time-va...","['stat.ML', 'physics.bio-ph', 'q-bio.QM']",Hierarchically-coupled hidden Markov models fo...,We address the problem of analyzing sets of no...
51770,"[blink, molecul, track]","[discuss, method, track, individu, molecul, gl...","['cs.CV', 'cs.DM']",Blinking Molecule Tracking,We discuss a method for tracking individual mo...
51771,"[toward, mathemat, foundat, immunolog, amino, ...","[attempt, set, mathemat, foundat, immunolog, a...","['stat.ML', 'cs.LG', 'q-bio.GN']",Towards a Mathematical Foundation of Immunolog...,We attempt to set a mathematical foundation of...
51772,"[semi-automat, graph-bas, approach, determin, ...","[diffus, tensor, imag, (dti), allow, estim, po...",['cs.CV'],A Semi-Automatic Graph-Based Approach for Dete...,Diffusion Tensor Imaging (DTI) allows estimati...


In [52]:
txt = df.iloc[0]['orig_summaries']
print(txt)
doc = nlp(txt)

for ent in doc.ents:
    print(f"{ent.text:<20}{ent.label_:<20}")
# visualizing the entity recognizer
# doc.user_data["title"] = "testing NER"
# displacy.render(doc, style="ent")

Stereo matching is one of the widely used techniques for inferring depth from
stereo images owing to its robustness and speed. It has become one of the major
topics of research since it finds its applications in autonomous driving,
robotic navigation, 3D reconstruction, and many other fields. Finding pixel
correspondences in non-textured, occluded and reflective areas is the major
challenge in stereo matching. Recent developments have shown that semantic cues
from image segmentation can be used to improve the results of stereo matching.
Many deep neural network architectures have been proposed to leverage the
advantages of semantic segmentation in stereo matching. This paper aims to give
a comparison among the state of art networks both in terms of accuracy and in
terms of speed which are of higher importance in real-time applications.
