This notebook was used to practice with the spaCy library in order to use it effectively in the final project of Data Processing.

Name: Julia Wervers
Student number: 13168665

Chapter 1: Finding words, phrases, names and concepts

In [None]:
# Install spaCy once per environment (uncomment the next line to run it):
# %pip install spacy

In [1]:
# Import spaCy
import spacy

# Create a blank English nlp object (no trained pipeline is loaded here;
# the trained "en_core_web_sm" pipeline is loaded in a later cell)
nlp = spacy.blank("en")

In [30]:
# Tokenize two sample texts with the nlp object and print every token on its
# own line. The short greeting is processed first and the longer test sentence
# last, so `doc` ends up holding the tokens of `my_text`.
my_text = 'I was soon introduced into the presence of the magistrate, an old benevolent man, with calm and mild manners.'

for sample_text in ("Hello world!", my_text):
    # A Doc is created by processing a string of text with the nlp object
    doc = nlp(sample_text)
    # Iterating over a Doc yields its tokens
    for token in doc:
        print(token.text)

Hello
world
!
I
was
soon
introduced
into
the
presence
of
the
magistrate
,
an
old
benevolent
man
,
with
calm
and
mild
manners
.


In [3]:
# Indexing practice: an integer index picks one token, a slice picks a span
my_token, my_span = doc[2], doc[11:15]

print(my_token.text)
print(my_span)

soon
an old benevolent man


In [4]:
# Try out three boolean token attributes: for each attribute, print its name
# followed by a list holding that attribute's value for every token in the doc
for attribute_name in ("is_alpha", "is_punct", "like_num"):
    print(f"{attribute_name}:", [getattr(token, attribute_name) for token in doc])

is_alpha: [True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, False, True, True, True, True, True, False]
is_punct: [False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, True]
like_num: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]


In [8]:
# Trained pipelines
# The model has to be downloaded once before it can be loaded; uncomment to do so:
#from spacy.cli import download
#download("en_core_web_sm")

my_text = 'I was soon introduced into the presence of the magistrate, an old benevolent man, with calm and mild manners.'

# From here on `nlp` is the small trained English pipeline instead of the blank one
nlp = spacy.load("en_core_web_sm")
doc = nlp(my_text)

# The predicted tags could help me filter out the words I'm interested in
for token in doc:
    # Token text and its predicted part-of-speech tag, separated by a space
    print(f"{token.text} {token.pos_}")

I PRON
was AUX
soon ADV
introduced VERB
into ADP
the DET
presence NOUN
of ADP
the DET
magistrate NOUN
, PUNCT
an DET
old ADJ
benevolent ADJ
man NOUN
, PUNCT
with ADP
calm ADJ
and CCONJ
mild ADJ
manners NOUN
. PUNCT


In [9]:
# Relations between words: for every token print its text, part-of-speech tag,
# dependency label and the text of its syntactic head, separated by spaces
for token in doc:
    columns = (token.text, token.pos_, token.dep_, token.head.text)
    print(" ".join(columns))

I PRON nsubjpass introduced
was AUX auxpass introduced
soon ADV advmod introduced
introduced VERB ROOT introduced
into ADP prep introduced
the DET det presence
presence NOUN pobj into
of ADP prep presence
the DET det magistrate
magistrate NOUN pobj of
, PUNCT punct magistrate
an DET det man
old ADJ amod man
benevolent ADJ amod man
man NOUN appos magistrate
, PUNCT punct presence
with ADP prep presence
calm ADJ amod manners
and CCONJ cc calm
mild ADJ conj calm
manners NOUN pobj with
. PUNCT punct introduced


In [16]:
# Named entities in the text -> probably not super useful for my project, but very cool!

# Print every predicted entity as "<entity text> <entity label>"
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

# As expected the loop prints nothing, as there aren't entities in the sample text that I am using.

# spacy.explain is very helpful once entities do show up: it describes a label.
# It is kept as the last expression of the cell so the notebook displays the result.
spacy.explain("MONEY")

'Monetary values, including unit'

In [22]:
# Rule-based matching: very interesting and cool, but not sure if it will be useful for the project
from spacy.matcher import Matcher

# One dict per token: the exact text "benevolent" directly followed by "man"
pattern = [{"TEXT": "benevolent"}, {"TEXT": "man"}]

# Initialize the matcher with the shared vocab and register the pattern
matcher = Matcher(nlp.vocab)
matcher.add("MAN_PATTERN", [pattern])

matches = matcher(doc)

# Each match unpacks into (match_id, start, end); doc[start:end] is the matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

benevolent man


In [33]:
# Might actually be helpful for filtering text: match every token that is not punctuation.

# Make sure to reinitialize the matcher: otherwise previously added patterns are
# applied as well, which here led to another mention of "benevolent man" in the
# output before the matcher was reinitialized.
matcher = Matcher(nlp.vocab)
pattern = [{"IS_PUNCT": False}]
matcher.add("FILTER_PATTERN", [pattern])

matches = matcher(doc)

# The pattern describes a single token, so each matched span is printed on its own line
for match_id, start, end in matches:
    print(doc[start:end].text)

# Doesn't seem to take "IS_DET": False and "IS_CCONJ": False as attributes though, which is a bummer.
# NOTE(review): the spaCy matcher documentation describes a "POS" attribute with set
# operators, e.g. {"POS": {"NOT_IN": ["DET", "CCONJ"]}} -- worth trying for this filter.

I
was
soon
introduced
into
the
presence
of
the
magistrate
an
old
benevolent
man
with
calm
and
mild
manners
