In [None]:
# ! pip install spacy

In [None]:
# ! python -m spacy download en_core_web_sm

## spaCy's part-of-speech and dependency tags

The part-of-speech tags (the values of `token.pos_`) are:

* ADJ: adjective
* ADP: adposition
* ADV: adverb
* AUX: auxiliary verb
* CONJ: coordinating conjunction
* DET: determiner
* INTJ: interjection
* NOUN: noun
* NUM: numeral
* PART: particle
* PRON: pronoun
* PROPN: proper noun
* PUNCT: punctuation
* SCONJ: subordinating conjunction
* SYM: symbol
* VERB: verb
* X: other

https://spacy.io/api/annotation

The list of other attributes for tokens can be found at https://spacy.io/api/token

## Dependency Tags

spaCy's dependency tag scheme is based on the ClearNLP project. The meanings of the tags are documented at https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md and are summarized below:

* ACL: Clausal modifier of noun
* ACOMP: Adjectival complement
* ADVCL: Adverbial clause modifier
* ADVMOD: Adverbial modifier
* AGENT: Agent
* AMOD: Adjectival modifier
* APPOS: Appositional modifier
* ATTR: Attribute
* AUX: Auxiliary
* AUXPASS: Auxiliary (passive)
* CASE: Case marker
* CC: Coordinating conjunction
* CCOMP: Clausal complement
* COMPOUND: Compound modifier
* CONJ: Conjunct
* CSUBJ: Clausal subject
* CSUBJPASS: Clausal subject (passive)
* DATIVE: Dative
* DEP: Unclassified dependent
* DET: Determiner
* DOBJ: Direct object
* EXPL: Expletive
* INTJ: Interjection
* MARK: Marker
* META: Meta modifier
* NEG: Negation modifier
* NOUNMOD: Modifier of nominal
* NPMOD: Noun phrase as adverbial modifier
* NSUBJ: Nominal subject
* NSUBJPASS: Nominal subject (passive)
* NUMMOD: Number modifier
* OPRD: Object predicate
* PARATAXIS: Parataxis
* PCOMP: Complement of preposition
* POBJ: Object of preposition
* POSS: Possession modifier
* PRECONJ: Pre-correlative conjunction
* PREDET: Pre-determiner
* PREP: Prepositional modifier
* PRT: Particle
* PUNCT: Punctuation
* QUANTMOD: Modifier of quantifier
* RELCL: Relative clause modifier
* ROOT: Root
* XCOMP: Open clausal complement

In [2]:
import spacy

# Load the English pipeline named en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Sample passage to run through the pipeline
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Syntax: the text of every noun chunk, and the lemma of every token tagged VERB
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities: one line per entity, text followed by its label
for ent in doc.ents:
    print(ent.text, ent.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'be', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [16]:
# spacy.explain() gives the description of a dependency tag
for tag in ('nsubj', 'dobj', 'pobj'):
    print(tag + ':', spacy.explain(tag))

nsubj: nominal subject
dobj: direct object
pobj: object of preposition


## Text Classification With Machine Learning and SpaCy

https://github.com/Jcharis/Natural-Language-Processing-Tutorials/blob/master/Text%20Classification%20With%20Machine%20Learning,SpaCy,Sklearn(Sentiment%20Analysis)/Text%20Classification%20&%20Sentiment%20Analysis%20with%20SpaCy,Sklearn.ipynb

In [3]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object.
# NOTE: this rebinds `nlp`, replacing the pipeline loaded earlier with spacy.load().
nlp = English()

# Process a text (this also rebinds `doc`)
doc = nlp("This is a sentence.")

# Print the document text
print(doc.text)

This is a sentence.


In [4]:
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

# Materialize the STOP_WORDS collection as a list; it is used for filtering later on
stopwords = [*STOP_WORDS]
stopwords

['something',
 'been',
 'some',
 'forty',
 'hereupon',
 'must',
 'sixty',
 'she',
 'thereafter',
 'we',
 'nor',
 'another',
 'full',
 'about',
 'any',
 'still',
 'which',
 'both',
 '’ll',
 'along',
 'might',
 'say',
 'nowhere',
 'also',
 'except',
 'alone',
 'anyhow',
 'somewhere',
 '’ve',
 'most',
 'meanwhile',
 'where',
 'had',
 'various',
 'via',
 "'s",
 'am',
 'within',
 'a',
 'other',
 'formerly',
 'amount',
 'moreover',
 'done',
 '‘s',
 'keep',
 'ten',
 'no',
 'upon',
 'fifteen',
 're',
 'go',
 'mostly',
 'elsewhere',
 'to',
 'seems',
 'next',
 '‘re',
 'may',
 'twelve',
 "'m",
 'off',
 'least',
 'thereupon',
 'name',
 'whereas',
 'he',
 'therein',
 'become',
 'eleven',
 'these',
 'up',
 'well',
 "n't",
 'thereby',
 'is',
 'hereby',
 '’d',
 'regarding',
 'how',
 'himself',
 'towards',
 'always',
 'empty',
 'are',
 'us',
 'his',
 'can',
 'noone',
 '’s',
 'would',
 'whatever',
 'enough',
 'should',
 'just',
 'do',
 'otherwise',
 'therefore',
 'across',
 'else',
 'seemed',
 'what',
 

In [5]:
docx = nlp('This is how Johnny Walker was walking.  He was also running beside the lawn.')

# Show every token of docx next to its lemma
for token in docx:
    print('word =', token.text, "| lemma = ", token.lemma_)

word = This | lemma =  this
word = is | lemma =  be
word = how | lemma =  how
word = Johnny | lemma =  Johnny
word = Walker | lemma =  Walker
word = was | lemma =  be
word = walking | lemma =  walk
word = . | lemma =  .
word =   | lemma =   
word = He | lemma =  -PRON-
word = was | lemma =  be
word = also | lemma =  also
word = running | lemma =  run
word = beside | lemma =  beside
word = the | lemma =  the
word = lawn | lemma =  lawn
word = . | lemma =  .


In [6]:
# Print the normalized lemma of every token, skipping the '-PRON-' placeholder
for token in docx:
    if token.lemma_ == '-PRON-':
        continue
    print(token.lemma_.lower().strip())

this
be
how
johnny
walker
be
walk
.

be
also
run
beside
the
lawn
.


In [7]:
# Same normalization as a list: '-PRON-' tokens fall back to their lower-cased text
[tok.lower_ if tok.lemma_ == '-PRON-' else tok.lemma_.lower().strip() for tok in docx]

['this',
 'be',
 'how',
 'johnny',
 'walker',
 'be',
 'walk',
 '.',
 '',
 'he',
 'be',
 'also',
 'run',
 'beside',
 'the',
 'lawn',
 '.']

In [8]:
# Print only the tokens that are neither stop words nor punctuation
for token in docx:
    if not (token.is_stop or token.is_punct):
        print(token)

Johnny
Walker
walking
 
running
lawn


In [9]:
# The same stop-word / punctuation filter, collected into a list
[tok for tok in docx if not (tok.is_stop or tok.is_punct)]

[Johnny, Walker, walking,  , running, lawn]

In [10]:
# Use the punctuations of string module
import string
# `punctuations` is one str (string.punctuation), so `x in punctuations`
# is a substring test rather than a lookup in a collection of characters.
punctuations = string.punctuation

In [11]:
# Creating a Spacy Parser
# (English was already imported in an earlier cell; it is re-imported here.)
from spacy.lang.en import English
# `parser` is a separate English() instance, called by spacy_tokenizer; `nlp` is left as is.
parser = English()

In [12]:
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` and return its normalized tokens.

    The text is run through the module-level ``parser``. Each token is
    replaced by its lemma, lower-cased and stripped of surrounding
    whitespace; tokens whose lemma is the ``'-PRON-'`` placeholder keep their
    lower-cased text instead. Entries found in the module-level ``stopwords``
    list or in the ``punctuations`` string are then dropped.

    Args:
        sentence: Text to tokenize.

    Returns:
        list of str: the remaining normalized tokens, in document order.
    """
    mytokens = parser(sentence)
    # Fix: the attribute is ``lemma_`` and ``.lower()`` is called on it, as in
    # the earlier cells; the previous ``word.lemma_lower()`` was a typo.
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in mytokens]
    # ``punctuations`` is a str, so ``in`` is a substring test; this also
    # discards the empty strings produced by whitespace-only tokens.
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    # Fix: the return statement had been swallowed into a trailing comment,
    # which made the function return None.
    return mytokens