spaCy can handle many common NLP tasks, such as:
- tokenization
- lemmatization
- stop-word removal

In [1]:
import pandas as pd
import spacy

In [18]:
# Three short sentences (with contractions and punctuation) kept as a
# pandas Series so text helpers can be tried on them via `.apply`.
test = [
    "We're going to start this course with traditional NLP applications.",
    "Then we'll move on to modern NLP theory.",
    "Finally, we'll wrap things up with modern NLP applications",
]
test_series = pd.Series(test)


In [19]:
test_series

0    We're going to start this course with traditio...
1             Then we'll move on to modern NLP theory.
2    Finally, we'll wrap things up with modern NLP ...
dtype: object

In [2]:
# Raw example sentences: mixed casing, punctuation, numbers and bracketed
# source tags, stored in a single-column DataFrame named 'sentence'.
data = [
    "When life gives you lemons, make Lemonade!",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, Lemon, Lemons, Lemon, lemon, lemons",
    "He's running to the market to get a lemon - there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite",
]
data_df = pd.DataFrame(data, columns=['sentence'])

In [3]:
# Work on a copy so the raw sentences in data_df stay untouched.
df = data_df.copy()

# Build the cleaned column in one chain: lowercase, then strip bracketed
# tags such as "[Wikipedia]", then strip every character that is neither
# a word character nor whitespace.
df['sentence_clean'] = (
    df['sentence']
    .str.lower()
    .str.replace(r'\[.*?\]', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
)


In [4]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make Lemonade!",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade....,a dozen lemons will make a gallon of lemonade
3,"lemon, Lemon, Lemons, Lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon - th...,hes running to the market to get a lemon ther...
5,Does Maven Market carry Eureka lemons or Meyer...,does maven market carry eureka lemons or meyer...
6,"An Arnold Palmer is half lemonade, half iced t...",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [9]:
# Load the 'en_core_web_sm' spaCy pipeline by name. The model package must
# already be installed (e.g. `python -m spacy download en_core_web_sm`).
nlp=spacy.load('en_core_web_sm')

In [7]:
phrase=df['sentence_clean'][0]
phrase

'when life gives you lemons make lemonade'

In [10]:
# the first step is to turn the string phrase into a spacy object

doc=nlp(phrase)

In [11]:
doc

when life gives you lemons make lemonade

In [None]:
# tokenize 

[token for token in doc]

[when, life, gives, you, lemons, make, lemonade]

In [13]:
[token.text for token in doc] # this time return string outputs

['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

In [14]:
# lemmatize

[token.lemma_ for token in doc]

['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

In [15]:
# inspect the first 10 entries of the pipeline's default stop-word collection
# (nothing is removed in this cell; this only shows what counts as a stop word)

list(nlp.Defaults.stop_words)[:10]

['before',
 'afterwards',
 'does',
 'whole',
 'side',
 'a',
 'such',
 'been',
 'my',
 'mine']

In [16]:
norm = [token.lemma_ for token in doc if not  token.is_stop]
norm

['life', 'give', 'lemon', 'lemonade']

In [17]:
' '.join(norm)

'life give lemon lemonade'

In [20]:
# wrap everything in a function
def lemma_token_stop(text):
    """Return `text` reduced to a space-joined string of non-stop-word lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged `is_stop` are skipped; each remaining token contributes its
    `lemma_`, and the lemmas are joined with single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [22]:
test_series

0    We're going to start this course with traditio...
1             Then we'll move on to modern NLP theory.
2    Finally, we'll wrap things up with modern NLP ...
dtype: object

In [21]:
test_series.apply(lemma_token_stop)

0    go start course traditional NLP application .
1                              modern NLP theory .
2      finally , wrap thing modern NLP application
dtype: object

In [23]:
df.sentence_clean.apply(lemma_token_stop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence_clean, dtype: object

### Part-of-speech tagging

In [24]:
# Normalize only the first cleaned sentence. Applying the helper to the whole
# column and then taking element 0 gives the same string but runs the spaCy
# pipeline on every row just to read one value.
phrase2 = lemma_token_stop(df.sentence_clean[0])
phrase2

'life give lemon lemonade'

In [26]:
doc2=nlp(phrase2)
[(token.text, token.pos_) for token in doc2]

[('life', 'NOUN'), ('give', 'VERB'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [27]:
# keep only tokens tagged as nouns or proper nouns (pos_ of 'NOUN' or 'PROPN')
[(token.text, token.pos_) for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]

[('life', 'NOUN'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [29]:
def filter_pos(text, pos_list):
    """Return the tokens of `text` whose POS tag is in `pos_list`, space-joined.

    Parameters
    ----------
    text : str
        Text to run through the module-level spaCy pipeline `nlp`.
    pos_list : collection of str
        `token.pos_` values to keep, e.g. ['NOUN', 'PROPN'].

    Returns
    -------
    str
        The `text` attribute of every matching token, joined by single
        spaces (an empty string when nothing matches).
    """
    doc = nlp(text)
    kept = [token.text for token in doc if token.pos_ in pos_list]
    # The original computed ' '.join(...) but threw the result away and
    # returned the list; return the joined string, as lemma_token_stop does.
    return ' '.join(kept)