- Preprocess with pandas 
- Tokenize the text
- Lemmatize the text
- Remove stop words

In [1]:
import pandas as pd
import spacy

In [2]:
nlp= spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
data = [
    "The gardener picked fresh oranges early this morning.",
    "Oranges cost 3.50$ per kilo at SunnyVale Market!",
    "I bought two bags of orange slices for the picnic… delicious.",
    "orange, Orange, ORANGES, orange—so many ways to say it.",
    "She's squeezing oranges to make fresh orange juice.",
    "Does SunnyVale Market sell Valencia oranges or Navel oranges?",
    "A mimosa is sparkling wine mixed with orange juice. [WikiMix]",
    "fresh juice is always better at home"
]


In [5]:
df= pd.DataFrame(data, columns=['Sentences'])

In [6]:
df

Unnamed: 0,Sentences
0,The gardener picked fresh oranges early this morning.
1,Oranges cost 3.50$ per kilo at SunnyVale Market!
2,I bought two bags of orange slices for the picnic… delicious.
3,"orange, Orange, ORANGES, orange—so many ways to say it."
4,She's squeezing oranges to make fresh orange juice.
5,Does SunnyVale Market sell Valencia oranges or Navel oranges?
6,A mimosa is sparkling wine mixed with orange juice. [WikiMix]
7,fresh juice is always better at home


In [7]:
def lower_replace(series):
    output=series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True) # remove square brackets
    output=output.str.replace(r'[^\w\s]', '', regex=True) # remove punctuation
    return output

In [8]:
df['Clean_Sentences'] = lower_replace(df['Sentences'])

In [9]:
df

Unnamed: 0,Sentences,Clean_Sentences
0,The gardener picked fresh oranges early this morning.,the gardener picked fresh oranges early this morning
1,Oranges cost 3.50$ per kilo at SunnyVale Market!,oranges cost 350 per kilo at sunnyvale market
2,I bought two bags of orange slices for the picnic… delicious.,i bought two bags of orange slices for the picnic delicious
3,"orange, Orange, ORANGES, orange—so many ways to say it.",orange orange oranges orangeso many ways to say it
4,She's squeezing oranges to make fresh orange juice.,shes squeezing oranges to make fresh orange juice
5,Does SunnyVale Market sell Valencia oranges or Navel oranges?,does sunnyvale market sell valencia oranges or navel oranges
6,A mimosa is sparkling wine mixed with orange juice. [WikiMix],a mimosa is sparkling wine mixed with orange juice
7,fresh juice is always better at home,fresh juice is always better at home


In [10]:
def token_lemma_nonstop(series):
    doc = nlp(series)
    output =  [token.lemma_ for token in doc if not token.is_stop]
    output = ' '.join(output)
    return output

In [12]:
new_series=df.Clean_Sentences.apply(token_lemma_nonstop)
new_series

0              gardener pick fresh orange early morning
1                 orange cost 350 kilo sunnyvale market
2                 buy bag orange slice picnic delicious
3                     orange orange orange orangeso way
4                   s squeeze orange fresh orange juice
5    sunnyvale market sell valencia orange navel orange
6                  mimosa sparkle wine mix orange juice
7                                 fresh juice well home
Name: Clean_Sentences, dtype: object

In [14]:
pd.to_pickle(new_series, 'text_clean.pkl')