<a href="https://colab.research.google.com/github/heyprincehere/advanced-nlp-with-python-for-machine-learning-3807097/blob/main/Building_a_spacy_processing_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Example 1: Building a spacy processing pipeline**

In [2]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

''' Order of tasks in the processing pipeline
1 .Tokenization
2. Stop Words
3. Parts-Of-Speech (POS) tagging
4. Dependecy Parsing
5. Lemmatization
6. Named Entity Recognition (NER)
7. Other Use Case Task
'''

text = "A customer in New York City wants to give a review"
doc=nlp(text)

#tokenization
print("Tokenization:")
for token in doc:
    print(token.text)

print("\n")

# Stop Words
print("Stop Words:")
filtered_token = [token.text for token in doc if not token.is_stop]
# create a list of tokens excluding stop words using list comprehension
print(filtered_token)

print("\n")

# Parts-Of-Speech (POS) Tagging
print("Parts-Of-Speech (POS) Tagging:")
for token in doc:
  #iterate through each token and print it's text and pos tag
  print(token.text, token.pos_)

print("\n")

# Named Entity Recognition (NER)
print("Named Entity Recognition (NER):")
for ent in doc.ents:
  #iterate through each entity and print it's text and label
  print(ent.text, ent.label_)

print("\n")

# Lemmatization
print("Lemmatization:")
lemmatization_tokens = [token.lemma_ for token in doc if not token.is_punct]
print(lemmatization_tokens)

Tokenization:
A
customer
in
New
York
City
wants
to
give
a
review


Stop Words:
['customer', 'New', 'York', 'City', 'wants', 'review']


Parts-Of-Speech (POS) Tagging:
A DET
customer NOUN
in ADP
New PROPN
York PROPN
City PROPN
wants VERB
to PART
give VERB
a DET
review NOUN


Named Entity Recognition (NER):
New York City GPE


Lemmatization:
['a', 'customer', 'in', 'New', 'York', 'City', 'want', 'to', 'give', 'a', 'review']


**Example 2: Building a spacy processing pipeline with a file**

In [20]:
file_path = "/content/sentiment_analysis_data.txt"
with open(file_path, "r", encoding = "utf-8") as file:
  sentiment_texts = file.readlines()


In [22]:

#Lists to store results
token_lists = []
filtered_token_lists = []
pos_tag_lists = []
ner_lists = []

for sentiment_text in sentiment_texts:
  doc = nlp(sentiment_text.strip())

  #tokenization
  token = [token.text for token in doc]
  token_lists.append(token)

  #Filter for removal of stop words
  filtered_tokens = [token.text for token in doc if not token.is_stop]
  filtered_token_lists.append(filtered_tokens)

  #pos tagging
  pos_tag = [(token.text, token.pos_) for token in doc]
  pos_tag_lists.append(pos_tag)

  #ner
  ner = [(ent.text, ent.label_) for ent in doc.ents]
  ner_lists.append(ner)


results_df = pd.DataFrame({
    'Sentiment Example': sentiment_texts,
    'Tokenization': token_lists,
    'Filtered_tokens': filtered_token_lists,
    'Pos Tagging': pos_tag_lists,
    'NER': ner_lists
})

print(results_df)

                                   Sentiment Example  \
0    I love this product! It's absolutely amazing.\n   
1      This is the worst experience I’ve ever had.\n   
2         I am feeling so happy and excited today!\n   
3           The service was okay, nothing special.\n   
4  I hate waiting in long queues, it’s so frustra...   
5    The weather is beautiful today, I feel great!\n   
6  I am extremely disappointed with the quality o...   
7         Such a wonderful and heartwarming story!\n   
8  The food was terrible, I will never eat here a...   
9  What a fantastic performance, I really enjoyed...   

                                        Tokenization  \
0  [I, love, this, product, !, It, 's, absolutely...   
1  [This, is, the, worst, experience, I, ’ve, eve...   
2  [I, am, feeling, so, happy, and, excited, toda...   
3  [The, service, was, okay, ,, nothing, special, .]   
4  [I, hate, waiting, in, long, queues, ,, it, ’s...   
5  [The, weather, is, beautiful, today, ,, I, f

**Extract data into a csv file format**

In [23]:
results_df.to_csv('processed_data.csv', index= False)
processed_df = pd.read_csv('/content/processed_data.csv', encoding = 'latin1')

In [24]:
processed_df.head()

Unnamed: 0,Sentiment Example,Tokenization,Filtered_tokens,Pos Tagging,NER
0,I love this product! It's absolutely amazing.\n,"['I', 'love', 'this', 'product', '!', 'It', ""'...","['love', 'product', '!', 'absolutely', 'amazin...","[('I', 'PRON'), ('love', 'VERB'), ('this', 'DE...",[]
1,This is the worst experience Iâve ever had.\n,"['This', 'is', 'the', 'worst', 'experience', '...","['worst', 'experience', '.']","[('This', 'PRON'), ('is', 'AUX'), ('the', 'DET...",[]
2,I am feeling so happy and excited today!\n,"['I', 'am', 'feeling', 'so', 'happy', 'and', '...","['feeling', 'happy', 'excited', 'today', '!']","[('I', 'PRON'), ('am', 'AUX'), ('feeling', 'VE...","[('today', 'DATE')]"
3,"The service was okay, nothing special.\n","['The', 'service', 'was', 'okay', ',', 'nothin...","['service', 'okay', ',', 'special', '.']","[('The', 'DET'), ('service', 'NOUN'), ('was', ...",[]
4,"I hate waiting in long queues, itâs so frust...","['I', 'hate', 'waiting', 'in', 'long', 'queues...","['hate', 'waiting', 'long', 'queues', ',', 'fr...","[('I', 'PRON'), ('hate', 'VERB'), ('waiting', ...",[]
