
## Named Entity Recognition


### Initial setup

In [None]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
from collections import Counter

In [None]:
# Load dataset
# Column name -> dtype mapping for the review columns.
# NOTE(review): this mapping is not passed to read_excel below — confirm whether it should be.
dtypes = dict.fromkeys(('RevID', 'Productname', 'reviewtitle', 'reviewtext', 'all'), 'category')
# Read the workbook into the working dataframe `ds`
ds = pd.DataFrame(
    pd.read_excel("input_totaldataset_productnamereviewtitlereviewtext_forner.xlsx")
)

In [None]:
# Drop non-English reviews
#ds = ds.drop(ds[ds.Language!='English'].index)

### Functions

In [None]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a piece of text through a fixed sequence of optional steps.

    The steps run in this order, each controlled by its own argument:

    1. ``removeHTML``: parse the text with BeautifulSoup ('html.parser') and
       keep only its text content.
    2. ``charsToRemove``: regex whose matches are replaced by a space.
    3. ``removeNumbers``: every run of digits is replaced by a space.
    4. ``removeLineBreaks``: '\\n' becomes a space and '\\r' is deleted.
    5. ``specialCharsToRemove``: regex whose matches are replaced by a space
       (the default matches any character outside the range \\x00-\\xfd).
    6. ``convertToLower``: lower-case the text.
    7. ``removeConsecutiveSpaces``: collapse runs of the space character into
       a single space (the result is not stripped).

    Args:
        rawText: The value to clean. Anything that is not a string is
            returned unchanged.
        removeHTML: Whether to run step 1.
        charsToRemove: Pattern for step 2; an empty string or None skips it.
        removeNumbers: Whether to run step 3.
        removeLineBreaks: Whether to run step 4.
        specialCharsToRemove: Pattern for step 5; an empty string or None
            skips it.
        convertToLower: Whether to run step 6.
        removeConsecutiveSpaces: Whether to run step 7.

    Returns:
        The cleaned string, or ``rawText`` itself when it is not a string.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # are cleaned as well instead of being passed through untouched.
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, 'html.parser').get_text()

    # Remove punctuation and other special characters
    # (truthiness test: both '' and None mean "skip this step")
    if charsToRemove:
        procText = re.sub(charsToRemove, ' ', procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+', ' ', procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n', ' ').replace('\r', '')

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, ' ', procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

### Analysis

In [None]:
# Build a one-column dataframe ('PreProcessedText') from the cleaned reviewtext column.
# Punctuation and digits are kept here: charsToRemove='' and removeNumbers=False.
processedReviews = pd.DataFrame(
    {'PreProcessedText': ds['reviewtext'].apply(textPreProcess, charsToRemove='', removeNumbers=False).values},
    index=ds.index,
)

In [None]:
# Trim surrounding whitespace, then keep only rows whose text is not the empty string
processedReviews['PreProcessedText'] = processedReviews['PreProcessedText'].str.strip()
processedReviews = processedReviews.loc[processedReviews['PreProcessedText'].ne('')]

In [None]:
# Load the spaCy English model "en_core_web_sm" once at module level;
# performNER calls this `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define a function to perform NER with spaCy
def performNER(text):
    """Return the named entities that the module-level ``nlp`` pipeline finds.

    Args:
        text: The text to analyse. ``None``, numeric values (including NaN)
            and the empty string yield an empty result without calling
            ``nlp``; any other value is handed to ``nlp`` unchanged.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry of
        ``doc.ents``, in document order.
    """
    import numbers

    # textPreProcess returns non-string cells (empty/NaN cells, plain numbers)
    # unchanged, so they can end up here; there is no text to analyse in them.
    if isinstance(text, str):
        if not text:
            return []
    elif text is None or isinstance(text, numbers.Number):
        return []

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Clean every value of the 'all' column with textPreProcess (default options),
# run NER on the result and store the entity list per row.
# NOTE(review): the default options lower-case the text and strip punctuation/digits
# before spaCy sees it — confirm this is intended for NER.
ds['reviewtext_entities'] = ds['all'].map(lambda rawReview: performNER(textPreProcess(rawReview)))

In [None]:
# Print each review's index label, its 'all' text and the entities extracted from it
for idx, reviewText, entities in zip(ds.index, ds['all'], ds['reviewtext_entities']):
    print(f"Review #{idx}")
    print(f"Review Text: {reviewText}")
    print(f"Named Entities: {entities}")
    print("\n")

In [None]:
# Apply the NER function to the review text in the dataframe
# (recomputes the same column as the earlier cell, so this cell also works when run on its own)
ds['reviewtext_entities'] = ds['all'].apply(lambda x: performNER(textPreProcess(x)))

# Save the dataframe to an Excel file
# The context manager saves and closes the workbook on exit, also when to_excel
# raises; ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
with pd.ExcelWriter('TotaldatasetNewOutputNER.xlsx') as writer:
    # sheet_name is passed by keyword: positional use is deprecated in recent pandas
    ds.to_excel(writer, sheet_name='Sheet1', index=False)

In [None]:
# List and count the most frequent "ORG" entities
# List and count the most frequent "PERSON" entities