In [203]:
# ! pip install spacy
# ! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0.tar.gz
# ! pip install presidio-analyzer
# ! python -m spacy download en_core_web_trf
# ! python -m spacy download en_core_web_lg

In [204]:
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
import pandas as pd

In [205]:
# Read the tagged Yelp review dump, discard rows that have no review text,
# and keep only the first 10 remaining rows.
df = (
    pd.read_csv('../../Data/Raw/YelpFakeReview(Tagged DF).csv')
    .dropna(subset=['reviewContent'])
    .head(10)
)

### Entity Extraction from Reviews Using Microsoft Presidio and a spaCy Model

Presidio Analyzer supports a wide range of entity types. The ones relevant to this notebook are:

- `DATE_TIME`: Absolute or relative dates or periods or times smaller than a day.
- `EMAIL_ADDRESS`: An email address identifies an email box to which email messages are delivered.
- `NRP`: A person’s nationality, religious or political group.
- `LOCATION`: Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains).
- `PERSON`: A full person name, which can include first names, middle names or initials, and last names.
- `PHONE_NUMBER`: A telephone number.
- `URL`: A URL (Uniform Resource Locator), unique identifier used to locate a resource on the Internet. (Not currently included in the `entities` list passed to the analyzer below.)


In [206]:
# Initialize a Presidio analyzer engine with its default configuration.
# NOTE(review): `analyzer` is reassigned further down to an engine built on
# the en_core_web_trf spaCy model, and nothing uses this default instance in
# between, so this cell looks redundant. Confirm before removing.
analyzer = AnalyzerEngine()

In [207]:
# Load the installed transformer-based spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any other cell shown here; the
# Presidio NLP engine below is configured by model name rather than with this
# object. Confirm whether this separate load is still needed.
nlp = spacy.load('en_core_web_trf')

In [208]:
# NLP engine settings: spaCy as the backend, with the transformer pipeline
# registered for English text.
configuration = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_trf")],
)

# Build the NLP engine described by the settings above.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# Hand that engine to the analyzer and restrict it to English.
analyzer = AnalyzerEngine(supported_languages=["en"], nlp_engine=nlp_engine)

In [209]:
# Sanity check: number of rows whose review text is missing.
df['reviewContent'].isna().sum()

0

In [210]:
# Presidio entity types requested from the analyzer for each review's text.
# NOTE(review): `URL` appears in the markdown list above but is not requested
# here. Confirm whether that omission is intentional.
entities = ['DATE_TIME', 'EMAIL_ADDRESS', 'NRP', 'LOCATION', 'PERSON', 'PHONE_NUMBER']

# Add one column per entity type, initialised to None. A plain per-column
# assignment is used so each new column holds None objects, which is what
# analyze_row checks for before storing a list of matches.
for entity in entities:
    df[entity] = None

In [211]:
# for index, row in df.iterrows():
#     text = row['reviewContent']
#     results = analyzer.analyze(text=text, entities=entities, language='en')
#     for result in results:
#         entity_text = text[result.start : result.end]
#         entity_type = result.entity_type
#         if df.loc[index, entity_type] == None:
#             df.loc[index, entity_type] = [entity_text]
#         else:
#             df.loc[index, entity_type].append(entity_text)

In [215]:
def analyze_row(row):
    """Extract Presidio entities from one review and store them on the row.

    Runs the module-level ``analyzer`` over ``row['reviewContent']``,
    restricted to the entity types in the module-level ``entities`` list, and
    groups the matched substrings by entity type.

    Args:
        row: A mapping-like record (for example the pandas Series that
            ``DataFrame.apply(..., axis=1)`` passes in) holding a
            ``'reviewContent'`` string and one key per entity type.

    Returns:
        A copy of ``row`` in which every entity type detected in the text maps
        to a list of the matched substrings, in the order the analyzer
        reported them. Entity types with no match keep their existing value.
        The input ``row`` itself is left unmodified.
    """
    text = row['reviewContent']
    results = analyzer.analyze(text=text, entities=entities, language='en')

    # Collect matches into fresh lists instead of appending to whatever the
    # row already holds. Appending in place would extend list objects shared
    # with the source DataFrame, so re-running the cell would accumulate
    # duplicate entities.
    found = {}
    for result in results:
        matched_text = text[result.start:result.end]
        found.setdefault(result.entity_type, []).append(matched_text)

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are given.
    updated = row.copy()
    for entity_type, matches in found.items():
        updated[entity_type] = matches
    return updated

In [216]:
# Run analyze_row on every row (axis=1 passes one row at a time) and rebind
# df to the resulting frame with the entity columns filled in.
df = df.apply(analyze_row, axis=1)


In [213]:
# Display the annotated frame.
# NOTE(review): this cell's execution count (213) is lower than that of the
# analyze_row cells above (215, 216), so the saved output may be stale.
# Restart the kernel and run all cells to refresh it.
df

Unnamed: 0,reviewDate,reviewID,reviewerID,reviewContent,reviewRating,reviewUsefulCount,reviewCoolCount,reviewFunnyCount,restaurantID,flagged,...,resLocation,resName,resReviewCount,resRating,DATE_TIME,EMAIL_ADDRESS,NRP,LOCATION,PERSON,PHONE_NUMBER
0,9/22/2012,GtwU21YOQn-wf4vWRUIx6w,bNYesZ944s6IJVowOnB0iA,"Unlike Next, which we'd eaten at the previous ...",5,0,0,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,"[the previous night, four hours and thirty-nin...",,[English],,"[Willy Wonka, Mickey, Minnie Mouse, Jackson Po...",
1,9/22/2012,0LpVTc3,TRKxLC3y-ZvP45e5iilMtw,Probably one of the best meals I've had ever. ...,5,0,0,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,"[an evening, a least 4 hours]",,,,[Grant Achatz],
2,9/19/2012,tljtLzf68Fkwf,0EMm8umAqXZzyhxNpL4M9g,Service was impeccable. Experience and present...,3,2,0,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,,,,,,
3,9/6/2012,iSN,DlwexC7z88ymAzu45skODw,"The problem with places like this, given the e...",3,8,0,3,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,[5 seconds],,,,"[Jackson Pollock, Grant Achatz]",
4,9/9/2012,Jmwrh7,kW2dk1CWihmh3g7k9N2G8A,I have no idea how to write my review - dining...,5,1,2,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,[a few days],,[American],[Chicago],,
5,8/30/2012,lKlceLWoePzeuvFD3sj4mw,HxXEcMDDTJFUqVfhPF9M8Q,Despite the first-world tragedy I endured in a...,5,3,1,1,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,[same-day],,,"[Chicago, New York's]","[Willy Wonka, Grant Achatz's]",
6,9/8/2012,PBS2uyee9V5IpFfTropxbw,OW2H-GkKnlVEBPuGHIaiFg,"Overall, was it worth the hype? Yes and more. ...",5,1,0,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,"[evening, 2-plus weeks, about a month before]",,,,[Achatz],
7,8/24/2012,PkwbB,BSh3h1J4mdSmEsb8FFdf0Q,There are already TONS of professional & amate...,3,8,0,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,"[at least two months, over 6 hours, 3-hour]",,,[U.S.],,
8,8/22/2012,MRdliMXsmP1ViLjA5oCO,F3mbveXX30Ou0gpDY6IrCQ,Your life is a countdown ever since you're bor...,5,3,7,3,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,"[one minute, four hours, an hour or two, summe...",,,"[Antioch, Illinois, California]",[Achatz],
9,8/23/2012,z-j4X,NvSnBp4fTpNOfDwm2GWusA,Lots of complaints about how difficult it is t...,5,3,0,0,pbEiXam9YJL3neCYHGwLUA,N,...,"Alinea - Lincoln Park - Chicago, IL",Alinea,841,4.5,,,,,,
