In [217]:
# ! pip install spacy
# ! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0.tar.gz
# ! pip install presidio-analyzer
# ! python -m spacy download en_core_web_trf
# ! python -m spacy download en_core_web_lg

In [218]:
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
import pandas as pd

In [219]:
# Read the tagged Yelp review data and keep only the rows that actually
# have review text, since the entity analysis below runs on that column.
df = pd.read_csv('../../Data/Raw/YelpFakeReview(Tagged DF).csv').dropna(
    subset=['reviewContent']
)

### Entity extraction from review text using Microsoft Presidio and a spaCy model

Presidio Analyzer can detect a wide range of entity types. The ones relevant to this notebook are described below:

- `DATE_TIME`: Absolute or relative dates, periods, or times smaller than a day.
- `EMAIL_ADDRESS`: An email address, which identifies the mailbox to which email messages are delivered.
- `NRP`: A person's nationality, religious, or political group.
- `LOCATION`: The name of a politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains).
- `PERSON`: A full person name, which can include first names, middle names or initials, and last names.
- `PHONE_NUMBER`: A telephone number.
- `URL`: A URL (Uniform Resource Locator), a unique identifier used to locate a resource on the Internet. Note that `URL` is described here but is not included in the `entities` list passed to the analyzer below.


In [220]:
# No analyzer is built in this cell. `analyzer` is created further down, once
# the NLP engine backed by "en_core_web_trf" exists. A default AnalyzerEngine()
# constructed here would load Presidio's default spaCy model and then be
# discarded when `analyzer` is reassigned, before it is ever used.

In [221]:
# Load the installed model
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook; the NLP engine configured below is given the model by name
# ("en_core_web_trf") rather than this object. This load may only be serving
# as an early check that the model is installed — confirm before removing.
nlp = spacy.load('en_core_web_trf')

In [222]:
# Describe the NLP backend Presidio should use: the spaCy engine with the
# English transformer model.
configuration = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_trf")],
)

# Have the provider turn that description into an NLP engine instance.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# Build the analyzer around that engine, restricted to English.
analyzer = AnalyzerEngine(supported_languages=["en"], nlp_engine=nlp_engine)

In [223]:
# Sanity check: number of rows still missing review text (expected to be 0).
df['reviewContent'].isna().sum()

0

In [224]:
# Entity types to extract from the 'reviewContent' column of df.
entities = [
    'DATE_TIME',
    'EMAIL_ADDRESS',
    'NRP',
    'LOCATION',
    'PERSON',
    'PHONE_NUMBER',
]

# Add one column per entity type, every cell starting out as None.
df = df.assign(**dict.fromkeys(entities))

In [225]:
# for index, row in df.iterrows():
#     text = row['reviewContent']
#     results = analyzer.analyze(text=text, entities=entities, language='en')
#     for result in results:
#         entity_text = text[result.start : result.end]
#         entity_type = result.entity_type
#         if df.loc[index, entity_type] == None:
#             df.loc[index, entity_type] = [entity_text]
#         else:
#             df.loc[index, entity_type].append(entity_text)

In [226]:
def analyze_row(row: pd.Series) -> pd.Series:
    """Return a copy of ``row`` with its entity columns filled from the review text.

    The text in ``row['reviewContent']`` is passed to the module-level
    ``analyzer``, restricted to the module-level ``entities`` list and
    language ``'en'``. For every entity type that is detected, the matching
    entry of the returned row is set to a list of the matched substrings, in
    the order the analyzer returned them. Entries for entity types that were
    not detected are left exactly as they were.

    Args:
        row: One DataFrame row containing ``'reviewContent'``.

    Returns:
        A new row object; the one passed in is not modified.
    """
    text = row['reviewContent']
    results = analyzer.analyze(text=text, entities=entities, language='en')

    # Gather the matches per entity type before touching the row. Building
    # fresh lists (instead of appending to whatever the cell already holds)
    # keeps the function idempotent and independent of the cell's previous
    # value, whether that is None, NaN, or a list from an earlier run.
    matches_by_type = {}
    for result in results:
        matches_by_type.setdefault(result.entity_type, []).append(
            text[result.start:result.end]
        )

    # Work on a copy: functions passed to DataFrame.apply should not mutate
    # the object they are given.
    updated = row.copy()
    for entity_type, matches in matches_by_type.items():
        updated[entity_type] = matches
    return updated

In [None]:
# Run the entity analysis once per row (axis='columns' hands each row to the
# function) and keep the resulting frame.
df = df.apply(analyze_row, axis='columns')


In [230]:
# Persist the frame, including the entity columns, without the index column.
df.to_csv(path_or_buf='Entity_dataframe.csv', index=False)