# Top 10 NER

This sample script will create two dataframes containing the top 10 entities as well as all entities that are found in a sample of your dataset.

In [16]:
# Importing all our required packages
import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import random

# The spaCy library allows us to process and run operations on text
import spacy
from spacy.pipeline import EntityRecognizer

In [17]:
# Replace the path below with the dataset which you would like to use as input for the script
dataset_directory = '/home/ec2-user/SageMaker/Halfmann_Abortion'
input_files = os.listdir(dataset_directory)

In [18]:
# The memory usage of both processing text and the resultingdataframe can negatively impact your TDM Studio experience. For 
# this reason, we take a sample of the documents if there are too many documents in the dataset.
#try:
#    sample_input_files = random.sample(input_files, 1000)
#    
#except ValueError:
#    sample_input_files = input_files

In [19]:
sample_input_files = input_files

In [20]:
# Loading spacy's NLP model
nlp = spacy.load('en_core_web_sm')
# Setting max length to be larger than default so we can process longer articles
nlp.max_length = 15000000

In [21]:
# We define a function to get the text content that we need from the XML articles available in our dataset
def getxmlcontent(root):
    if root.find('.//HiddenText') is not None:
        return(root.find('.//HiddenText').text)
    elif root.find('.//Text') is not None:
        return(root.find('.//Text').text)
    else:
        print('Could not find \"HiddenText\" or \"Text\" tags')
        return None

In [22]:
# Creating empty dictionary in which to put our entities and counts
entitydict = {}

for article in sample_input_files:
    try:
        # Parse data and get root tags
        tree = etree.parse(dataset_directory + "/" + article)
        root = tree.getroot()
        
        # Clean text and remove all HTML tags
        soup = BeautifulSoup(getxmlcontent(root))
        text = soup.get_text()
        
        # Run spaCy tokenizer on doc
        doc = nlp(text, disable=['tagger', 'parser', 'textcat'])
        
        # Getting entities and setting them to dict
        for ent in doc.ents:
#             # Uncomment this section of code if you only want one specific type of entity (e.g. PERSON)      
#             if ent.label_ == 'PERSON':
#                 if ent.text not in entitydict:
#                     entitydict[ent.text] = 1
#                 else:
#                     entitydict[ent.text] += 1
            
            if ent.text not in entitydict:
                entitydict[ent.text] = [1, ent.label_]
            else:
                entitydict[ent.text][0] += 1
                
    except TypeError:
        print('Could not read article ' + article)

SyntaxError: invalid or missing encoding declaration for '/home/ec2-user/SageMaker/Halfmann_Abortion/data15.csv' (<string>)

In [None]:
# Creating the dataframes containing all entities as well as top 10 entities
entity_df = pd.DataFrame.from_dict(entitydict, orient='index')
top10_df = entity_df.sort_values(by=[0],ascending=False).head(10)

In [None]:
# View top 10 entities dataframe
top10_df

In [None]:
# View dataframe of all entities (will be truncated)
entity_df

In [None]:
entities = entity_df.sort_values(by=[0],ascending=False)
org = entities[entities[1] == "ORG"]
org = org[org[0] > 10]

In [None]:
org.to_csv("../organizations drew.csv")