# Practice Extracting Named Entities using spaCy (NLP software) 

In [1]:
#import spaCy
import spacy
nlp = spacy.load("en_core_web_sm")

# Created a temporary list of dictionaries to mimic the data that was collected. 
1. Did all testing of the spaCy library with named entity extraction on this temporary dictionary. 
2. These entries are copy and pasted directly from the RSS feed, so that I can have a accurate picture of how my code works. 
3. You can see some deficiencies in spaCy, but there are named entities that I would like to include in further data preparation such as:
    1. NORP 
    2. DATE 
    3. PERSON and PRONOUN 

In [2]:

country_mentioned_temp = [{'title':"Julio Tello, Peru’s Archaeological Trail Blazer", 
                           'description_clean':'Tello is often called some variation of' 
                           'the father of Peruvian archaeology or the first indigenous Peruvian archaeologist.' 
                           'And his work was playing out across a backdrop of constant unrest and conflict,' 
                           'both for his country and his profession'},
                          {'title':'Grand Central Terminal', 'description_clean': 'Grand Central"s story starts with' 
                           'one of the wealthiest names in U.S. history,' 
                           ' but it also is in many ways the story of the city itself since the 1800s,' 
                           'because Grand Central was such a pivotal element in the growth of Manhattan.'},
                          {'title':'A Brief History of Bonsai', 'description_clean':'Bonsai’s origins go all the way back' 
                           'to ancient China,long before Japan became infatuated with the art form.' 
                           ' Over time, the western world also became fascinated with bonsai,' 
                           'though there has been plenty of cultural confusion about it along the way.'},
                          {'title':'The Peterloo Massacre', 'description_clean': 'The Peterloo Massacre took place' 
                           'during a peaceful protest for parliamentary reform in Manchester, England.'
                           ' And there was a lot feeding into why people in Britain,' 
                           'and specifically in the region around Manchester, thought that reform was needed.'},
                          {'title':'The Nika Riots & Massacre', 'description_clean':'Large-scale rioting and mass violence' 
                           'were fairly common in Constantinople when this riot – and then massacre – took place in the year 532.' 
                           ' But we have more documentation of the Nika Riots than many of the others.'},
                          {'title':"Lakshmi Bai: Who is India's Joan of Arc?",'description_clean':'Lakshmi Bai was born into' 
                           'wealthy family in 1830, but she was far from the typical aristocrat.'
                           'In this episode, Deblina and Sarah recount the life and work of Lakshmi Bai,' 
                           'from her youth to her instrumental role in the Indian Rebellion of 1857.'},
                          {'title':'The First Tacoma Narrows Bridge – Galloping Gertie', 'description_clean': 
                           'The drama of the first Tacoma Narrows bridge is hardly relegated to its turbulent end.'
                           'There’s more to the story – from its inception to financing issues'
                           'to some surprising legal happenings, and how it spawned entirely new approaches to bridge design.'},
                          {'title':'A. Gustave Eiffel, Part 2', 'description_clean': 'The second part of our look at Gustave'
                           'Eiffel"s life picks up just after he closed down all business interests in South America,'
                           'and leads into some of his most famous work, including the Statue of Liberty '
                           'and the Parisian tower that bears his name.'},
                          {'title':'The Dyatlov Pass Incident', 'description_clean':'In 1959, nine students ventured into'
                           'the Ural mountains for a ski hiking trip, and never returned. While much speculation has'
                           'swirled for more than half a century, no one knows for certain what caused them to abandon'
                           'their camp to die in the cold.'}]

## Extracting Country, Cities, States (GPE)

In [4]:
# Write a function extracting geopolitical entities. 
def spacy_gpe(texts):
    
    #Because it's a list of dictionaries and multiple texts, this doc=nlp(texts) and looping through it to work around a limitation of spaCy. 
    doc = nlp(texts)
    
    #List of results of the for loop. 
    ep_results = []
    
    #Loop through the titles and descriptions looking for geopolitical entities or GPE
    for ent in doc.ents:
        if ent.label_=='GPE' and ent.text=='Deblina': 
            continue 
        elif ent.label_=='GPE':
            ep_results.append([ent.text,ent.label_])
        
    return ep_results

#Create a list to put the results of search and extraction. 
list_titles_descriptions = []

#this for loop made sure that title and description were linked together. 
for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['title_gpe'] = spacy_gpe(ep['title'])
    spacy_result['description_gpe'] = spacy_gpe(ep['description_clean'])
    
    #append results to the list before loop. 
    list_titles_descriptions.append(spacy_result)

In [5]:
list_titles_descriptions

[{'title_gpe': [['Peru', 'GPE']], 'description_gpe': []},
 {'title_gpe': [], 'description_gpe': [['U.S.', 'GPE'], ['Manhattan', 'GPE']]},
 {'title_gpe': [], 'description_gpe': [['China', 'GPE'], ['Japan', 'GPE']]},
 {'title_gpe': [],
  'description_gpe': [['Manchester', 'GPE'],
   ['England', 'GPE'],
   ['Britain', 'GPE']]},
 {'title_gpe': [], 'description_gpe': [['Constantinople', 'GPE']]},
 {'title_gpe': [['India', 'GPE']], 'description_gpe': []},
 {'title_gpe': [], 'description_gpe': []},
 {'title_gpe': [], 'description_gpe': []},
 {'title_gpe': [], 'description_gpe': []}]

## Extracting Nationalities, religious or political groups (NORP) 

In [6]:
# This code does the same things as above, but it just looks for nationalities, religious or political groups. 
# This is an entity I would like to include to further prepare the data and make it more accurate. 
def spacy_norp(texts):
    doc = nlp(texts)
    ep_results = []
    for ent in doc.ents:
        if ent.label_=='NORP': 
            ep_results.append([ent.text,ent.label_])
        
    return ep_results

list_norp_mentioned = []

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['norp_title'] = spacy_norp(ep['title'])
    spacy_result['norp_description'] = spacy_norp(ep['description_clean'])
    
    list_norp_mentioned.append(spacy_result)

In [8]:
# As you can see, it's not perfect. It didn't pick up 'Indian' as a nationality and perhaps is thinking based on context that it's an event. 
list_norp_mentioned

[{'norp_title': [],
  'norp_description': [['Peruvian', 'NORP'], ['Peruvian', 'NORP']]},
 {'norp_title': [], 'norp_description': []},
 {'norp_title': [], 'norp_description': []},
 {'norp_title': [], 'norp_description': []},
 {'norp_title': [], 'norp_description': []},
 {'norp_title': [], 'norp_description': []},
 {'norp_title': [], 'norp_description': []},
 {'norp_title': [], 'norp_description': [['Parisian', 'NORP']]},
 {'norp_title': [], 'norp_description': []}]

## Extracting buidlings, airports, highways, bridges, etc.(FAC) 

In [9]:
def spacy_fac(texts):
    doc = nlp(texts)
    ep_results = []
    for ent in doc.ents:
        if ent.label_=='FAC': 
            ep_results.append([ent.text,ent.label_])
        
    return ep_results

list_fac_mentioned = []

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['fac_title'] = spacy_fac(ep['title'])
    spacy_result['fac_description'] = spacy_fac(ep['description_clean'])
    
    list_fac_mentioned.append(spacy_result)

In [10]:
list_fac_mentioned
#My opinion is that this named entity seems rather limited compared to the others. 

[{'fac_title': [['Archaeological Trail Blazer', 'FAC']],
  'fac_description': []},
 {'fac_title': [], 'fac_description': []},
 {'fac_title': [], 'fac_description': []},
 {'fac_title': [], 'fac_description': []},
 {'fac_title': [], 'fac_description': []},
 {'fac_title': [], 'fac_description': []},
 {'fac_title': [], 'fac_description': [['Tacoma Narrows', 'FAC']]},
 {'fac_title': [], 'fac_description': [['the Statue of Liberty', 'FAC']]},
 {'fac_title': [], 'fac_description': []}]

## Extracting non-geopolitical locations like mountain ranges, bodies of water (LOC)

In [11]:
def spacy_loc(texts):
    doc = nlp(texts)
    ep_results = []
    for ent in doc.ents:
        if ent.label_=='LOC': 
            ep_results.append([ent.text,ent.label_])
        
    return ep_results

list_loc_mentioned =[]

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['loc_title'] = spacy_loc(ep['title'])
    spacy_result['loc_description'] = spacy_loc(ep['description_clean'])
    
    list_loc_mentioned.append(spacy_result)

In [12]:
list_loc_mentioned
#doesn't seem to be working very well...

[{'loc_title': [], 'loc_description': []},
 {'loc_title': [], 'loc_description': []},
 {'loc_title': [], 'loc_description': []},
 {'loc_title': [], 'loc_description': [['Manchester', 'LOC']]},
 {'loc_title': [], 'loc_description': []},
 {'loc_title': [], 'loc_description': [['the Indian Rebellion', 'LOC']]},
 {'loc_title': [], 'loc_description': []},
 {'loc_title': [], 'loc_description': [['South America', 'LOC']]},
 {'loc_title': [], 'loc_description': []}]

In [13]:
def spacy_event(texts):
    doc = nlp(texts)
    ep_results = []
    for ent in doc.ents:
        if ent.label_=='EVENT': 
            ep_results.append([ent.text,ent.label_])
        
    return ep_results

list_event_mentioned = []

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['event_title'] = spacy_event(ep['title'])
    spacy_result['event_description'] = spacy_event(ep['description_clean'])
    
    list_event_mentioned.append(spacy_result)

In [14]:
list_event_mentioned

[{'event_title': [], 'event_description': []},
 {'event_title': [], 'event_description': []},
 {'event_title': [], 'event_description': []},
 {'event_title': [], 'event_description': []},
 {'event_title': [], 'event_description': []},
 {'event_title': [], 'event_description': []},
 {'event_title': [['Tacoma Narrows Bridge', 'EVENT']],
  'event_description': []},
 {'event_title': [], 'event_description': []},
 {'event_title': [], 'event_description': []}]

## Extracting time (DATE)

In [16]:
def spacy_time(texts):
    doc = nlp(texts)
    ep_results = []
    for ent in doc.ents:
        if ent.label_=='DATE':
            ep_results.append([ent.text,ent.label_])
        
    return ep_results

list_time_mentioned = []

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['time_mentioned'] = spacy_time(ep['description_clean'])
    
    list_time_mentioned.append(spacy_result)

In [17]:
list_time_mentioned

[{'time_mentioned': []},
 {'time_mentioned': [['the 1800s', 'DATE']]},
 {'time_mentioned': []},
 {'time_mentioned': []},
 {'time_mentioned': [['the year 532', 'DATE']]},
 {'time_mentioned': [['1830', 'DATE'], ['1857', 'DATE']]},
 {'time_mentioned': []},
 {'time_mentioned': []},
 {'time_mentioned': [['1959', 'DATE']]}]

## Extracting Persons Mentioned (PERSON)

In [18]:
def spacy_person(texts):
    doc = nlp(texts) 
    ep_results = []
    for ent in doc.ents:
        if ent.label_=='PERSON' and (ent.text=='Sarah' or ent.text=='Katie' or ent.text=='Holly' or ent.text=='Tracy' or ent.text=='Deblina' or ent.text=='Josh' or ent.text=='Candace'):
            continue 
    #I don't want it to extract hosts names. I'm not sure how to write this. 
    #also, tricky because some of the hosts have names that are identical to episdoe topics 
        elif ent.label_=='PERSON':
            ep_results.append([ent.text,ent.label_])
    return ep_results

list_person_mentioned = []

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['title_person'] = spacy_person(ep['title'])
    spacy_result['description_person'] = spacy_person(ep['description_clean'])
    
    list_person_mentioned.append(spacy_result)

In [19]:
list_person_mentioned

[{'title_person': [['Julio Tello', 'PERSON']], 'description_person': []},
 {'title_person': [], 'description_person': []},
 {'title_person': [], 'description_person': [['Bonsai', 'PERSON']]},
 {'title_person': [], 'description_person': []},
 {'title_person': [], 'description_person': []},
 {'title_person': [['Lakshmi Bai', 'PERSON'], ['Joan of Arc', 'PERSON']],
  'description_person': [['Lakshmi Bai', 'PERSON'],
   ['Lakshmi Bai', 'PERSON']]},
 {'title_person': [], 'description_person': []},
 {'title_person': [['A. Gustave Eiffel', 'PERSON']], 'description_person': []},
 {'title_person': [], 'description_person': []}]

## Extracting Pronouns for Male/Female

In [20]:
def spacy_pronouns(texts):
    doc = nlp(texts)
    ep_results = []
    for token in doc:
        if token.text=='we' or token.text=='We': #excluding 'we' because this usually refers to the hosts. 
            continue 
        elif token.pos_=='PRON' and token.tag_=='PRP':
            #token.tag_ 'PRP' means that it's a personal pronoun.
            ep_results.append([token.text, token.pos_,token.tag_, token.dep_])
        
    return ep_results

list_pronouns_mentioned = []

for ep in country_mentioned_temp:
    spacy_result = {}
    spacy_result['pronouns'] = spacy_pronouns(ep['description_clean'])
    
    list_pronouns_mentioned.append(spacy_result)

In [21]:
list_pronouns_mentioned

[{'pronouns': []},
 {'pronouns': [['it', 'PRON', 'PRP', 'nsubj'],
   ['itself', 'PRON', 'PRP', 'appos']]},
 {'pronouns': [['it', 'PRON', 'PRP', 'pobj']]},
 {'pronouns': []},
 {'pronouns': []},
 {'pronouns': [['she', 'PRON', 'PRP', 'nsubj']]},
 {'pronouns': [['it', 'PRON', 'PRP', 'nsubj']]},
 {'pronouns': [['he', 'PRON', 'PRP', 'nsubj']]},
 {'pronouns': [['them', 'PRON', 'PRP', 'nsubj']]}]