<a href="https://colab.research.google.com/github/kevinmfreire/wheres_waldo/blob/main/spacy_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import spacy
import requests
import re
import numpy as np
import json
from bs4 import BeautifulSoup
from spacy import displacy

In [2]:
# Sample paragraph used to demonstrate named-entity recognition.
raw_text = "The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

# Load the small English spaCy pipeline once; later cells reuse it as NER.
NER = spacy.load("en_core_web_sm")

# Run the pipeline over the sample paragraph.
text1 = NER(raw_text)

In [3]:
# Print every entity found in the first sample together with its label.
for ent in text1.ents:
    print(ent.text, ent.label_)

The Indian Space Research Organisation ORG
India GPE
Bengaluru GPE
Department of Space ORG
India GPE
ISRO ORG
DOS ORG


In [4]:
# Look up spaCy's human-readable description of the ORG label.
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [5]:
# Look up spaCy's human-readable description of the GPE label.
spacy.explain("GPE")

'Countries, cities, states'

In [6]:
# Render the first sample with displaCy's entity ("ent") style inside the notebook.
displacy.render(text1,style="ent",jupyter=True)

In [7]:
# Second sample paragraph; it contains dates, a money amount and an ordinal in addition to names.
raw_text2='The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million.'

In [8]:
# Run the pipeline on the second sample and list its entities with their labels.
text2 = NER(raw_text2)
for ent in text2.ents:
    print(ent.text, ent.label_)

The Mars Orbiter Mission (MOM PRODUCT
Mangalyaan PERSON
Earth LOC
5 November 2013 DATE
the Indian Space Research Organisation ORG
Mars LOC
24 September 2014 DATE
India GPE
first ORDINAL
Mars LOC
$74 million MONEY


In [9]:
# Look up spaCy's human-readable description of the PERSON label.
spacy.explain("PERSON")

'People, including fictional'

In [10]:
# Render the second sample with displaCy's entity ("ent") style inside the notebook.
displacy.render(text2,style="ent",jupyter=True)

In [None]:
def clean_text(contents):
    """Replace whitespace control characters in ``contents`` with plain spaces.

    Newlines, tabs, carriage returns and non-breaking spaces (``\\xa0``) are
    each replaced by a single space. Ordinary letters are left untouched.

    Parameters
    ----------
    contents : str
        Text to normalise.

    Returns
    -------
    str
        A copy of ``contents`` with the characters above replaced by ``' '``.
    """
    body = contents
    # Chain the replacements on `body`; restarting from `contents` each time
    # would discard every replacement except the last one.
    for unwanted in ('\n', '\t', '\r', '\xa0'):
        body = body.replace(unwanted, ' ')
    return body

In [11]:
def web_scraper(url, number_of_articles=1, timeout=30):
    """Scrape headline links from a cover page and download each article's text.

    The cover page is searched for ``<h2 class="styles_headline__ice3t">``
    elements; every ``<a>`` inside them is treated as an article link. For
    each selected link the article page is fetched and the text of its
    ``<p>`` tags whose class is ``''`` or ``'endmark'`` is joined with spaces.

    The HTTP status code of the cover page and the number of links found are
    printed as a side effect.

    Parameters
    ----------
    url : str
        Address of the cover page.
    number_of_articles : int, default 1
        Maximum number of articles to download. Values larger than the
        number of links found are clamped instead of raising ``IndexError``.
    timeout : float, default 30
        Seconds passed as ``timeout`` to every ``requests.get`` call so a
        stalled server cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per downloaded article with the columns ``'Article Link'``
        and ``'Article Content'``.
    """
    # Request the cover page
    cover_response = requests.get(url, timeout=timeout)
    print(cover_response.status_code)

    cover_soup = BeautifulSoup(cover_response.content, 'html5lib')

    # News identification: every anchor nested inside a headline <h2>
    coverpage_news = [
        anchor
        for tag in cover_soup.find_all('h2', class_='styles_headline__ice3t')
        for anchor in tag.find_all('a')
    ]
    print('Number of articles found: {}'.format(len(coverpage_news)))

    # Never ask for more articles than were found (and never a negative count).
    wanted = max(0, min(number_of_articles, len(coverpage_news)))

    list_links = []
    news_contents = []
    for anchor in coverpage_news[:wanted]:
        link = anchor['href']
        list_links.append(link)

        # Reading the content (it is divided in paragraphs)
        article_response = requests.get(link, timeout=timeout)
        article_soup = BeautifulSoup(article_response.content, 'html5lib')
        paragraphs = article_soup.find_all('p', {'class': ['', 'endmark']})

        # Join once per article; an article without matching paragraphs
        # yields an empty string rather than reusing the previous article's
        # text or raising NameError.
        news_contents.append(" ".join(p.get_text() for p in paragraphs))

    return pd.DataFrame({
        'Article Link': list_links,
        'Article Content': news_contents})

In [12]:
# Cover page whose headline links are scraped below.
URL='https://www.nbcnews.com/'

In [13]:
# Download the first five articles linked from the cover page.
contents = web_scraper(URL, number_of_articles=5)

200
Number of articles found: 37


In [14]:
# Display the scraped links and article texts.
contents

Unnamed: 0,Article Link,Article Content
0,https://www.nbcnews.com/politics/immigration/t...,Roshan Mashal had been fighting for women’s ri...
1,https://www.nbcnews.com/news/world/far-right-s...,The far right has sent a shock wave through Fr...
2,https://www.nbcnews.com/news/us-news/kidnapped...,A Florida dog breeder who was being forced to ...
3,https://www.nbcnews.com/news/us-news/yellowsto...,The federal government plans to pump $50 milli...
4,https://www.nbcnews.com/news/world/tooth-patri...,A tooth believed to have been ripped from a ma...


In [15]:
# Display only the article text column.
contents['Article Content']

0    Roshan Mashal had been fighting for women’s ri...
1    The far right has sent a shock wave through Fr...
2    A Florida dog breeder who was being forced to ...
3    The federal government plans to pump $50 milli...
4    A tooth believed to have been ripped from a ma...
Name: Article Content, dtype: object

In [16]:
def get_unique_results(model_output):
    """Collect the distinct person, organisation and location entities of a document.

    Parameters
    ----------
    model_output
        Object exposing an iterable ``ents``; each entity must provide
        ``label_`` and ``text`` attributes.

    Returns
    -------
    dict
        ``{'NAME': [...], 'ORGANIZATION': [...], 'LOCATION': [...]}`` where
        each list holds the entity texts labelled ``PERSON``, ``ORG`` and
        ``GPE`` respectively, in order of first appearance and without
        duplicates. Entities with any other label are ignored.
    """
    # Entity label -> key of the result dictionary.
    label_to_key = {'PERSON': 'NAME', 'ORG': 'ORGANIZATION', 'GPE': 'LOCATION'}
    article = {key: [] for key in label_to_key.values()}

    for ent in model_output.ents:
        key = label_to_key.get(ent.label_)
        if key is None:
            # Not one of the three target entity types.
            continue
        seen = article[key]
        if ent.text not in seen:
            seen.append(ent.text)
    return article

In [17]:
def get_ner_for_all(article, model):
    '''
    Run NER over every article and return the results in a new dataframe.

    Parameters
    ----------
    article : pandas.DataFrame
        Frame with an 'Article Content' column holding the article texts.
        It is not modified.
    model : callable
        Called with each article text; its return value is passed to
        get_unique_results.

    Returns
    -------
    pandas.DataFrame
        Copy of ``article`` whose 'Article Content' column holds, for each
        row, the dictionary returned by get_unique_results.
    '''
    final_out = article.copy()
    # Assign the whole column by name: this does not depend on the column's
    # position or on the index being 0..n-1, unlike positional iloc writes
    # driven by index labels.
    final_out['Article Content'] = [
        get_unique_results(model(text)) for text in article['Article Content']
    ]
    return final_out

In [18]:
# Extract the unique names, organisations and locations for every scraped article.
output = get_ner_for_all(article=contents, model=NER)
output

Unnamed: 0,Article Link,Article Content
0,https://www.nbcnews.com/politics/immigration/t...,"{'NAME': ['Roshan Mashal', 'Biden', 'Chris Geo..."
1,https://www.nbcnews.com/news/world/far-right-s...,"{'NAME': ['Emmanuel Macron', 'Macron', 'Jean-L..."
2,https://www.nbcnews.com/news/us-news/kidnapped...,"{'NAME': ['Cameron White', 'Tsdekiel Sellers',..."
3,https://www.nbcnews.com/news/us-news/yellowsto...,"{'NAME': ['Gardiner', 'Old Gardiner Road'], 'O..."
4,https://www.nbcnews.com/news/world/tooth-patri...,"{'NAME': ['Patrice Lumumba', 'George Floyd', '..."


In [19]:
# Keep the full parsed document of every article so it can be rendered with displaCy.
render_list = [NER(content) for content in contents['Article Content']]

In [20]:
# Number of parsed documents (one per article).
len(render_list)

5

In [21]:
# Display the per-article NER dictionaries again.
output

Unnamed: 0,Article Link,Article Content
0,https://www.nbcnews.com/politics/immigration/t...,"{'NAME': ['Roshan Mashal', 'Biden', 'Chris Geo..."
1,https://www.nbcnews.com/news/world/far-right-s...,"{'NAME': ['Emmanuel Macron', 'Macron', 'Jean-L..."
2,https://www.nbcnews.com/news/us-news/kidnapped...,"{'NAME': ['Cameron White', 'Tsdekiel Sellers',..."
3,https://www.nbcnews.com/news/us-news/yellowsto...,"{'NAME': ['Gardiner', 'Old Gardiner Road'], 'O..."
4,https://www.nbcnews.com/news/world/tooth-patri...,"{'NAME': ['Patrice Lumumba', 'George Floyd', '..."


In [22]:
# Render the first parsed article with displaCy's entity ("ent") style inside the notebook.
displacy.render(render_list[0],style="ent",jupyter=True)

In [23]:
def save_to_json(results, path):
    """Write the 'Article Content' values to a JSON file keyed by article link.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame with the columns 'Article Link' and 'Article Content'.
    path : str
        Prefix of the destination. The literal 'output.json' is appended to
        it as-is (no separator is inserted), so pass a directory prefix
        ending in a path separator to write into that directory.
    """
    by_link = results.set_index('Article Link')
    link_to_content = by_link.to_dict()['Article Content']

    destination = path+'output.json'
    with open(destination, 'w') as handle:
        json.dump(link_to_content, handle, indent=4)

def save_to_csv(results, path):
    """Write the results to a CSV file with the article link as the first column.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame containing an 'Article Link' column, which becomes the index
        of the written table.
    path : str
        Prefix of the destination. The literal 'output.csv' is appended to
        it as-is (no separator is inserted).
    """
    by_link = results.set_index('Article Link')
    destination = path+'output.csv'
    by_link.to_csv(destination)

In [24]:
# NOTE(review): save_to_json appends 'output.json' to the given path, so this
# call writes './output.jsonoutput.json' (the name a later cell reads back).
save_to_json(output, './output.json')

In [25]:
# NOTE(review): save_to_csv appends 'output.csv' to the given path, so this
# call writes './output.csvoutput.csv' (the name a later cell reads back).
save_to_csv(output, './output.csv')

In [26]:
# Read the JSON export back to verify it. The context manager closes the file
# handle as soon as the data is loaded instead of leaving it open.
with open('./output.jsonoutput.json') as json_file:
    json_data = json.load(json_file)
json_data

{'https://www.nbcnews.com/news/us-news/kidnapped-florida-man-drove-erratically-police-pull-authorities-say-rcna34415': {'LOCATION': ['Florida',
   'Atlanta',
   'Buffalo',
   'Tallahassee',
   'West Palm Beach',
   'Martin County'],
  'NAME': ['Cameron White',
   'Tsdekiel Sellers',
   'Benyahveen Radcliffe',
   'Kashaveeyah Bragdon'],
  'ORGANIZATION': ['the Martin County Sheriff’s Office',
   'White',
   'the Port St. Lucie Police Department']},
 'https://www.nbcnews.com/news/us-news/yellowstone-reopen-2-weeks-influx-cash-repairs-rcna34421': {'LOCATION': ['Yellowstone',
   'Wyoming',
   'Montana',
   'Idaho',
   'Cooke City',
   'Lamar Valley'],
  'NAME': ['Gardiner', 'Old Gardiner Road'],
  'ORGANIZATION': ['Yellowstone National Park',
   'The National Park Service',
   'the National Weather Service',
   'the Federal Highway Administration']},
 'https://www.nbcnews.com/news/world/far-right-surges-french-election-macron-presidency-turmoil-rcna34365': {'LOCATION': ['France',
   'Washi

In [27]:
# Read the CSV export back to verify it. 'Article Link' is loaded as an
# ordinary column here; the entity dictionaries come back as text because CSV
# stores every cell as a string.
csv_df = pd.read_csv('./output.csvoutput.csv')
csv_df

Unnamed: 0,Article Link,Article Content
0,https://www.nbcnews.com/politics/immigration/t...,"{'NAME': ['Roshan Mashal', 'Biden', 'Chris Geo..."
1,https://www.nbcnews.com/news/world/far-right-s...,"{'NAME': ['Emmanuel Macron', 'Macron', 'Jean-L..."
2,https://www.nbcnews.com/news/us-news/kidnapped...,"{'NAME': ['Cameron White', 'Tsdekiel Sellers',..."
3,https://www.nbcnews.com/news/us-news/yellowsto...,"{'NAME': ['Gardiner', 'Old Gardiner Road'], 'O..."
4,https://www.nbcnews.com/news/world/tooth-patri...,"{'NAME': ['Patrice Lumumba', 'George Floyd', '..."


In [109]:
!pip install sqlalchemy
!pip install pyodbc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyodbc
  Downloading pyodbc-4.0.32.tar.gz (280 kB)
[K     |████████████████████████████████| 280 kB 5.2 MB/s 
[?25hBuilding wheels for collected packages: pyodbc
  Building wheel for pyodbc (setup.py) ... [?25l[?25hdone
  Created wheel for pyodbc: filename=pyodbc-4.0.32-cp37-cp37m-linux_x86_64.whl size=287366 sha256=57ced4025da42003e9051a3d6d20b41ee793456caf7ce4dfb07c9d7af012ab71
  Stored in directory: /root/.cache/pip/wheels/2e/9c/da/8652fd42e0f662015554f00a9e96fe4f438dfd1ef59787879e
Successfully built pyodbc
Installing collected packages: pyodbc
Successfully installed pyodbc-4.0.32


In [31]:
# One plain dict of entity lists per article.
data_list = list(map(dict, output['Article Content']))

In [32]:
# Turn the first article's entity lists into a table with one column per
# entity type. Wrapping each list in a Series lets lists of different lengths
# share one frame; shorter columns are padded with missing values.
dict1 = data_list[0]
df = pd.DataFrame({label: pd.Series(values) for label, values in dict1.items()})

In [228]:
# Display the entity table built from the first article.
df

Unnamed: 0,NAME,ORGANIZATION,LOCATION
0,Serhiy Haidai,Ukraine,Russia
1,Josep Borrell,Lysychansk,Ukraine
2,Borrell,Sievierodonetsk,Luhansk
3,Dmitry Muratov,Haidai,Moscow
4,Novaya Gazeta,The Associated Press,Sievierodonetsk
5,,the defense ministry,Ukrainian
6,,The European Union’s,Luxembourg
7,,the United Nations,New York
8,,Kremlin,


In [28]:
import sqlite3

In [29]:
# Connect to the SQLite database 'test_database' (a path relative to the
# working directory) and create a cursor for running statements.
conn = sqlite3.connect('test_database')
c = conn.cursor()

In [30]:
# Column names containing a space must be quoted. Unquoted, SQLite parses
# `Article link` as a column named Article with declared type "link" (and
# likewise `NER results` as column NER with type "results").
c.execute('CREATE TABLE IF NOT EXISTS articles ("Article link", "NER results")')
conn.commit()

In [33]:
# Write df to the 'articles' table, replacing any existing table of that name;
# the DataFrame index is not stored.
df.to_sql('articles', conn, if_exists='replace', index = False)

In [34]:
# Query every row of the articles table and print each one as a tuple.
c.execute('''  
SELECT * FROM articles
          ''')

rows = c.fetchall()
for record in rows:
    print(record)

('Roshan Mashal', 'Taliban', 'Afghanistan')
('Biden', 'the State Department', 'Kabul')
('Chris George', 'Operation Allies Welcome', 'U.S.')
('George', 'Mashal', 'Texas')
('Alison Hoeman', 'Social Security', 'America')
('Teresa Casale', 'Medicaid', 'Dallas')
(None, 'State', 'Wisconsin')
(None, 'Health and Human Services', 'Washington')
(None, 'Congress', 'D.C.')
(None, 'the Supplemental Nutrition Assistance Program', 'Arlington')
(None, 'DFW Refugee Outreach Services', 'Connecticut')
(None, 'the University of Texas', 'Pennsylvania')
(None, 'Gender Studies', 'Iowa')
(None, 'the Georgetown Institute of Women, Peace and Security', 'Mashal')
(None, 'the Texas International Education Consortium', 'Mina')
(None, 'Uber', None)
(None, 'State Department', None)
(None, 'the Integrated Refugee and Immigrant Services', None)
(None, 'Trump', None)
(None, 'Des Moines Refugee Support', None)
(None, 'Mina’s List', None)
