In [1]:
import nltk, pickle
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter
import os
from os import path, makedirs
import news_loader

In [2]:
def extract_single(tokens, by_type=False):
    """
    Gets the named entities in the list of tokens
    
    Parameters:
    -----------
    tokens:
        a tokenized document containing named entities
    
    Returns:
    --------
    list of strings:
        each string is the name of an entity
    """
    entities = []
    pos = nltk.pos_tag(tokens) # label parts of speech
    named_entities = nltk.ne_chunk(pos, binary=not by_type) # identify named entities
    for i in range(0, len(named_entities)):
        ents = named_entities.pop()
        if getattr(ents, 'label', None) != None and ents.label() == "NE": 
            entities.append(([ne for ne in ents]))
    extracted = np.array(entities)
    if extracted.ndim == 3:
        final = extracted[:,:,0].tolist()
    else:
        final = []
        for entity in extracted:
            final.append(np.array(entity)[:,0].tolist())
    r = []
    for entity in final:
        entity = " ".join(entity)
        r.append(entity)
    return r

In [3]:
def relate_entities(extracted):
    """
    
    Returns:
    --------
        Counter:
            
    """
    occurences = Counter(extracted)
    return occurences

In [4]:
def create_db(extracted=None):
    """
    Creates a database of named entities occuring the supplied article
    
    Parameters:
    -----------
    extracted (list of strings):
        the extracted named entities from a single documetn
        
    Returns:
    --------
    dictionary (string --> Counter):
        database relating the named entities to 
    """
    global db
    db = {}
    if extracted is not None:
        for entity in extracted:
            db[entity] = Counter(extracted)
            del db[entity][entity]
    return db

In [5]:
def update_single(extracted):
    for entity in extracted:
        if entity in db:
            db[entity].update(Counter(extracted))
        else:
            db[entity] = Counter(extracted)
        del db[entity][entity]
    return db

In [6]:
DATABASE_FR = "data/entities_database.txt"

In [7]:
def new_database(filepath=DATABASE_FR):
    """
    Creates a new text file and folder in the filepath; uses 

    If creating additional filepaths, specify it in the filepath variable 
    in all functions with the filepath kwarg
    """
    if not os.path.exists(filepath):
        os.makedirs(str.partition(filepath, "/")[0])
        with open(filepath, "w+"):
            pass

new_database(filepath=DATABASE_FR)

In [8]:
def write_database(filepath=DATABASE_FR):
    with open(filepath, "wb") as f:
        pickle.dump(db, f)

In [9]:
def retrieve_database(filepath=DATABASE_FR):
    with open(filepath, "rb") as f:
        db = pickle.load(f)
    return db

In [10]:
def clear_database(filepath=DATABASE_FR):
    db = {}
    with open(filepath, "wb") as f:
        pickle.dump(db, f)

In [11]:
def top_related(entity):
    return db[entity].most_common()

In [21]:
def update(docs):
    for doc in docs:
        update_single(extract_single(doc))
    return db

In [13]:
docs = news_loader.for_ner(filter_text=False)
print(docs[0])

downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/fwIVm9YOgzo/us-cyber-france-facebook-spies-exclusive-idUSKBN1AC0EI
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/5S-3l10j6wQ/us-usa-trump-russia-sanctions-idUSKBN1AC1U8
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/_H47_5h6Up4/us-britain-baby-idUSKBN1AC1C1
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/MVgH3StW3FI/us-israel-palestinians-idUSKBN1AC0UF
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/Rfs2YUfZuX8/us-usa-northkorea-army-idUSKBN1AC2V3
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/BkvXqd7sJk4/us-mideast-crisis-syria-un-idUSKBN1AC315
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/9gthxn-Unx4/us-nigeria-security-idUSKBN1AC30B
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/jUGQkMqtkW8/us-poland-politics-judiciary-kaczynski-idUSKBN1AC2R4
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/WEP96rO-BgE/us-

In [22]:
extraction = update(docs)

In [23]:
db

{'Abdullah': Counter({'Amman': 2,
          'Aqsa': 4,
          'Benjamin Netanyahu': 2,
          'CCTV': 2,
          'Donald Trump': 2,
          'East Jerusalem': 8,
          'Embassy': 4,
          'Holy Land': 2,
          'Islam': 2,
          'Israel': 24,
          'Israeli': 16,
          'JERUSALEM': 2,
          'Jerusalem': 8,
          'Jordan': 6,
          'Judaism': 2,
          'L5N1KI8C8': 2,
          'Middle East': 4,
          'Muslim': 6,
          'Muslim Arab': 2,
          'Muslims': 8,
          'Naftali Bennett': 2,
          'Netanyahu': 16,
          'Noble Sanctuary': 6,
          'Old City': 4,
          'Palestinian': 12,
          'Palestinians': 2,
          'Temple': 2,
          'Temple Mount': 2,
          'U.S.': 2,
          'United Nations': 4,
          'Waqf': 2,
          'Waqf Muslim': 2,
          'West Bank': 2,
          'Western Wall': 2}),
 'Ahmed Mohammed': Counter({'Islamist': 1,
          'Lake Chad Basin': 2,
          'Maiduguri'

In [17]:
create_db()

{}

In [19]:
update_single(extraction)

{'American': Counter({'Britain': 1,
          'Defence Ministry': 1,
          'Democratic National Committee': 1,
          'Dmitry Peskov': 1,
          'Emmanuel Macron': 3,
          'Facebook': 5,
          'Fancy Bear': 1,
          'France': 4,
          'French': 6,
          'GRU': 4,
          'Jack Stubbs': 1,
          'Jonathan Weber': 1,
          'Joseph Menn': 1,
          'Kremlin': 2,
          'Le Mans': 1,
          'Le Pen': 1,
          'Le Touquet': 1,
          'LinkedIn': 1,
          'MI5': 1,
          'Macron': 8,
          'Marine Le Pen': 1,
          'Michel Rose': 1,
          'Moscow': 1,
          'Mounir Mahjoubi': 1,
          'PHOTO': 2,
          'Paris': 1,
          'Ross Colvin': 1,
          'Russia': 4,
          'Russian': 4,
          'SAN': 1,
          'San Francisco': 1,
          'U.S.': 3}),
 'Britain': Counter({'American': 1,
          'Defence Ministry': 1,
          'Democratic National Committee': 1,
          'Dmitry Peskov': 1,
  

In [27]:
top_related("Donald Trump")

[('Charlie', 24),
 ('Trump', 20),
 ('Russian', 14),
 ('U.S.', 12),
 ('Israel', 12),
 ('Senate', 8),
 ('Moscow', 8),
 ('Netanyahu', 8),
 ('Israeli', 8),
 ('Mueller', 6),
 ('White House', 6),
 ('U.S. House', 6),
 ('United States', 6),
 ('Russia', 6),
 ('Palestinian', 6),
 ('Caracas', 6),
 ('Venezuela', 5),
 ('CNN', 4),
 ('Republican', 4),
 ('Putin', 4),
 ('Obama', 4),
 ('Finland', 4),
 ('Iran', 4),
 ('North Korea', 4),
 ('Congress', 4),
 ('Vladimir Putin', 4),
 ('Great Ormond Street Hospital', 4),
 ('Jerusalem', 4),
 ('East Jerusalem', 4),
 ('Muslims', 4),
 ('Venezuelan', 4),
 ('High Court', 3),
 ('Noble Sanctuary', 3),
 ('Jordan', 3),
 ('Muslim', 3),
 ('Maduro', 3),
 ('Nicolas Maduro', 3),
 ('Speaker Ryan', 2),
 ('Sessions', 2),
 ('Special Counsel Robert Mueller', 2),
 ('Sauli Niinisto', 2),
 ('Finnish', 2),
 ('State Department', 2),
 ('Edward Fishman', 2),
 ('Sarah Sanders', 2),
 ('Democratic', 2),
 ('Corker', 2),
 ('Barack Obama', 2),
 ('European Union', 2),
 ('Ukraine', 2),
 ('Kremli

In [None]:
def common_entities(docs, k=None):
    """
    Given a list of token-lists, return the most common entity among 
    """