In [276]:
import nltk, pickle
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter
import os
from os import path, makedirs
import news_loader

In [278]:
def extract_single(tokens, by_type=False):
    """
    Gets the named entities in the list of tokens
    
    Parameters:
    -----------
    tokens:
        a tokenized document containing named entities
    
    Returns:
    --------
    list of strings:
        each string is the name of an entity
    """
    entities = []
    pos = nltk.pos_tag(tokens) # label parts of speech
    named_entities = nltk.ne_chunk(pos, binary=not by_type) # identify named entities
    for i in range(0, len(named_entities)):
        ents = named_entities.pop()
        if getattr(ents, 'label', None) != None and ents.label() == "NE": 
            entities.append(([ne for ne in ents]))
    extracted = np.array(entities)
    if extracted.ndim == 3:
        final = extracted[:,:,0].tolist()
    else:
        final = []
        for entity in extracted:
            final.append(np.array(entity)[:,0].tolist())
    r = []
    for entity in final:
        entity = " ".join(entity)
        r.append(entity)
    return r

In [279]:
def relate_entities(extracted):
    """
    
    Returns:
    --------
        Counter:
            
    """
    occurences = Counter(extracted)
    return occurences

In [280]:
def create_db(extracted=None):
    """
    Creates a database of named entities occuring the supplied article
    
    Parameters:
    -----------
    extracted (list of strings):
        the extracted named entities from a single documetn
        
    Returns:
    --------
    dictionary (string --> Counter):
        database relating the named entities to 
    """
    global db
    db = {}
    if extracted is not None:
        for entity in extracted:
            db[entity] = Counter(extracted)
            del db[entity][entity]
    return db

In [281]:
def update_single(extracted):
    for entity in extracted:
        if entity in db:
            db[entity].update(Counter(extracted))
        else:
            db[entity] = Counter(extracted)
        del db[entity][entity]
    return db

In [262]:
DATABASE_FR = "data/entities_database.txt"

In [263]:
def new_database(filepath=DATABASE_FR):
    """
    Creates a new text file and folder in the filepath; uses 

    If creating additional filepaths, specify it in the filepath variable 
    in all functions with the filepath kwarg
    """
    if not os.path.exists(filepath):
        os.makedirs(str.partition(filepath, "/")[0])
        with open(filepath, "w+"):
            pass

new_database(filepath=DATABASE_FR)

In [258]:
def write_database(filepath=DATABASE_FR):
    with open(filepath, "wb") as f:
        pickle.dump(db, f)

In [246]:
def retrieve_database(filepath=DATABASE_FR):
    with open(filepath, "rb") as f:
        db = pickle.load(f)
    return db

In [271]:
def clear_database(filepath=DATABASE_FR):
    db = {}
    with open(filepath, "wb") as f:
        pickle.dump(db, f)

In [272]:
def top_related(entity):
    return db[entity].most_common()

In [284]:
def update(docs):
    for doc in docs:
        update_single(extract_entities(doc))
    return db

In [285]:
update(news_loader.for_ner())

downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/fwIVm9YOgzo/us-cyber-france-facebook-spies-exclusive-idUSKBN1AC0EI
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/5S-3l10j6wQ/us-usa-trump-russia-sanctions-idUSKBN1AC1U8
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/_H47_5h6Up4/us-britain-baby-idUSKBN1AC1C1
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/MVgH3StW3FI/us-israel-palestinians-idUSKBN1AC0UF
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/Rfs2YUfZuX8/us-usa-northkorea-army-idUSKBN1AC2V3
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/BkvXqd7sJk4/us-mideast-crisis-syria-un-idUSKBN1AC315
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/9gthxn-Unx4/us-nigeria-security-idUSKBN1AC30B
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/jUGQkMqtkW8/us-poland-politics-judiciary-kaczynski-idUSKBN1AC2R4
downloading: http://feeds.reuters.com/~r/Reuters/worldNews/~3/WEP96rO-BgE/us-

KeyError: 'Michel Rose Paris Jack Stubbs Moscow'