In [191]:
import pandas as pd
import numpy as np
import pickle
import re

import nltk
from nltk import tokenize
import spacy

# Fetch the Punkt data that nltk's sent_tokenize relies on (a no-op when it is already installed).
nltk.download('punkt')
# The spaCy model has to be installed once beforehand, from a shell:
#   python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
%%time
text_df = pd.read_csv('combined_text.csv')
text_df.set_index('text_id', inplace = True)
text_df.head()

Wall time: 30.1 s


Unnamed: 0_level_0,text
text_id,Unnamed: 1_level_1
0,Anarchism is an anti-authoritarian political a...
1,Autism is a developmental disorder characteriz...
2,"Albedo () (, meaning 'whiteness') is the measu..."
3,A or a is the first letter and the first vowel...
4,Alabama () is a state in the southeastern regi...


In [3]:
%%time
# Load the entity-mention table; each row holds (entity, page_id, text_id).
entity_df = pd.read_csv('combined_entities.csv')
entity_df.head()

Wall time: 21.4 s


Unnamed: 0,entity,page_id,text_id
0,anti-authoritarian,867979,0
1,political,23040,0
2,social philosophy,586276,0
3,hierarchies,13998,0
4,self-managed,40949353,0


In [4]:
%%time
item_dict = pickle.load(open("item_dict.p", "rb"))
print(item_dict['tesla'])

[9036, 163343, 478214, 765530, 1050485, 1428953, 1548225, 2384079, 2406220, 7705502, 16258100, 19565583, 27701406, 31803712, 37251206, 56084926]
Wall time: 1min 35s


In [316]:
def get_item_ids(entities):
    """Look up candidate ids for one entity name or for a collection of names.

    String names are lower-cased before being looked up in the module-level
    ``item_dict``; non-string keys are looked up unchanged.

    Parameters
    ----------
    entities : str, list, numpy.ndarray or other hashable
        A single entity name, or a list/array of entity names.

    Returns
    -------
    list
        For a single entity, the value stored in ``item_dict`` for it, or an
        empty list when the name is unknown. For a list/array, one such
        result per element, in input order.
    """
    def lookup(entity):
        # isinstance (rather than an exact type comparison) also lower-cases
        # str subclasses such as numpy.str_. Non-string elements (e.g. a NaN
        # read from a CSV) are looked up as-is instead of raising
        # AttributeError on .lower().
        key = entity.lower() if isinstance(entity, str) else entity
        return item_dict.get(key, [])

    if isinstance(entities, (np.ndarray, list)):
        return [lookup(entity) for entity in entities]
    return lookup(entities)

# Quick check: a single string returns the list of ids stored for that name.
print(get_item_ids('tesla'))

[9036, 163343, 478214, 765530, 1050485, 1428953, 1548225, 2384079, 2406220, 7705502, 16258100, 19565583, 27701406, 31803712, 37251206, 56084926]


In [317]:
# Much faster than sampling using Pandas

def get_samples(n, dataframe, seed=None):
    """Draw ``n`` rows uniformly at random, with replacement, as a NumPy array.

    Row positions are drawn with NumPy and fetched with ``iloc``, so the same
    row can appear more than once in the result.

    Parameters
    ----------
    n : int
        Number of rows to draw.
    dataframe : pandas.DataFrame
        Frame to sample from; must contain at least one row.
    seed : int, optional
        When given, the draw uses a private ``RandomState(seed)`` and is
        reproducible without touching NumPy's global random state. When
        omitted, the global ``np.random`` state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, number_of_columns)`` holding the sampled rows.
    """
    # The np.random module and a RandomState instance both expose randint,
    # so the rest of the function is identical for both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)
    idx = rng.randint(len(dataframe), size=n)
    return dataframe.iloc[idx, :].to_numpy()

# Draw 10 random (entity, page_id, text_id) rows to experiment with.
samples = get_samples(10, entity_df)
samples

array([['Charles W. Sweeney', 839581, 1763442],
       ['Southern Nevada', 22216509, 627246],
       ['National Basketball Association', 22093, 551260],
       ['Erebidae', 27888690, 2364745],
       ['BMX rider', 38103, 3654846],
       ['Travel Channel (UK)', 5433720, 2455898],
       ['Deerfield, Wisconsin', 260370, 4298627],
       ['moth', 66633, 4262315],
       ['diagnostic tool', 19013767, 403672],
       ['Hong Kong General Chamber of Commerce', 18058884, 4083316]],
      dtype=object)

In [318]:
def get_sentence(row):
    """Find the first sentence mentioning an entity and collect candidate ids.

    The text for the row is split into sentences with NLTK. In the first
    sentence that contains the entity string, the first occurrence of the
    entity is cut out and spaCy NER is run on what remains. The target entity
    plus the recognised entities (all lower-cased, de-duplicated, target
    first) are then looked up with ``get_item_ids``.

    Parameters
    ----------
    row : sequence of (entity, page_id, text_id)
        One row of the entity table.

    Returns
    -------
    tuple
        ``(line, entity_id_list, entity_list, (entity, page_id))`` where
        ``line`` is the full matching sentence (entity included),
        ``entity_list`` is an object array of lower-cased entity names and
        ``entity_id_list`` holds one id list per name, in the same order.

    Raises
    ------
    ValueError
        If no tokenized sentence contains the entity string, for example
        when the tokenizer splits the text in the middle of the entity.
    """
    entity, page_id, text_id = row
    # NOTE(review): iat is positional, so this reads the row at position
    # text_id rather than the row labelled text_id. The two agree only if the
    # index of text_df is 0..N-1 in order - TODO confirm.
    text = text_df.iat[text_id, 0]

    for line in tokenize.sent_tokenize(text):
        idx = line.find(entity)
        if idx != -1:
            # Drop the first occurrence of the entity so NER runs on the
            # remaining context only.
            sentence = line[:idx] + line[idx + len(entity):]
            break
    else:
        # Previously this case fell through and failed with an
        # UnboundLocalError on `sentence`; fail with an explicit message.
        raise ValueError(
            f"entity {entity!r} not found in any sentence of text {text_id!r}"
        )

    mentions = [entity.lower()] + [ent.text.lower() for ent in nlp(sentence).ents]
    # pd.unique keeps first-seen order. It is given an ndarray because passing
    # a plain list is deprecated in recent pandas versions.
    entity_list = pd.unique(np.array(mentions, dtype=object))
    entity_id_list = get_item_ids(entity_list)
    return line, entity_id_list, entity_list, (entity, page_id)
    
# Build one (sentence, id lists, entity names, (entity, page_id)) tuple per sampled row.
sentences = [get_sentence(i) for i in samples]
sentences

[("On 9 August 1945, Bockscar, piloted by the 393d Bombardment Squadron's commander, Major Charles W. Sweeney, dropped a Fat Man nuclear bomb with a blast yield equivalent to 21 kilotons of TNT over the city of Nagasaki.",
  [[241128],
   [69276951],
   [697041],
   [],
   [239243,
    613617,
    702497,
    983927,
    1259251,
    2086011,
    4671286,
    4679633,
    4896145,
    6737995,
    6738003,
    6738004,
    6738009,
    7749682,
    11247470,
    11707054,
    13232557,
    13553889,
    16932055,
    16981612,
    18108359,
    19902046,
    20648870,
    21708176,
    27734840,
    28119893,
    35456557,
    48965718,
    55569052,
    58795659,
    60793902],
   [188163, 5501204],
   [],
   [170167,
    217939,
    688333,
    1105744,
    1654486,
    1756975,
    1757582,
    1975145,
    2335168,
    2376657,
    3324385,
    3979388,
    4050821,
    7670914,
    7670915,
    7670916,
    7670918,
    7670919,
    7810784,
    9358880,
    10376726,
    12741878