In [1]:
# Data handling and serialization utilities.
import pandas as pd
import numpy as np
import pickle
import re

# NLTK provides the Punkt sentence tokenizer used further down; make sure its
# data package is present locally.
import nltk
nltk.download('punkt')

# spaCy provides the named-entity recognizer used further down.
import spacy
# One-time model install (run in a shell): python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
%%time
text_df = pd.read_csv('combined_text.csv')
text_df.set_index('text_id', inplace = True)
text_df

Wall time: 47.1 s


Unnamed: 0_level_0,text
text_id,Unnamed: 1_level_1
0,Anarchism is an anti-authoritarian political a...
1,Autism is a developmental disorder characteriz...
2,"Albedo () (, meaning 'whiteness') is the measu..."
3,A or a is the first letter and the first vowel...
4,Alabama () is a state in the southeastern regi...
...,...
5343560,Daming Zhu is an Assistant Dean for Continuing...
5343561,"Tony Oshey Dews (born June 6, 1973) is an Amer..."
5343562,(EC-PL20ZZBPRUS) is an sleek design digital co...
5343563,Major General Nils-Fredrik Palmstierna (8 Marc...


In [3]:
%%time
# Load the entity-mention table. Per the displayed output its columns are
# entity, page_id, item_id and text_id.
entity_df = pd.read_csv('combined_entities_filtered.csv')
entity_df

Wall time: 34.7 s


Unnamed: 0,entity,page_id,item_id,text_id
0,social philosophy,586276,180592,0
1,cooperative,89313,4539,0
2,far-left,18247344,1129409,0
3,political spectrum,23490,210918,0
4,collectivism,5708,237789,0
...,...,...,...,...
26712752,Tomasa Tequiero,39519608,6148330,5343559
26712753,Sos mi hombre,39950100,6132611,5343559
26712754,Luis Gatica,2099374,6316177,5343559
26712755,Lucho Gatica,2112544,954681,5343559


In [4]:
"""
%%time
item_page_dict = pickle.load(open("item_dict_filtered.p", "rb"))
print(item_page_dict['page_id']['tesla'])
"""

'\n%%time\nitem_page_dict = pickle.load(open("item_dict_filtered.p", "rb"))\nprint(item_page_dict[\'page_id\'][\'tesla\'])\n'

In [5]:
%%time
item_dict = pickle.load(open("item_dict_full.p", "rb"))
print(item_dict['tesla'])

[163343, 478214, 765530, 1050485, 1428953, 1548225, 2384079, 2406220, 7705502, 19565583, 27701406, 31803712, 37251206, 56084926, 9036, 16258100]
Wall time: 3min 16s


In [6]:
def get_id(entity):
    """Look up the id list stored in ``item_dict`` for an entity name.

    The name is first tried exactly as given. On a miss it is retried after
    stripping surrounding quotes, apostrophes and spaces; if there was nothing
    to strip, a leading ``'the '`` is dropped instead. These two
    normalizations repeat until a variant is found or nothing more can be
    removed.

    Args:
        entity: Name to look up (a string). The ``'the '`` prefix test is
            case-sensitive, so callers should lower-case the name first.

    Returns:
        The value stored in ``item_dict`` for the first variant that matches,
        or an empty list when no variant is present.
    """
    candidate = entity
    while True:
        try:
            return item_dict[candidate]
        except KeyError:
            # Only a genuine miss falls through to the normalization steps.
            # Any other error (for example item_dict not being loaded yet)
            # must surface instead of silently turning into "no ids".
            pass

        trimmed = candidate.strip('\'" ')
        if len(trimmed) < len(candidate):
            candidate = trimmed
        elif candidate[:4] == 'the ':
            candidate = candidate[4:]
        else:
            return []
    
def get_ids(entities):
    """Return the ``get_id`` result for every name, in input order.

    Each name is lower-cased and stripped of surrounding whitespace before
    the lookup.
    """
    id_lists = []
    for name in entities:
        normalized = name.lower().strip()
        id_lists.append(get_id(normalized))
    return id_lists

# Quick check: the quoted / 'The '-prefixed variant resolves to the same ids as
# the bare name, and names absent from item_dict come back as empty lists.
print(get_ids(['Tesla', '"The Tesla', 'Teslarati', 'The Teslarati']))

[[163343, 478214, 765530, 1050485, 1428953, 1548225, 2384079, 2406220, 7705502, 19565583, 27701406, 31803712, 37251206, 56084926, 9036, 16258100], [163343, 478214, 765530, 1050485, 1428953, 1548225, 2384079, 2406220, 7705502, 19565583, 27701406, 31803712, 37251206, 56084926, 9036, 16258100], [], []]


In [7]:
# Much faster than sampling using Pandas

def get_samples(n, dataframe, seed=None):
    """Draw ``n`` rows from ``dataframe`` uniformly at random, with replacement.

    Row positions are drawn with NumPy and fetched with a single ``iloc``
    call.

    Args:
        n: Number of rows to draw.
        dataframe: Source frame. It must contain at least one row, because
            ``randint`` rejects an empty range.
        seed: Optional seed. When given, a dedicated ``RandomState`` seeded
            with it is used, so the same call returns the same rows. When
            ``None`` (the default) NumPy's global generator is used, exactly
            as before this parameter existed.

    Returns:
        The selected rows as a NumPy array (one row per draw, columns in the
        frame's order) whose first column has been converted to ``str``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    positions = rng.randint(len(dataframe), size=n)
    samples = dataframe.iloc[positions, :].to_numpy()
    # Normalize the first column to text (non-string values become their str form).
    samples[:, 0] = samples[:, 0].astype(str)
    return samples

# Draw 100,000 entity mentions (with replacement) as the basis for the dataset.
# The draw uses NumPy's global random state, so the rows differ between runs
# unless that state is seeded beforehand.
samples = get_samples(100000, entity_df)
samples

array([['Voivode', 149044, 275102, 678913],
       ['Thoroughbred', 18842022, 210826, 4212317],
       ['Oasis', 6542720, 6389213, 2976245],
       ...,
       ['SNK Arcade Classics Vol. 1', 18598659, 7391853, 1099414],
       ['football', 10568, 2736, 4803525],
       ['serial communication', 194114, 518280, 33671]], dtype=object)

In [8]:
# English Punkt sentence tokenizer, loaded from the NLTK 'punkt' data package.
# NOTE(review): newer NLTK releases ship 'punkt_tab' instead of pickled Punkt
# models -- confirm this resource path still loads on the installed version.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
def get_sentence(row):
    """Build the entity context for one sampled mention.

    Finds the first occurrence of the mention in its source text, takes the
    surrounding sentence window, removes the mention from that window, runs
    the spaCy pipeline over what is left, and resolves every collected
    surface form through ``get_ids``.

    Args:
        row: ``(entity, page_id, item_id, text_id)``, i.e. one row produced
            by ``get_samples``.

    Returns:
        A 3-tuple ``(entity_id_list, entity_list, (entity, page_id, item_id))``.
        ``entity_list`` is an object ndarray of unique lower-cased surface
        forms in order of first appearance, with the mention itself first;
        ``entity_id_list`` is ``get_ids(entity_list)`` and is aligned with it.

    Raises:
        ValueError: If the mention does not occur in the text.
        KeyError: If ``text_id`` is not present in the index of ``text_df``.
    """
    entity, page_id, item_id, text_id = row
    # Look the text up by its `text_id` label. A positional lookup is only
    # correct while the index happens to be a gap-free 0..N-1 range.
    text = text_df.at[text_id, text_df.columns[0]]

    entity_start = text.find(entity)
    if entity_start == -1:
        raise ValueError(f'[{entity}] was not found in text:\n {text}')
    entity_end = entity_start + len(entity)

    # Sentence window: from the last sentence start at or before the mention
    # up to the first sentence start at or after the end of the mention.
    sentence_start = 0
    sentence_end = len(text)
    for split, _ in sent_detector.span_tokenize(text):
        if split <= entity_start:
            sentence_start = split
        elif split >= entity_end:
            sentence_end = split
            break
        # A boundary that falls inside the mention itself is ignored.

    # The pipeline sees the sentence with the mention cut out.
    doc = nlp(text[sentence_start:entity_start] + text[entity_end:sentence_end])

    surface_forms = [entity.lower()]
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and len(ent.text.split()) > 2:
            try:
                item_dict[ent.text.lower()]
            except KeyError:
                # Full name unknown: also try "first last" without middle parts.
                name_parts = ent.text.lower().split()
                surface_forms.append(name_parts[0] + ' ' + name_parts[-1])

    # Every recognized entity except dates is kept as a candidate surface form.
    surface_forms.extend(ent.text.lower() for ent in doc.ents if ent.label_ != 'DATE')

    # Wrap in an object Series: pd.unique on a plain list is deprecated. The
    # result is still an object ndarray in order of first appearance.
    entity_list = pd.unique(pd.Series(surface_forms, dtype=object))
    entity_id_list = get_ids(entity_list)
    return entity_id_list, entity_list, (entity, page_id, item_id)
    
# Preview the generated records for the first five sampled mentions.
list(map(get_sentence, samples[:5]))

[([[275102],
   [1675432, 56538769, 30272204],
   [18366588, 217, 209754],
   [],
   [],
   [7913, 2175775, 57477615],
   [3627558, 2497107],
   [13211, 26245],
   [4135847,
    4135849,
    4135859,
    7128194,
    16562681,
    17536409,
    31283669,
    11692785],
   [809, 385059, 22061936, 36982750, 57477613],
   []],
  array(['voivode', 'iuga', 'moldavia', '1400)(known', 'iurg', 'romanian',
         'yury', 'ruthenian', 'jerzy', 'polish', 'ologul'], dtype=object),
  ('Voivode', 149044, 275102)),
 ([[210826,
    7796628,
    7796630,
    7796631,
    7796633,
    15615095,
    7768898,
    7768899,
    22137032,
    56322631],
   [23411,
    25588,
    37141,
    186689,
    186799,
    2075023,
    2422126,
    2807587,
    3018412,
    3597401,
    3868542,
    4028767,
    4549327,
    4549328,
    4549330,
    4549332,
    4549333,
    4549334,
    6085538,
    9128774,
    10419720,
    16149787,
    16525661,
    18159512,
    18712667,
    19723593,
    20015844,
    20992

In [10]:
%%time
sentences = [get_sentence(i) for i in samples]
len(sentences)

Wall time: 21min 8s


100000

In [12]:
%%time
# Persist the generated records to disk. HIGHEST_PROTOCOL selects the newest
# pickle format this interpreter supports, so older Python versions may not be
# able to read the file back.
with open("dataset.p","wb") as f:
    pickle.dump(sentences, f, protocol=pickle.HIGHEST_PROTOCOL)

Wall time: 11.6 s
