In [8]:
import nltk
import spacy
import pickle
nlp = spacy.load("en_core_web_sm")

In [2]:
text = ""
with open("./data/humayun_text", "r") as file:
    text = file.read()

print(f"{text[:40]}  [...{len(text)-40} chars more]")

Step away from the buzz and bustle of De  [...53059 chars more]


In [3]:
sentences = nltk.sent_tokenize(text) 
print(f"Found {len(sentences)} sentences")

Found 568 sentences


In [13]:
sentences_map = {} # sentence id -> sentence_text
seen_sentences = set() # seen set

graph = {} # entity -> set(sentence ids)

idx = 0
for sentence in sentences:
    
    sentence = ' '.join(sentence.split()).lower()
    if sentence in seen_sentences:
        continue

    curr_idx = idx
    
    sentences_map[curr_idx] = sentence
    seen_sentences.add(sentence)
    idx += 1

    doc = nlp(sentence)
    for chunk in doc.noun_chunks:
        chunk = chunk.text.lower()
        if chunk in graph:
            graph[chunk].add(curr_idx)
        else:
            graph[chunk] = set((curr_idx, ))

for entity, linked_sentences in tuple(graph.items())[:10]:
    print(f"{entity}: {linked_sentences}")
print(f"[...{len(graph)} more]")    

print(f"\nSaw {len(seen_sentences)} sentences and {len(graph)} entities!")

the buzz: {0}
bustle: {0}
delhi: {0, 98, 450, 452, 39, 264, 456, 49, 306, 95}
crowded roads: {0}
neighbourhoods: {0}
the tranquil oasis: {0}
humayun’s tomb: {0, 355, 101, 261, 383, 271, 144, 178, 83, 19, 181, 437, 280, 25, 31}
the nizamuddin east area: {0}
it: {1, 514, 8, 10, 526, 529, 530, 19, 22, 28, 29, 32, 544, 545, 546, 85, 102, 109, 116, 117, 119, 130, 137, 141, 149, 151, 153, 159, 168, 169, 193, 196, 212, 213, 214, 215, 216, 222, 224, 233, 240, 248, 259, 267, 271, 277, 279, 290, 318, 322, 330, 353, 355, 359, 360, 361, 364, 376, 381, 388, 395, 398, 410, 420, 424, 425, 429, 430, 437, 438, 440, 444, 449, 450, 456, 457, 471, 483, 485, 499, 508, 511}
a unesco world heritage site: {1}
[...1603 more]

Saw 561 sentences and 1603 entities!


In [11]:
save_path = "./data/humayun_ir_entity_graph.pkl"
with open(save_path, "wb") as file:
    pickle.dump((graph, sentences_map), file)
print(f"Saved to '{save_path}' 🎉")

Saved to './data/humayun_ir_entity_graph.pkl' 🎉


In [12]:
read_graph, read_sentences_map = None, None
with open(save_path, "rb") as file:
    read_graph, read_sentences_map = pickle.load(file)

print(f"Loaded from '{save_path}' - {len(read_sentences_map)} sentences and {len(read_graph)} entities ✅")

Loaded from './data/humayun_ir_entity_graph.pkl' - 561 sentences and 1603 entities ✅
