-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.py
107 lines (85 loc) · 2.71 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import datetime
import math
from pymongo import MongoClient
from bson.objectid import ObjectId
from nodescraper import scrape_node_contents
from analyzecontent import AnalyzeContent
# Module-level database handles.
# NOTE(review): MongoClient is constructed at import time, so importing this
# module opens a connection using MONGODB_URI from the environment — any of
# these env vars being unset will index into a None database name at runtime.
__client = MongoClient(os.environ.get('MONGODB_URI'))
# Database holding the node documents to be indexed.
__models_db = __client[os.environ.get('NODE_DB')]
# Database holding the search index (docs + inverted lemma lists).
__search_db = __client[os.environ.get('SEARCH_DB')]
__nodes_collection = __models_db['nodes']
__indexes_collection = __search_db['docs']
__lemmas_collection = __search_db['lemmas']
def index_all_nodes():
    """Re-index every Course/Subject/Instructor node, printing progress.

    Fetches all matching nodes up front, indexes each one quietly, and
    emits a percentage marker roughly every 1% of the way through.
    """
    start = datetime.datetime.now()
    nodes = list(__nodes_collection.find({"$or": [
        {"nodeType": "CourseNode"},
        {"nodeType": "SubjectNode"},
        {"nodeType": "InstructorNode"}
    ]}))
    print('Preparing to index ' + str(len(nodes)) + ' nodes')
    # BUG FIX: with fewer than 100 nodes, floor(len/100) is 0 and the
    # `i % percent_marker` below raised ZeroDivisionError. Clamp to >= 1.
    percent_marker = max(1, math.floor(len(nodes) / 100))
    for i, node in enumerate(nodes):
        index(node, output=False)
        if i % percent_marker == 0:
            # Integer division so progress prints as '3% Done', not '3.0% Done'.
            print(str(i // percent_marker) + '% Done')
    print('Done in ' + str(datetime.datetime.now() - start))
def index_by_id(node_id_string):
    """Resolve a node from its hex id string and index it if it exists."""
    target = __nodes_collection.find_one({"_id": ObjectId(node_id_string)})
    if target is None:
        return
    index(target)
def index(node, output=True):
    """Index a single node document into the search database.

    Scrapes the node's text content, analyzes it into weighted lemmas,
    upserts the per-node index document, and maintains the inverted
    lemma -> docs lists (adding this doc to each current lemma and
    pulling it from lemmas it no longer matches).

    Args:
        node: a node document (dict-like) containing at least '_id'
              and, when output=True, 'name'.
        output: when True, print a confirmation line after indexing.
    """
    node_id = node['_id']
    # Extract text content from the node and analyze it into lemmas.
    text_content = scrape_node_contents(node)
    analyzer = AnalyzeContent(text_weights=text_content)
    node_lemmas = analyzer.content_token_list()
    index_contents = {
        "node": node_id,
        "lemmas": node_lemmas,
        "lemmaWeights": analyzer.content_token_weight_map(),
        "body": analyzer.content_tokens_flattened_raw(),
        "bodyLength": analyzer.content_body_length()
    }
    # Upsert the index document for this node.
    res = __indexes_collection.update_one(
        {"node": node_id},
        {"$set": index_contents},
        upsert=True
    )
    # BUG FIX: res.upserted_id is None whenever the document already
    # existed (an update, not an insert). The original code then pushed
    # None into every lemma's docs list and pulled None in the cleanup
    # below. Resolve the real document id in that case.
    doc_id = res.upserted_id
    if doc_id is None:
        doc_id = __indexes_collection.find_one(
            {"node": node_id}, {"_id": 1}
        )['_id']
    # Add this doc to the inverted list of every lemma it contains.
    for lemma in node_lemmas:
        __lemmas_collection.update_one(
            {"lemma": lemma},
            {"$addToSet": {"docs": doc_id}},
            upsert=True
        )
    # Remove this doc from lemma lists that are no longer relevant.
    # BUG FIX: Collection.update() is deprecated/removed in modern
    # PyMongo; update_many() is the multi-document replacement.
    __lemmas_collection.update_many(
        {
            "docs": doc_id,
            "lemma": {"$nin": node_lemmas}
        },
        {
            "$pull": {"docs": doc_id}
        }
    )
    if output:
        print('Indexed node named: ' + node["name"])