import logging

from django.conf import settings
from django_elasticsearch_dsl import Document, Index, fields
from elasticsearch import Elasticsearch

from readthedocs.projects.models import HTMLFile, Project
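
# For reference, settings.ES_INDEXES is expected to look roughly like the
# sketch below; the index names and settings values are illustrative
# assumptions, not this deployment's actual configuration:
#
#   ES_INDEXES = {
#       'project': {'name': 'project', 'settings': {'number_of_shards': 2}},
#       'page': {'name': 'page', 'settings': {'number_of_shards': 2}},
#   }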

project_conf = settings.ES_INDEXES['project']
project_index = Index(project_conf['name'])
project_index.settings(**project_conf['settings'])

page_conf = settings.ES_INDEXES['page']
page_index = Index(page_conf['name'])
page_index.settings(**page_conf['settings'])

log = logging.getLogger(__name__)


class RTDDocTypeMixin:

    def update(self, *args, **kwargs):
        # Hack a fix to our broken connection pooling.
        # This creates a new connection on every request,
        # but actually works :)
        log.info('Hacking Elastic indexing to fix connection pooling')
        self.using = Elasticsearch(**settings.ELASTICSEARCH_DSL['default'])
        super().update(*args, **kwargs)
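
# For reference, settings.ELASTICSEARCH_DSL is expected to follow the
# django-elasticsearch-dsl connection convention sketched below; the host
# value is an assumption for illustration:
#
#   ELASTICSEARCH_DSL = {
#       'default': {'hosts': 'localhost:9200'},
#   }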


@project_index.document
class ProjectDocument(RTDDocTypeMixin, Document):

    # Metadata
    url = fields.TextField(attr='get_absolute_url')
    users = fields.NestedField(
        properties={
            'username': fields.TextField(),
            'id': fields.IntegerField(),
        }
    )
    language = fields.KeywordField()

    modified_model_field = 'modified_date'

    class Django:
        model = Project
        fields = ('name', 'slug', 'description')
        ignore_signals = True


@page_index.document
class PageDocument(RTDDocTypeMixin, Document):

    """
    Document representation of a Page.

    Some text fields use the ``simple`` analyzer instead of the default
    (``standard``). The simple analyzer breaks the text on non-letter
    characters, so a text like ``python.submodule`` is tokenized as
    [python, submodule] instead of [python.submodule].
    See https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html  # noqa

    Some text fields use the ``with_positions_offsets`` term vector
    to make highlighting faster on big documents.
    See https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
    """

    # Metadata
    project = fields.KeywordField(attr='project.slug')
    version = fields.KeywordField(attr='version.slug')
    doctype = fields.KeywordField(attr='version.documentation_type')
    path = fields.KeywordField(attr='processed_json.path')
    full_path = fields.KeywordField(attr='path')
    rank = fields.IntegerField()

    # Searchable content
    title = fields.TextField(attr='processed_json.title')
    sections = fields.NestedField(
        attr='processed_json.sections',
        properties={
            'id': fields.KeywordField(),
            'title': fields.TextField(),
            'content': fields.TextField(term_vector='with_positions_offsets'),
        }
    )
    domains = fields.NestedField(
        properties={
            'role_name': fields.KeywordField(),
            # For linking to the URL
            'anchor': fields.KeywordField(),
            # For showing in the search result
            'type_display': fields.TextField(),
            'docstrings': fields.TextField(term_vector='with_positions_offsets'),
            # Simple analyzer breaks on `.`,
            # otherwise search results are too strict for this use case
            'name': fields.TextField(analyzer='simple'),
        }
    )

    modified_model_field = 'modified_date'

    class Django:
        model = HTMLFile
        fields = ('commit', 'build')
        ignore_signals = True

    def prepare_rank(self, html_file):
        # Valid ranks are in the [-10, 10] range; out-of-range values
        # fall back to the neutral default of 0.
        if not (-10 <= html_file.rank <= 10):
            return 0
        return html_file.rank

    def prepare_domains(self, html_file):
        """Prepare and return the values for the domains field."""
        if not html_file.version.is_sphinx_type:
            return []

        all_domains = []
        try:
            domains_qs = html_file.sphinx_domains.exclude(
                domain='std',
                type__in=['doc', 'label'],
            ).iterator()

            all_domains = [
                {
                    'role_name': domain.role_name,
                    'anchor': domain.anchor,
                    'type_display': domain.type_display,
                    'docstrings': html_file.processed_json.get(
                        'domain_data', {}
                    ).get(domain.anchor, ''),
                    'name': domain.name,
                }
                for domain in domains_qs
            ]

            log.debug(
                "[%s] [%s] Total domains for file %s are: %s",
                html_file.project.slug,
                html_file.version.slug,
                html_file.path,
                len(all_domains),
            )
        except Exception:
            log.exception(
                "[%s] [%s] Error preparing domain data for file %s",
                html_file.project.slug,
                html_file.version.slug,
                html_file.path,
            )

        return all_domains

    def get_queryset(self):
        """
        Exclude certain files from indexing.

        - Files from external versions
        - Ignored files
        """
        queryset = super().get_queryset()
        queryset = (
            queryset
            .internal()
            .exclude(ignore=True)
        )
        return queryset
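

# A minimal usage sketch for the documents defined above; the slugs and the
# query text are illustrative assumptions:
#
#   results = PageDocument.search().filter(
#       'term', project='some-project',
#   ).filter(
#       'term', version='latest',
#   ).query('match', title='installation')
#   for hit in results:
#       print(hit.full_path, hit.title)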