-
Notifications
You must be signed in to change notification settings - Fork 7
/
hugopreprocessor.py
168 lines (130 loc) · 5.61 KB
/
hugopreprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
r"""Preprocessor Module for Hugo.
This module exports a single class.
HugoPreprocessor: An `nbconvert` `Preprocessor` for exporting
notebooks to a Markdown format compatible with
[Hugo](https://gohugo.io)
"""
import datetime
import os.path
import re
from nbconvert.preprocessors import Preprocessor
class HugoPreprocessor(Preprocessor):
r"""Preprocessor class for Hugo.
This class overrides the `preprocess` and `preprocess_cell` methods
of the `nbconvert` `Preprocessor` class, to accomplish the following
tasks:
1. Properly quote underscores in math mode. See
https://gohugo.io/content-management/formats/#issues-with-markdown
for more context on the problem. This resolves the issue with the
"tedious" solution of quoting all underscores.
2. Set default values for metadata (date, title, and draft).
"""
def _insert_newline_before_lists(self, text):
r"""Ensure that there is a blank line before all lists."""
# Capture all single lines starting with "* "
ptn = re.compile(r'(\n\* [^\n]*)')
lines = re.findall(ptn, text)
# Create a list of 'blocks' which are runs of consecutive lines.
blocks = []
block = ''
for line in lines:
if block + line in text:
block += line
else:
blocks.append(block)
block = line
blocks.append(block)
# Find the starting index of each block that is NOT already
# preceded by \n.
indexes = [text.find(b) for b in blocks if '\n' + b not in text]
# Ensure indexes starts with 0 and ends with None, for the join
# below.
if indexes[0] != 0:
indexes = [0] + indexes
indexes.append(None)
return '\n'.join(text[indexes[i]:indexes[i + 1]]
for i in range(len(indexes) - 1))
def _quote_underscores_in_latex(self, text, latex):
r"""
Return modified `text`, with the '_' in `latex` quoted.
Args:
text: A string which contains `latex` as a substring.
latex: A substring of `text` consisting of actual Latex.
Returns: A copy of `text`, where every underscore inside `latex`
is replaced by '\_'.
"""
quoted_latex = latex.replace(r'_', r'\_')
return text.replace(latex, quoted_latex)
def _extract_latex(self, markdown):
r"""
Return a list of the blocks of latex occurring in `markdown`.
Args:
markdown: A string
Returns: A list of the strings of latex occurring in `markdown`,
including delimiters.
"""
# '$$ but not \$$' 'anything not ending in \' '$$'.
display_math = re.compile(r'[^\\](\$\$.*?[^\\]\$\$)', re.DOTALL)
out = re.findall(display_math, markdown)
# '$ but not \$ or $$' 'anything not ending in \' '$'.
inline_math = re.compile(r'[^\$\\](\$[^$].*?[^\\]\$)',
re.DOTALL)
# Inline math cannot span two newlines.
for block in markdown.split('\n\n'):
out += re.findall(inline_math, block)
return out
def _time_format_hugo(self, ts):
r"""Return a string in the ISO-8601 flavor that Hugo uses."""
local_tz = datetime.datetime.now(
datetime.timezone.utc).astimezone().tzinfo
out = ts.astimezone(local_tz).strftime('%Y-%m-%dT%H:%M:%S%z')
# %z is [+-]HHMM, but we want [+-]HH:MM
return out[:-2] + ':' + out[-2:]
def preprocess_cell(self, cell, resources, cell_index):
    r"""
    Escape the underscores of every piece of Latex found in `cell`.

    Args: See the `nbconvert.preprocessors.Preprocessor`
    documentation.

    Returns: The tuple `(cell, resources)`. `cell` is modified in
    place: in a markdown cell, each '_' inside Latex in the source
    gains a leading '\'; in a code cell, the same is done to each
    output's 'text/latex' data. Other cell types are left untouched.
    """
    kind = cell.cell_type
    if kind == 'markdown':
        # The segments are extracted once, from the unmodified source.
        for segment in self._extract_latex(cell.source):
            cell.source = self._quote_underscores_in_latex(
                cell.source, segment)
        return cell, resources
    if kind != 'code':
        return cell, resources
    for output in cell.outputs or []:
        payload = output.get('data', {}).get('text/latex')
        if not payload:
            # This output has no (or empty) Latex data.
            continue
        # The whole payload is Latex, so it serves as both arguments.
        output['data']['text/latex'] = self._quote_underscores_in_latex(
            payload, payload)
    return cell, resources
def preprocess(self, nb, resources):
    r"""
    Set metadata defaults, and process markdown.

    Defaults are written to `resources['metadata']['hugo']` only for
    values that are missing or empty:
    date: the notebook file's modification time,
    title: the notebook name with '_' turned into spaces and each
    word capitalized,
    draft: True.

    Args: See the `nbconvert.preprocessors.Preprocessor`
    documentation.

    Returns: (nb, resources) where these have been fully processed.
    """
    metadata = resources['metadata']
    if metadata.get('hugo') is None:
        metadata['hugo'] = {}
    hugo = metadata['hugo']

    # Set default metadata
    if not hugo.get('date'):
        # Only stat the notebook file when no date was supplied.
        file_path = os.path.join(metadata['path'],
                                 metadata['name'] + '.ipynb')
        ts = datetime.datetime.fromtimestamp(
            os.path.getmtime(file_path))
        hugo['date'] = self._time_format_hugo(ts)
    if not hugo.get('title'):
        hugo['title'] = ' '.join(
            word.capitalize() for word in metadata['name'].split('_'))
    # `hugo.get('draft') or True` is always True, which overwrote an
    # explicit `draft: false`; default only when the value is absent.
    if hugo.get('draft') is None:
        hugo['draft'] = True

    # Process the cells
    for index, cell in enumerate(nb.cells):
        nb.cells[index], resources = self.preprocess_cell(
            cell, resources, index)
    return nb, resources