This repository has been archived by the owner on Jan 9, 2018. It is now read-only.
forked from ericflo/django-oembed
/
core.py
183 lines (168 loc) · 7.17 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import re
import urllib2
import gzip
from heapq import heappush, heappop
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
try:
import simplejson
except ImportError:
from django.utils import simplejson
from django.conf import settings
from django.utils.safestring import mark_safe
from oembed.models import ProviderRule, StoredOEmbed
from django.template.loader import render_to_string
import logging
# Module-level logger, registered under the name "oembed core".
logger = logging.getLogger("oembed core")
# Characters that ``replace`` peels off the end of a matched URL so that
# trailing punctuation is emitted as plain text instead of being sent to the
# provider as part of the URL.
END_OVERRIDES = (')', ',', '.', '>', ']', ';')
# Defaults for ``replace``; each can be overridden in the Django settings and
# falls back to the literal shown here when the setting is absent.
MAX_WIDTH = getattr(settings, "OEMBED_MAX_WIDTH", 320)
MAX_HEIGHT = getattr(settings, "OEMBED_MAX_HEIGHT", 240)
# Value sent as the ``format`` query parameter of every provider request.
FORMAT = getattr(settings, "OEMBED_FORMAT", "json")
def fetch(url, user_agent="django-oembed/0.1"):
    """
    Fetch ``url`` and return the response body, transparently decompressing
    it when the server answers with gzip content encoding.

    ``user_agent`` is sent as the ``User-Agent`` request header.

    Errors raised by ``urllib2`` while opening or reading the URL (for
    example ``urllib2.HTTPError``) are not handled here and propagate to the
    caller.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', user_agent)
    # Advertise gzip support; the server may still answer uncompressed.
    request.add_header('Accept-Encoding', 'gzip')
    response = urllib2.build_opener().open(request)
    try:
        result = response.read()
        encoding = response.headers.get('content-encoding', '')
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    # Content-coding tokens are case-insensitive, so normalise before
    # comparing.
    if encoding.strip().lower() == 'gzip':
        result = gzip.GzipFile(fileobj=StringIO(result)).read()
    return result
def re_parts(regex_list, text):
    """
    An iterator that returns the entire text, but split by which regex it
    matched, or none at all.  If it did, the first value of the returned tuple
    is the index into the regex list, otherwise -1.

    Matches are emitted in order of their start position.  When two regexes
    match at the same position the one that comes first in ``regex_list``
    wins.  A match that overlaps text already emitted for an earlier match is
    dropped, so every character of ``text`` is yielded exactly once.

    >>> first_re = re.compile('asdf')
    >>> second_re = re.compile('an')
    >>> list(re_parts([first_re, second_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]

    >>> list(re_parts([first_re, second_re], 'asdfasdfasdf'))
    [(0, 'asdf'), (0, 'asdf'), (0, 'asdf')]

    >>> list(re_parts([], 'This is an asdf test.'))
    [(-1, 'This is an asdf test.')]

    >>> third_re = re.compile('sdf')
    >>> list(re_parts([first_re, second_re, third_re], 'This is an asdf test.'))
    [(-1, 'This is '), (1, 'an'), (-1, ' '), (0, 'asdf'), (-1, ' test.')]
    """
    prev_end = 0
    # One match iterator per regex, addressed by the regex's list index.
    iterators = [regex.finditer(text) for regex in regex_list]

    # Heap of (start, regex index, match).  Each iterator has at most one
    # entry on the heap at a time, so (start, index) is always unique and the
    # match objects themselves are never compared.
    pending = []

    # Bootstrap the search with the first hit of each iterator.
    for index, iterator in enumerate(iterators):
        match = next(iterator, None)
        if match is not None:
            heappush(pending, (match.start(), index, match))

    # Process matches, revisiting each iterator from which a match is taken.
    while pending:
        start, index, match = heappop(pending)
        # A match starting inside text that was already emitted overlaps an
        # earlier match and is skipped.
        if start >= prev_end:
            if start > prev_end:
                # Unmatched text between the previous match and this one.
                yield (-1, text[prev_end:start])
            prev_end = match.end()
            yield (index, text[start:prev_end])
        # Queue the next match from the same regex, used or skipped.
        following = next(iterators[index], None)
        if following is not None:
            heappush(pending, (following.start(), index, following))

    # Whatever is left after the last match.
    if prev_end < len(text):
        yield (-1, text[prev_end:])
def replace(text, max_width=MAX_WIDTH, max_height=MAX_HEIGHT):
    """
    Scans a block of text, replacing anything matched by a ``ProviderRule``
    pattern with an OEmbed html snippet, if possible.

    Templates should be stored at oembed/{type}.html, where ``type`` is the
    ``type`` value of the provider's response, so for example:

        oembed/video.html

    These templates are passed a context variable, ``response``, which is a
    dictionary representation of the response.

    ``max_width`` and ``max_height`` are sent to the provider as ``maxwidth``
    and ``maxheight`` and are part of the key under which rendered snippets
    are looked up and saved as ``StoredOEmbed`` rows.

    Returns the reassembled text wrapped in ``mark_safe``.  A URL for which no
    snippet could be obtained (bad response, missing ``type``, provider
    unreachable, empty rendering) is left in the text unchanged.
    """
    # NOTE(review): text that matches no rule is passed through verbatim and
    # the whole result is marked safe, so callers presumably must hand in
    # markup that is already safe to render -- confirm against callers.
    rules = list(ProviderRule.objects.all())
    patterns = [re.compile(rule.regex) for rule in rules]
    parts = []    # Pieces that are joined into the final return value.
    embeds = []   # (index into ``parts``, matching rule) for every URL found.
    urls = set()  # URLs to look up in the database.
    trailing_chars = ''.join(END_OVERRIDES)

    # First we pass through the text, populating our data structures.
    for rule_index, part in re_parts(patterns, text):
        if rule_index == -1:
            parts.append(part)
            continue
        # Split trailing punctuation off the URL, keeping it in its original
        # order so it can be emitted untouched right after the URL.
        url = part.rstrip(trailing_chars)
        trailing = part[len(url):]
        if url:
            embeds.append((len(parts), rules[rule_index]))
            urls.add(url)
            parts.append(url)
        if trailing:
            parts.append(trailing)

    # Now we fetch all stored snippets for these URLs at this size and map
    # each URL to its stored model instance.
    stored = {}
    for stored_embed in StoredOEmbed.objects.filter(
            match__in=urls, max_width=max_width, max_height=max_height):
        stored[stored_embed.match] = stored_embed

    # Now we're going to do the actual replacement of URL to embed.
    for position, rule in embeds:
        url = parts[position]
        if url in stored:
            parts[position] = stored[url].html
            continue
        try:
            # Build the URL based on the properties defined in the OEmbed spec.
            sep = "&" if "?" in rule.endpoint else "?"
            endpoint_url = u"%s%surl=%s&maxwidth=%s&maxheight=%s&format=%s" % (
                rule.endpoint, sep, urllib2.quote(url), max_width, max_height, FORMAT
            )
            # Fetch the link and parse the JSON.
            resp = simplejson.loads(fetch(endpoint_url))
            # Depending on the embed type, grab the associated template and
            # pass it the parsed JSON response as context.
            replacement = render_to_string('oembed/%s.html' % resp['type'], {'response': resp})
            if replacement:
                # Remember the snippet so a repeated URL in this text, and
                # later calls, reuse it instead of hitting the provider again.
                stored[url] = StoredOEmbed.objects.create(
                    match=url,
                    max_width=max_width,
                    max_height=max_height,
                    html=replacement,
                )
                parts[position] = replacement
        except (ValueError, KeyError, urllib2.URLError):
            # Best effort: unparsable JSON, a response without ``type`` or an
            # HTTP/connection failure (HTTPError is a URLError) leaves the
            # plain URL in place.
            logger.debug("No OEmbed replacement for %r", url, exc_info=True)

    # Combine the list into one string and return it.
    return mark_safe(u''.join(parts))