Skip to content

Commit

Permalink
Merge pull request #972 from anastasia/cite
Browse files Browse the repository at this point in the history
Cite
  • Loading branch information
ChefAndy committed May 20, 2019
2 parents e4f6b44 + 415e04d commit 4324bbb
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 15 deletions.
6 changes: 5 additions & 1 deletion capstone/capapi/permissions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from capweb import helpers
from rest_framework import permissions

staff_level_permissions = [
Expand All @@ -7,6 +8,7 @@
'capapi.delete_capuser',
]


class IsSafeMethodsUser(permissions.BasePermission):
def has_permission(self, request, view):
# we're a read-only operation here
Expand Down Expand Up @@ -34,13 +36,15 @@ def get_single_casebody_permissions(request, case):
updating case download permissions for user if case is blacklisted
"""
casebody = {"status": None, "data": None}

if not case.jurisdiction_id:
casebody["status"] = casebody_permissions["unk"]

elif case.jurisdiction_whitelisted:
casebody["status"] = casebody_permissions["ok"]

elif helpers.is_google_bot(request):
casebody["status"] = casebody_permissions["ok"]

elif request.user.is_anonymous:
casebody["status"] = casebody_permissions["auth"]

Expand Down
1 change: 0 additions & 1 deletion capstone/capapi/templates/case.html
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ <h6 class="case-viewing-error">
"@id":"{{ frontend_url }}"
},
"headline": "{{ metadata.name_abbreviation }}",
"isAccessibleForFree": "False",
{% if not metadata.jurisdiction.whitelisted %}
"hasPart": {
"@type": "WebPageElement",
Expand Down
49 changes: 46 additions & 3 deletions capstone/capweb/helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import json
import re
import socket
from bs4 import BeautifulSoup
from collections import namedtuple
from contextlib import contextmanager
from functools import wraps
import markdown
from markdown.extensions.toc import TocExtension
import requests
import django_hosts
from django_user_agents.utils import get_user_agent
from ipware import get_client_ip

from django.conf import settings
from django.contrib.auth.decorators import user_passes_test
Expand All @@ -24,6 +28,7 @@ def cache_func(key, timeout=None, cache_name='default'):
`key` should be a lambda that takes the decorated function's arguments and returns a cache key.
"""
cache = caches[cache_name]

def decorator(func):
@wraps(func)
def decorated(*args, **kwargs):
Expand All @@ -40,9 +45,12 @@ def decorated(*args, **kwargs):
value = func(*args, **kwargs)
cache.set(cache_key, value, timeout)
return value

return decorated

return decorator


@cache_func(
key=lambda section: 'get_data_from_lil_site:%s' % section,
timeout=settings.CACHED_LIL_DATA_TIMEOUT
Expand All @@ -59,6 +67,7 @@ def get_data_from_lil_site(section="news"):
data = json.loads(content.strip()[start_index + 1:end_index])
return data[section]


def reverse(*args, **kwargs):
"""
Wrap django_hosts.reverse() to try all known hosts.
Expand All @@ -77,11 +86,13 @@ def reverse(*args, **kwargs):
return django_hosts.reverse(*args, **kwargs)
except NoReverseMatch:
# raise NoReverseMatch only after testing final host
if i == len(hosts)-1:
if i == len(hosts) - 1:
raise


reverse_lazy = lazy(reverse, str)


def show_toolbar_callback(request):
"""
Whether to show django-debug-toolbar.
Expand All @@ -94,6 +105,7 @@ def show_toolbar_callback(request):
class StatementTimeout(Exception):
    """
        Exception type for statement timeouts. Adds no behavior of its own on top of Exception.
        NOTE(review): presumably raised by statement_timeout() below -- confirm against its body.
    """


@contextmanager
def statement_timeout(timeout, db="default"):
"""
Expand All @@ -114,6 +126,7 @@ def statement_timeout(timeout, db="default"):
# reset to default, in case we're in a nested transaction
cursor.execute("SET LOCAL statement_timeout = %s", [original_timeout])


@contextmanager
def transaction_safe_exceptions(using=None):
"""
Expand All @@ -135,12 +148,14 @@ def transaction_safe_exceptions(using=None):
else:
yield


def select_raw_sql(sql, args=None, using=None):
    """
        Execute a raw SQL query and return all result rows as namedtuples.

        sql   -- query text, passed unchanged to cursor.execute()
        args  -- optional query parameters, passed unchanged to cursor.execute()
        using -- database alias to run against; None selects Django's default alias

        Returns a list of 'Result' namedtuples whose fields are the column names
        reported by cursor.description.
    """
    # Local import keeps this block self-contained; django is already a dependency of this module.
    from django.db import DEFAULT_DB_ALIAS

    # The connection handler is indexed by alias name and None is not a valid alias,
    # so map the default `using=None` onto the default database explicitly.
    # NOTE(review): assumes `connections` is django.db.connections -- confirm against the imports.
    alias = DEFAULT_DB_ALIAS if using is None else using
    with connections[alias].cursor() as cursor:
        cursor.execute(sql, args)
        nt_result = namedtuple('Result', [col[0] for col in cursor.description])
        return [nt_result(*row) for row in cursor.fetchall()]


def send_contact_email(title, content, from_address):
"""
Send a message on behalf of a user to our contact email.
Expand Down Expand Up @@ -172,9 +187,37 @@ def render_markdown(markdown_doc):
Render given markdown document and return (html, table_of_contents, meta)
"""
md = markdown.Markdown(extensions=[TocExtension(baselevel=2, marker=''), 'meta'])
html = md.convert(markdown_doc)\
html = md.convert(markdown_doc) \
.replace('<h2 ', '<h2 class="subtitle" ')
toc = md.toc.replace('<a ', '<a class="list-group-item" ')
toc = "".join(toc.splitlines(True)[2:-2]) # strip <div><ul> around toc by dropping first and last two lines
meta = {k:' '.join(v) for k, v in md.Meta.items()}
meta = {k: ' '.join(v) for k, v in md.Meta.items()}
return html, toc, meta


def is_google_bot(request):
    """
        Return True only when `request` can be verified as coming from a Google crawler.

        Steps, following https://blog.majsky.cz/detecting-google-bot-python-and-django/ :
          1. the user agent must be classified as a bot;
          2. a reverse DNS lookup of the client IP must yield a host directly under
             googlebot.com or google.com;
          3. a forward DNS lookup of that host must resolve back to the same client IP.

        Any lookup failure, or a request with no detectable client IP, returns False.
        This function performs blocking DNS lookups.
    """
    user_agent = get_user_agent(request)
    if not user_agent.is_bot:
        return False

    ip, _ = get_client_ip(request)
    if not ip:
        # no client address could be determined, so there is nothing to verify
        return False

    try:
        host = socket.gethostbyaddr(ip)[0]
    except socket.error:
        # socket.herror and socket.gaierror are both subclasses of socket.error
        return False

    # drop the left-most label: "crawl-66-249-66-1.googlebot.com" -> "googlebot.com"
    domain_name = ".".join(host.split('.')[1:])
    if domain_name not in ('googlebot.com', 'google.com'):
        return False

    # forward-confirm the name, since reverse DNS alone is controlled by the owner of the IP block
    try:
        host_ip = socket.gethostbyname(host)
    except socket.error:
        return False
    return host_ip == ip


def get_schema(response):
    """
        Parse and return the JSON-LD structured data embedded in an HTML response.

        The decoded response body must contain exactly one
        <script type="application/ld+json"> element (enforced with an assert);
        its text is parsed with json.loads() and the result returned.
    """
    html = response.content.decode()
    ld_json_tags = BeautifulSoup(html, 'html.parser').find_all('script', {'type': 'application/ld+json'})
    assert len(ld_json_tags) == 1
    return json.loads(ld_json_tags[0].text)
15 changes: 15 additions & 0 deletions capstone/capweb/tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest
from django.test import RequestFactory
from capweb.helpers import is_google_bot


@pytest.mark.django_db
def x_test_is_google_bot():
    """
        Check is_google_bot() against a Googlebot user agent and an ordinary browser user agent.
        NOTE(review): the x_ prefix presumably keeps pytest from collecting this -- confirm.
    """
    # this test will perform a DNS lookup
    googlebot_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    firefox_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:67.0) Gecko/20100101 Firefox/67.0"

    request = RequestFactory().get("/")
    request.META["HTTP_USER_AGENT"] = googlebot_agent
    request.META["REMOTE_ADDR"] = "66.249.66.1"
    assert is_google_bot(request)

    # same address, but a regular browser user agent must not be treated as a bot
    request.META["HTTP_USER_AGENT"] = firefox_agent
    assert not is_google_bot(request)
94 changes: 87 additions & 7 deletions capstone/cite/tests/test_views.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import re

import pytest
from django.conf import settings
from django.utils.text import slugify

from capapi.tests.helpers import check_response
from capweb.helpers import reverse
from capweb.helpers import reverse, get_schema
from capweb import helpers

from scripts.helpers import parse_xml


Expand All @@ -16,6 +16,7 @@ def test_home(client, django_assert_num_queries, ingest_metadata):
response = client.get(reverse('home', host='cite'))
check_response(response, content_includes="Alabama Appellate Courts Reports")


@pytest.mark.django_db
def test_series(client, django_assert_num_queries, volume_factory):
""" Test /series/ """
Expand All @@ -33,6 +34,7 @@ def test_series(client, django_assert_num_queries, volume_factory):
assert vol.volume_number in content
assert vol.reporter.full_name in content


@pytest.mark.django_db
def test_volume(client, django_assert_num_queries, citation_factory):
""" Test /series/volume/ """
Expand All @@ -45,21 +47,24 @@ def test_volume(client, django_assert_num_queries, citation_factory):
case_2.volume.save()

with django_assert_num_queries(select=3):
response = client.get(reverse('volume', args=[case_1.reporter.short_name_slug, case_1.volume.volume_number], host='cite'))
response = client.get(
reverse('volume', args=[case_1.reporter.short_name_slug, case_1.volume.volume_number], host='cite'))
check_response(response)
content = response.content.decode()
for case in (case_1, case_2):
assert case.volume.volume_number in content
assert case.reporter.full_name in content
assert case.name_abbreviation in content


@pytest.mark.django_db
def test_case_not_found(client, django_assert_num_queries):
    """ Test /series/volume/case/ not found """
    missing_url = reverse('citation', args=['fake', '123', '456'], host='cite')
    # only the request itself is counted against the expected number of select queries
    with django_assert_num_queries(select=1):
        response = client.get(missing_url)
    check_response(response, content_includes='Citation "123 Fake 456" not found')


@pytest.mark.django_db
def test_cases_multiple(client, django_assert_num_queries, three_cases):
""" Test /series/volume/case/ with multiple matching cases """
Expand All @@ -74,7 +79,8 @@ def test_cases_multiple(client, django_assert_num_queries, three_cases):
case.save()
cite_parts = re.match(r'(\S+)\s+(.*?)\s+(\S+)$', cite.cite).groups()
with django_assert_num_queries(select=3):
response = client.get(reverse('citation', args=[slugify(cite_parts[1]), cite_parts[0], cite_parts[2]], host='cite'))
response = client.get(
reverse('citation', args=[slugify(cite_parts[1]), cite_parts[0], cite_parts[2]], host='cite'))
check_response(response, content_includes='Multiple cases match')
content = response.content.decode()
for case in three_cases:
Expand All @@ -83,9 +89,11 @@ def test_cases_multiple(client, django_assert_num_queries, three_cases):
# load one of the results
first_case.jurisdiction.whitelisted = True
first_case.jurisdiction.save()
response = client.get(reverse('citation', args=[slugify(cite_parts[1]), cite_parts[0], cite_parts[2], first_case.id], host='cite'))
response = client.get(
reverse('citation', args=[slugify(cite_parts[1]), cite_parts[0], cite_parts[2], first_case.id], host='cite'))
check_response(response)


@pytest.mark.django_db
def test_single_case(client, auth_client, django_assert_num_queries, case):
""" Test /series/volume/case/ with one matching case """
Expand Down Expand Up @@ -136,7 +144,7 @@ def test_single_case(client, auth_client, django_assert_num_queries, case):
assert session['case_allowance_remaining'] == 0

# check daily quota reset
session['case_allowance_last_updated'] -= 60*60*24 + 1
session['case_allowance_last_updated'] -= 60 * 60 * 24 + 1
session.save()
response = client.get(url)
check_response(response, content_includes=case_text)
Expand All @@ -149,3 +157,75 @@ def test_single_case(client, auth_client, django_assert_num_queries, case):
check_response(response, content_includes=case_text)
auth_client.auth_user.refresh_from_db()
assert auth_client.auth_user.case_allowance_remaining == settings.API_CASE_DAILY_ALLOWANCE - 1


@pytest.mark.django_db
def test_schema_in_case(client, case):
    """ Test the JSON-LD schema on the case page for whitelisted and non-whitelisted jurisdictions """

    def checked_schema(response):
        # assertions shared by both scenarios: case text is shown and the basic schema fields match
        check_response(response, content_includes=case_text)
        found = get_schema(response)
        assert found["headline"] == case.name_abbreviation
        assert found["author"]["name"] == case.court.name
        return found

    # setup
    url = case.get_frontend_url()
    parsed = parse_xml(case.case_xml.orig_xml)
    case_text = parsed('casebody|casebody').children()[10].text.replace('\xad', '')

    ### whitelisted case

    case.jurisdiction.whitelisted = True
    case.jurisdiction.save()

    schema = checked_schema(client.get(url))

    # if case is whitelisted, extra info about inaccessibility is not needed
    # https://developers.google.com/search/docs/data-types/paywalled-content
    assert "hasPart" not in schema

    ### blacklisted case

    case.jurisdiction.whitelisted = False
    case.jurisdiction.save()

    cookie_form = {'not_a_bot': 'yes', 'next': url}
    schema = checked_schema(client.post(reverse('set_cookie'), cookie_form, follow=True))

    # if case is blacklisted, we include more data
    assert "hasPart" in schema
    assert schema["hasPart"]["isAccessibleForFree"] == 'False'


@pytest.mark.django_db()
def test_schema_in_case_as_google_bot(client, case, monkeypatch):
    """ Test that a request identified as Google bot gets the case text even with no allowance left """
    # setup
    url = case.get_frontend_url()
    parsed = parse_xml(case.case_xml.orig_xml)
    case_text = parsed('casebody|casebody').children()[10].text.replace('\xad', '')

    # make helpers.is_google_bot report every request as a Google bot
    monkeypatch.setattr(helpers, 'is_google_bot', lambda request: True)

    # our bot has seen too many cases!
    session = client.session
    session['case_allowance_remaining'] = 0
    session.save()
    assert session['case_allowance_remaining'] == 0

    case.jurisdiction.whitelisted = False
    case.jurisdiction.save()

    response = client.get(url, follow=True)

    # show cases anyway
    check_response(response, content_includes=case_text)
    schema = get_schema(response)
    expected_fields = (
        (schema["headline"], case.name_abbreviation),
        (schema["author"]["name"], case.court.name),
    )
    for actual, expected in expected_fields:
        assert actual == expected
    assert "hasPart" in schema
    assert schema["hasPart"]["isAccessibleForFree"] == 'False'
13 changes: 10 additions & 3 deletions capstone/cite/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from capapi.authentication import SessionAuthentication
from capapi.renderers import HTMLRenderer
from capdb.models import Reporter, VolumeMetadata, Citation, CaseMetadata
from capweb.helpers import reverse
from capweb import helpers

### helpers ###

Expand Down Expand Up @@ -148,11 +148,13 @@ def citation(request, series_slug, volume_number, page_number, case_id=None):
else:
serializer = serializers.CaseSerializerWithCasebody

elif helpers.is_google_bot(request):
serializer = serializers.CaseSerializerWithCasebody
# if non-whitelisted case, not logged in, and no cookies set up, redirect to ?set_cookie=1
else:
request.session['case_allowance_remaining'] = settings.API_CASE_DAILY_ALLOWANCE
request.session['case_allowance_last_updated'] = time.time()
return HttpResponseRedirect('%s?%s' % (reverse('set_cookie', host='cite'), urlencode({'next': request.get_full_path()})))
return HttpResponseRedirect('%s?%s' % (helpers.reverse('set_cookie', host='cite'), urlencode({'next': request.get_full_path()})))

# render case using API serializer
api_request = Request(request, authenticators=[SessionAuthentication()])
Expand All @@ -178,14 +180,19 @@ def citation(request, series_slug, volume_number, page_number, case_id=None):
"page_number": page_number,
})


def set_cookie(request):
"""
/set_cookie/ -- try to use javascript to set a 'not_a_bot=1' cookie
/set_cookie/?no_js=1 -- ask user to click a button to set a 'not_a_bot=1' cookie
"""
# user is actually a google bot
if helpers.is_google_bot(request):
return safe_redirect(request)

# user already had a not_a_bot cookie and just needed a session cookie,
# which was set when they were forwarded here -- they're ready to go:
if 'case_allowance_remaining' in request.session and request.COOKIES.get('not_a_bot', 'no') == 'yes':
elif 'case_allowance_remaining' in request.session and request.COOKIES.get('not_a_bot', 'no') == 'yes':
return safe_redirect(request)

# user has successfully POSTed to get their not_a_bot cookie:
Expand Down

0 comments on commit 4324bbb

Please sign in to comment.