{# Just in case the host doesn't have these defined; this can't go in the role defaults because it messes up inheritance for some playbooks #}
{% set nginx_server_name = nginx_server_name | default("localhost") %}
{% set nginx_vhosts = nginx_vhosts | default(nginx_server_name) %}
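{# For example (hypothetical case): a host that defines neither variable ends up with server_name "localhost" and a vhost value of just "localhost", so none of the domain-specific includes below are rendered #}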
{% if 'mahider.cgiar.org' in nginx_vhosts -%}
{% include 'nginx/mahider.cgiar.org.conf.j2' %}
{% endif %}
{% if 'library.cgiar.org' in nginx_vhosts -%}
{% include 'nginx/library.cgiar.org.conf.j2' %}
{% endif %}
{% if 'mahider.ilri.org' in nginx_vhosts and 'dspace.ilri.org' in nginx_vhosts -%}
{% include 'nginx/legacy-dspace-domains.conf.j2' %}
{% endif %}
{% if nginx_tls_cert is defined %}
# Handle {{ nginx_server_name }} http -> https
#
server {
listen 80;
listen [::]:80;
server_name {{ nginx_server_name }};
# feedburner doesn't support https :(
location ~ /(feed|open-search/discover) {
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
# Send requests to Tomcat
proxy_pass http://tomcat_http;
}
# redirect http -> https
location / {
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
# the trailing ? in the rewrite makes sure nginx doesn't append the query string again
# see: http://wiki.nginx.org/NginxHttpRewriteModule#rewrite
rewrite ^ https://{{ nginx_server_name }}$request_uri? {% if nginx_redirect_type == 301 %}permanent{% else %}redirect{% endif %};
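# For example (hypothetical request): http://{{ nginx_server_name }}/discover?query=soil
# is redirected to https://{{ nginx_server_name }}/discover?query=soil; without the
# trailing "?" nginx would append the original query string to the replacement
# again, duplicating it.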
}
include extra-security.conf;
}
{% endif %} {# end: nginx_tls_cert #}
# HTTPS server (only for domains we have a cert for)
#
server {
{% if nginx_tls_cert is defined %}
listen {{ nginx_ssl_port }} ssl http2;
listen [::]:{{ nginx_ssl_port }} ssl http2;
{% else %}
listen 80;
listen [::]:80;
{% endif %} {# end: nginx_tls_cert #}
server_name {{ nginx_server_name }};
root {{ nginx_root }};
{% if nginx_forbid_robots == True %}
# Add a header telling robots not to index the site
# See: https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag
add_header X-Robots-Tag "none";
{% endif %}
{% if nginx_tls_cert is defined %}
ssl_certificate {{ nginx_tls_cert }};
ssl_certificate_key {{ nginx_tls_key }};
ssl_session_timeout {{ nginx_ssl_session_timeout }};
ssl_session_cache {{ nginx_ssl_session_cache }};
ssl_buffer_size {{ nginx_ssl_buffer_size }};
ssl_dhparam {{ nginx_ssl_dhparam }};
ssl_protocols {{ nginx_ssl_protocols }};
ssl_ciphers "{{ tls_cipher_suite }}";
ssl_prefer_server_ciphers on;
{% if ansible_distribution == 'Ubuntu' and ansible_distribution_version is version_compare('16.04', '>=') %}
ssl_stapling on;
ssl_stapling_verify on;
resolver {{ nginx_ssl_stapling_resolver }};
{% endif %}
# nginx does not auto-rotate session ticket keys: only a HUP / restart will do so and
# when a restart is performed the previous key is lost, which resets all previous
# sessions. The fix for this is to set up a manual rotation mechanism:
# http://trac.nginx.org/nginx/changeset/1356a3b9692441e163b4e78be4e9f5a46c7479e9/nginx
#
# Note that you'll have to define and rotate the keys securely by yourself. In absence
# of such infrastructure, consider turning off session tickets:
ssl_session_tickets off;
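# A minimal sketch of what such a rotation setup could look like, assuming
# hypothetical key file paths that would be generated and rotated out of band
# (e.g. from cron); the first key encrypts new tickets, all listed keys are
# accepted for decryption:
#
#   ssl_session_ticket_key /etc/nginx/ticket-keys/current.key;
#   ssl_session_ticket_key /etc/nginx/ticket-keys/previous.key;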
{% if nginx_enable_hsts == True %}
# Enable this if you want HSTS (recommended, but be careful)
add_header Strict-Transport-Security max-age=15768000;
{% endif %} {# end: nginx_enable_hsts #}
{% endif %} {# end: nginx_tls_cert #}
# static assets we can serve directly from the file system with nginx
location ~ /(themes|aspects/ReportingSuite|aspects/Statistics) {
try_files $uri @tomcat;
location ~* \.(?:ico|css|js|gif|jpe?g|png|woff)$ {
add_header Cache-Control "max-age=604800";
}
}
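# For example (hypothetical path): a request for /themes/Mirage2/styles/main.css
# matches the nested pattern above and is served straight from {{ nginx_root }}
# with a one-week (604800 s) Cache-Control header; other requests under these
# prefixes are tried on disk first and fall back to the @tomcat location
# defined further down.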
# rate limit requests to dynamic pages to protect against bots and malicious
# users (see the definition for the dynamicpages limit_req_zone later).
location ~ /(browse|discover|search-filter|most-popular) {
# rate limit for dynamic pages
limit_req zone=dynamicpages burst=5;
# rate limit for poorly behaved bots, see limit_req_zone below
limit_req zone=badbots;
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
proxy_pass http://tomcat_http;
}
# tell robots not to index or follow links on dynamic pages (because we
# can't use * in robots.txt!) and rate limit requests.
# see: https://jira.duraspace.org/browse/DS-2962
location ~ /handle/[0-9]+/[0-9]+/(browse|discover|search-filter|most-popular) {
# rate limit for dynamic pages
limit_req zone=dynamicpages burst=5;
# rate limit for poorly behaved bots, see limit_req_zone below
limit_req zone=badbots;
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
add_header X-Robots-Tag "none";
proxy_pass http://tomcat_http;
}
location / {
# rate limit for poorly behaved bots, see limit_req_zone below
limit_req zone=badbots;
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
# Explicitly set proxy headers again because the existing headers from
# the global context get cleared when we set the User-Agent here.
# See: https://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_set_header
include proxy_params;
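# On a stock Debian/Ubuntu nginx package the included proxy_params typically
# amounts to the following (shown only for reference; the actual file on the
# host is what counts):
#
#   proxy_set_header Host $http_host;
#   proxy_set_header X-Real-IP $remote_addr;
#   proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
#   proxy_set_header X-Forwarded-Proto $scheme;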
# Send requests to Tomcat
proxy_pass http://tomcat_http;
# Set user agent string to mapped value (see mapping block)
proxy_set_header User-Agent $ua;
}
# AReS explorer
location /explorer {
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
proxy_pass https://codeobia_ares/explorer;
}
# log rest requests
location /rest {
location ~ /rest/statistics/?(.*) {
access_log /var/log/nginx/statistics.log;
proxy_pass http://statistics_api/$1$is_args$args;
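# For example (hypothetical request): GET /rest/statistics/items?limit=10 is
# proxied to http://127.0.0.1:5000/items?limit=10, since $1 is the captured
# remainder of the path and $is_args only adds the "?" when a query string
# is present.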
}
access_log /var/log/nginx/rest.log;
proxy_pass http://tomcat_http;
}
# log oai requests
location /oai {
access_log /var/log/nginx/oai.log;
proxy_pass http://tomcat_http;
}
# Only allow Solr access from localhost
location /solr {
allow 127.0.0.1;
deny all;
}
# Only allow access to management contexts from localhost (for Munin tomcat)
location ~ /(host-manager|manager) {
allow 127.0.0.1;
deny all;
}
# Only allow jspui access for the Atmire modules
location /jspui {
location ~ ^/jspui/(export|listings-and-reports) {
allow all;
proxy_pass http://tomcat_http;
}
deny all;
}
# named location for above try_files
location @tomcat {
# log access requests for debug / load analysis
access_log /var/log/nginx/access.log;
proxy_pass http://tomcat_http;
}
include extra-security.conf;
}
upstream tomcat_http {
{% if nginx_tls_cert is defined %}
server 127.0.0.1:8443;
{% else %}
server 127.0.0.1:8001;
{% endif %}
# The keepalive parameter sets the maximum number of idle keepalive connections
# to upstream servers that are preserved in the cache of each worker process. When
# this number is exceeded, the least recently used connections are closed.
keepalive 60;
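# Note: for these idle connections to actually be reused, the proxied locations
# generally also need "proxy_http_version 1.1;" and an empty Connection header
# ('proxy_set_header Connection "";'), which may already be handled elsewhere
# in this role.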
}
upstream codeobia_ares {
server {{ dspace_explorer_server }}:443;
# The keepalive parameter sets the maximum number of idle keepalive connections
# to upstream servers that are preserved in the cache of each worker process. When
# this number is exceeded, the least recently used connections are closed.
keepalive 60;
}
upstream statistics_api {
server 127.0.0.1:5000;
}
# Assign a custom user agent to certain client IPs that send hundreds or
# thousands of requests like a bot but don't declare a proper user agent. We
# can then rely on Tomcat's Crawler Session Manager Valve to assign a single
# session to all of these clients in order to reduce resource usage.
map $remote_addr $ua {
# 2017-11-08 ciat.cgiar.org scraper
104.196.152.243 'bot';
# 2017-11-08 Random Chinese host grabbing 20,000 PDFs
~124\.17\.34\. 'bot';
# 2018-07-10 mystery client on Google Cloud
35.227.26.162 'bot';
# 2018-09-27 mystery client on Google Cloud
35.237.175.180 'bot';
# 2018-09-27 mystery client on Cox
68.6.87.12 'bot';
# 2018-10-03 mystery client on Hetzner
138.201.49.199 'bot';
# 2018-11-04 mystery client on Hetzner
78.46.89.18 'bot';
# 2019-03-26 mystery client in Ukraine
93.179.69.74 'bot';
# 2019-03-26 CTA scraper on Amazon
18.195.78.144 'bot';
# 2019-03-26 CTA scraper on Amazon
18.196.196.108 'bot';
# 2019-03-26 CTA scraper on Amazon
18.194.46.84 'bot';
default $http_user_agent;
}
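# For example, a request from 18.195.78.144 (one of the CTA scrapers listed
# above) is proxied to Tomcat with "User-Agent: bot", so the Crawler Session
# Manager Valve can fold its requests into one session; every other client
# keeps its original User-Agent via the default line.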
# Use a mapping to identify certain search bots with many IP addresses and force
# them to obey a global request rate limit. For example, Baidu actually has over
# 160 IP addresses and often crawls the site with fifty or so concurrently! This
# maps all Baidu requests to the same $limit_bots value, allowing us to force it
# to abide by a total rate limit for all client instances regardless of the IP.
#
# $limit_bots will be used as the key for the limit_req_zone.
map $http_user_agent $limit_bots {
~Baiduspider 1;
# 2018-01-10 user agent of bot making a few hundred thousand requests from a handful of IPs
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36' 1;
# 2018-07-12 user agent of bot making tens of thousands of requests to /discover, which is forbidden in robots.txt
~Pcore-HTTP 1;
# 2019-02-18 user agent of bot making tens of thousands of requests to /discover, which is forbidden in robots.txt
'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0' 1;
# 2019-03-26 user agent of bot making thousands of requests to /discover, which is forbidden in robots.txt
~BLEXBot 1;
# 2019-03-26 user agent of bot making thousands of requests to /discover, which is forbidden in robots.txt
~MauiBot 1;
# 2019-03-26 user agent of bot making thousands of requests to /discover, which is forbidden in robots.txt
~CCBot 1;
# 2019-03-26 user agent of bot making hundreds of requests to /discover, which is forbidden in robots.txt
~SemanticScholarBot 1;
# 2019-03-26 user agent of bot making hundreds of requests to /discover, which is forbidden in robots.txt
~DotBot 1;
# 2019-03-26 user agent of bot making thousands of requests to /discover, which is forbidden in robots.txt
~SemrushBot 1;
# 2019-03-26 user agent of bot making hundreds of requests to /discover, which is forbidden in robots.txt
~GuzzleHttp 1;
# requests with an empty key are not evaluated by limit_req
# see: http://nginx.org/en/docs/http/ngx_http_limit_req_module.html
default '';
}
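# For example, any client whose User-Agent matches ~Baiduspider gets the key
# "1" and therefore shares a single bucket in the badbots zone below, no matter
# which of Baidu's IP addresses the request comes from; a normal browser maps
# to the empty string and is never evaluated by limit_req.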
# Zone for limiting "bad bot" requests with a hard limit of 1 per minute. Uses
# the variable $limit_bots as a key, which is controlled by the mapping above.
# I am using 1 request per minute because Baidu currently does about 20 or 30,
# but I don't feel like prioritizing their requests because they don't respect
# the instructions in robots.txt. This is probably overkill for just punishing
# Baidu, but I wanted to explore a solution that could work for other bad user
# agents in the future with minor adjustments.
#
# A zone size of 1 megabyte should be able to store around 16,000 client states,
# which should be about 15,999 too many for now as I'm currently only
# worried about Baidu (see mapping above).
limit_req_zone $limit_bots zone=badbots:1m rate=1r/m;
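# A rough sketch of the effect: the badbots limit is applied without a burst in
# the locations above, so a mapped bot sending 30 requests in a minute should
# see roughly 29 of them rejected (HTTP 503 by default).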
# Zone for limiting requests to dynamic pages like browse and discover that are
# of no use to bots and cause a high load on the server. For now I think I will
# use a limit of 12 requests per minute, because I imagine a human user could legitimately click a
# link every five seconds, but bots like Facebook are requesting 300/minute.
#
# The user's IP address is used as the key.
#
# See: https://www.nginx.com/blog/rate-limiting-nginx/
limit_req_zone $binary_remote_addr zone=dynamicpages:1m rate=12r/m;
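# A rough sketch of the effect: 12r/m works out to one request every five
# seconds, and the burst=5 used in the locations above means a short run of
# clicks is delayed rather than rejected; only sustained faster traffic (like
# the ~300/minute seen from Facebook) starts getting 503s.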
# vim: set ts=4 sw=4: