# In [1]:
import re
import logging
from bs4 import BeautifulSoup
from scrapy import Spider, Request
from json import dump
from os import makedirs, path
from datetime import datetime

# Root directory under which the scraped JSON files are written
SCRAPE_OUTPUT_DIR = "./Acts"

# Three-letter month abbreviations, indexed by (month number - 1)
months = [
    'Jan', 'Feb', 'Mar',
    'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep',
    'Oct', 'Nov', 'Dec',
]


def windows_file_name(title):
    """
    Turn a title into a string that is safe to use as a Windows file name.

    Carriage returns and line feeds are dropped first, then every character
    from the set ``\\ / : * ? " < > |`` is dropped as well.

    :param title:   Title of the decision
    :return:        Processed title
    """
    line_breaks = re.compile(r'[\r\n]')
    forbidden_chars = re.compile(r'[\\/:*?"<>|]')

    single_line = line_breaks.sub('', title)
    return forbidden_chars.sub("", single_line)


def save_output(data):
    """
    Write a decision to ``<SCRAPE_OUTPUT_DIR>/<year>/<month name>/<title>.json``.

    The dict is updated in place with an "identifier" key (copy of the title)
    and a "database" key ("acts") before it is serialised.

    :param data:    Decision dict; must contain "date" (``YYYY-MM-DD``) and "title"
    :return:        None
    """
    date = datetime.strptime(data["date"], "%Y-%m-%d")
    year = date.year
    month = date.strftime("%B")
    output_dir = f"{SCRAPE_OUTPUT_DIR}/{year}/{month}"

    # exist_ok avoids the check-then-create race of a separate isdir() test
    makedirs(output_dir, exist_ok=True)

    # Set DB name and identifier
    data["identifier"] = data["title"]
    data["database"] = "acts"

    file_id = windows_file_name(data["title"])
    file_name = f"{output_dir}/{file_id}.json"
    # Explicit encoding so the output does not depend on the platform default
    with open(file_name, "w", encoding="utf-8") as fl:
        # indent=0 puts each key on its own line without leading spaces
        dump(data, fl, indent=0)


def is_existing(data):
    """
    Tell whether the JSON file for a decision is already present on disk.

    The path is derived from the decision's date (year and full month name)
    and its title after Windows file-name clean-up.

    :param data:    Decision dict; must contain "date" (``YYYY-MM-DD``) and "title"
    :return:        True if the file exists, False otherwise
    """
    parsed = datetime.strptime(data["date"], "%Y-%m-%d")
    year, month = parsed.year, parsed.strftime("%B")
    output_dir = f"{SCRAPE_OUTPUT_DIR}/{year}/{month}"
    file_id = windows_file_name(data["title"])
    return path.isfile(f"{output_dir}/{file_id}.json")


class ScJurisprudenceSpider(Spider):
    """
    Crawl the e-library bookshelf index, follow the links for the requested
    period, and save every listed decision that is not already on disk.
    """
    name = "sc-jurisprudence"
    allowed_domains = ["elibrary.judiciary.gov.ph"]
    start_urls = ["https://elibrary.judiciary.gov.ph/thebookshelf/28"]

    custom_settings = {
        # Set up fake user agent to avoid getting blocked
        "USER_AGENT": "Mozilla/5.0",
        "FEED_EXPORT_ENCODING": "utf-8",
        "FEED_FORMAT": "json",
        # Disable robots.txt
        "ROBOTSTXT_OBEY": False,
        # Do not retry failed requests
        "RETRY_TIMES": 0
    }

    # --- CONFIGS --- #
    # Class-level values only; __init__ always sets both on the instance.
    year = datetime.now().year - 1
    month = 12

    def __init__(self, month=None, year=None, **kwargs):
        """
        Configure which period the spider crawls.

        :param month:   Month number 1-12 (int or numeric string); None crawls all months
        :param year:    Year to crawl (int or numeric string); None means the current year
        :param kwargs:  Other arguments, forwarded to ``Spider.__init__``
        :raises ValueError: If month is not a number between 1 and 12
        """
        if month is not None:
            month_number = int(month)
            # Reject 0 and negatives explicitly: negative list indexing would
            # otherwise silently pick a month (months[-1] is "Dec").
            if not 1 <= month_number <= len(months):
                raise ValueError(
                    f"month must be between 1 and {len(months)}, got {month!r}")
            self.month = months[month_number - 1]
        else:
            # Default to all months
            self.month = None

        if year is not None:
            self.year = int(year)
        else:
            # Default to current year
            self.year = datetime.now().year
        super().__init__(**kwargs)

    def parse(self, response):
        """
        Parse the index page and request each period page that matches the
        configured month and year.

        :param response:    Response for the start URL
        :return:            Generator of requests handled by parse_decision_list
        """
        soup = make_soup(response)

        # Visit each link and scrape the page containing the decisions
        for link in get_links(soup, self.month, self.year):
            # urljoin leaves absolute URLs untouched and resolves relative ones
            yield Request(response.urljoin(link), callback=self.parse_decision_list)

    def parse_decision_list(self, response):
        """
        Parse a listing page and request every decision not yet saved.

        :param response:    Response for a listing page
        :return:            Generator of requests handled by parse_decision
        """
        soup = make_soup(response)
        decisions = get_decisions_metadata(soup)
        logging.debug(f"Processing {len(decisions)} decisions")

        # Visit each decision and scrape the decision page
        for decision in decisions:
            if is_existing(decision):
                continue
            yield Request(
                response.urljoin(decision["url"]),
                callback=self.parse_decision,
                cb_kwargs={"decision": decision},
            )

    def parse_decision(self, response, decision):
        """
        Parse a decision page, add "type" and "text" to the metadata and save it.

        Pages without the content container or without a leading paragraph
        are logged and skipped instead of failing with an AttributeError.

        :param response:    Response for the decision page
        :param decision:    Metadata dict produced by get_decisions_metadata
        """
        soup = make_soup(response)
        container = soup.find("div", {"class": "single_content"})
        if container is None:
            logging.warning(f"No content container found at {response.url}; skipping")
            return

        # Type is taken from the last non-empty line of the h3 heading
        decision["type"] = self._extract_type(container.find("h3"))

        # The first p tag marks where the body starts
        first_paragraph = container.find("p")
        if first_paragraph is None:
            logging.warning(f"No paragraph found at {response.url}; skipping")
            return

        # Whole text is all the succeeding siblings of the first p tag
        decision["text"] = self._extract_text(first_paragraph.find_next_siblings())

        # Save output
        save_output(decision)

    @staticmethod
    def _extract_type(heading):
        """
        Return the last non-empty ``<br/>``-separated line of a heading with
        all spaces removed, or None when the heading is missing or empty.
        """
        if heading is None:
            # str(None) would otherwise be parsed and stored as the type "None"
            return None
        # Remove \r and \n, then split by break tag
        markup = re.sub(r'[\r\n]', '', str(heading))
        # Remove remaining tags and surrounding whitespace
        parts = [re.sub(r'<.*?>', '', part).strip() for part in markup.split('<br/>')]
        parts = [part for part in parts if part]
        if not parts:
            return None
        return parts[-1].replace(" ", "")

    @staticmethod
    def _extract_text(elements):
        """
        Flatten a sequence of elements into one whitespace-normalised string
        with all tags removed.
        """
        fragments = [str(element).strip() for element in elements]
        text = ' '.join(fragment for fragment in fragments if fragment)
        # Remove \r and \n
        text = re.sub(r'[\r\n]', ' ', text)
        # Remove tags
        text = re.sub(r'<.*?>', ' ', text)
        # Collapse multiple spaces, then trim both ends
        return re.sub(r'\s+', ' ', text).strip()


def make_soup(response):
    """
    Parse the raw body of a response with the lxml parser.
    :param response:    Response object whose ``body`` holds the markup
    :return:            BeautifulSoup object
    """
    markup = response.body
    parser_name = "lxml"
    return BeautifulSoup(markup, parser_name)


def get_links(soup, month=None, year=None):
    """
    Get links to the decisions for the specified month and year.

    A link is kept when its href contains the "<month>/<year>" fragment built
    from the arguments. Anchors without an href are ignored, and an empty list
    is returned when the page has no date container.

    :param soup:    BeautifulSoup object
    :param month:   Month abbreviation such as "Jan" (optional, defaults to all months)
    :param year:    Year to crawl (optional, defaults to current year). 0 for all years.
    :return:        List of links
    """
    # Year fragment: current year by default, empty string when 0 (all years)
    if year is None:
        year_part = str(datetime.now().year)
    elif year == 0:
        year_part = ""
    else:
        year_part = str(year)

    month_part = f"{month}/" if month is not None else ""
    time_period = month_part + year_part

    date_container = soup.find("div", {"id": "container_date"})
    if date_container is None:
        logging.warning("Date container not found; no links collected")
        return []

    logging.info(f"Getting links for {time_period}")
    # .get() tolerates anchors that carry no href attribute
    hrefs = (a.get("href") for a in date_container.find_all("a"))
    links = [href for href in hrefs if href and time_period in href]
    logging.info(f"Found {len(links)} links for {time_period}\n" + '\n'.join(links))

    return links


def get_decisions_metadata(soup):
    """
    Get metadata of the decisions in the current page.

    Entries that lack an expected element or carry an unparseable date are
    logged and skipped, so one malformed item does not discard the whole page.
    An empty list is returned when the page has no decision list.

    :param soup:    BeautifulSoup object
    :return:        List of dicts with "url", "title", "subtitle" and "date" (ISO format)
    """
    container = soup.find("div", {"id": "container_title"})
    listing = container.find("ul") if container is not None else None
    if listing is None:
        logging.warning("No decision list found on page")
        return []

    decisions = listing.find_all("li")
    metadata = []

    logging.info(f"Found {len(decisions)} decisions")

    for decision in decisions:
        try:
            current_metadata = _parse_decision_entry(decision)
        except (AttributeError, KeyError, TypeError, ValueError) as err:
            logging.warning(f"Skipping malformed decision entry: {err}")
            continue

        # Log divider
        logging.debug('-' * 50)

        # Append to list
        metadata.append(current_metadata)

    return metadata


def _parse_decision_entry(entry):
    """
    Extract the url, title, subtitle and ISO date from one listing item.

    :param entry:   List item element holding the decision anchor
    :return:        Dict with "url", "title", "subtitle" and "date"
    """
    raw_metadata = entry.find("a")

    # Get the URL
    current_metadata = {"url": raw_metadata["href"]}
    logging.debug(f"URL: {current_metadata['url']}")

    # Get the title
    current_metadata["title"] = raw_metadata.find("strong").text
    logging.debug(f"Title: {current_metadata['title']}")

    # Get parties
    parties = raw_metadata.find("small")
    current_metadata["subtitle"] = parties.text.strip()

    # The date is the text node right after the <small> element
    # (next_sibling replaces the deprecated nextSibling alias)
    date_text = parties.next_sibling.strip()
    current_metadata["date"] = datetime.strptime(date_text, "%B %d, %Y").date().isoformat()
    logging.debug(f"Date: {current_metadata['date']}")

    return current_metadata
