In [None]:
import re
import pickle
from pathlib import Path
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

class Element:
    """Base class for a named Wikinews page addressed by a site-relative URL.

    Args:
        name: Display name; leading and trailing whitespace is stripped.
        url: URL of the page, stored unchanged. ``get_url`` treats it as
            relative to ``https://en.wikinews.org``.
    """

    def __init__(self, name, url):
        self.name = name.strip()
        self.url = url

    def get_url(self):
        """Return the absolute URL built from ``self.url``."""
        return Element.convert_url(self.url)

    @staticmethod
    def convert_url(relative_url):
        """Prefix *relative_url* with the Wikinews scheme and host.

        Declared as a static method so it can be called both on the class
        (``Element.convert_url(u)``) and on an instance (``elem.convert_url(u)``).
        """
        return 'https://en.wikinews.org{}'.format(relative_url)

    def __repr__(self):
        # Same representation as str() so containers of elements print readably.
        return self.__str__()

    def __str__(self):
        return '"{}"'.format(self.name)
    
class Article(Element):
    """A link found on a category page, identified by its name and URL."""

    # Name prefixes that cause a link to be rejected by is_meaningfull().
    _EXCLUDED_PREFIXES = (
        'Template',
        'Portal',
        'User',
        'Talk',
        'Help',
        'TEST',
        'Category',
        'Module',
        'News',
    )

    def __init__(self, name, url):
        super().__init__(name, url)

    def is_meaningfull(self):
        """Return True when this link should be kept as an article.

        A link is kept only if all of the following hold:
          * its URL starts with ``/wiki/``;
          * its name does not start with any of the excluded prefixes;
          * its name does not contain ``Wikinews``.
        """
        return (
            self.url.startswith('/wiki/')
            and not self.name.startswith(self._EXCLUDED_PREFIXES)
            and 'Wikinews' not in self.name
        )

    def __str__(self):
        return f'"{self.name}"'
        
class Category(Element):
    """A Wikinews category plus the article and sub-category URLs it links to.

    Args:
        name: Display name of the category (whitespace-stripped by Element).
        url: Site-relative URL of the category page.
        sub_categories: Optional list of sub-category URLs. A fresh empty
            list is created per instance when omitted.
        articles: Optional list of article URLs. A fresh empty list is
            created per instance when omitted.
    """

    def __init__(self, name, url, sub_categories=None, articles=None):
        super().__init__(name, url)
        # None sentinels instead of `[]` defaults: a list literal in the
        # signature would be one object shared by every instance.
        self.sub_categories = [] if sub_categories is None else sub_categories
        self.articles = [] if articles is None else articles

    def parse(self, articles_cache, categories_cache):
        """Download this category's page and record its children.

        Args:
            articles_cache: dict updated in place with ``url -> Article`` for
                every meaningful article linked from the ``mw-pages`` div.
            categories_cache: container of known category URLs; only links in
                the ``mw-subcategories`` div whose href is in it are kept.

        Returns:
            Always True.

        ``self.articles`` / ``self.sub_categories`` are only replaced when
        the corresponding div is present on the page. Errors raised by
        ``requests.get`` are not caught here.
        """
        response = requests.get(self.get_url(), timeout=5)
        page_content = BeautifulSoup(response.content, "html.parser")

        # Parse and add unknown articles
        mw_pages = page_content.find('div', {"id": "mw-pages"})
        if mw_pages is not None:
            candidates = [
                Article(name=link.text, url=link['href'])
                for link in mw_pages.find_all('a')
            ]
            articles = {
                article.url: article
                for article in candidates
                if article.is_meaningfull()
            }
            articles_cache.update(articles)

            # Children are stored as URLs (keys into the articles cache).
            self.articles = list(articles)

        mw_subcategories = page_content.find('div', {"id": "mw-subcategories"})
        if mw_subcategories is not None:
            self.sub_categories = [
                link['href']
                for link in mw_subcategories.find_all('a')
                if link['href'] in categories_cache
            ]

        return True

    @staticmethod
    def from_wikinews(limit=2000, num_categories=-1):
        """Collect non-empty categories from the Special:Categories listing.

        Args:
            limit: Value of the ``limit`` query parameter for each request.
            num_categories: Stop requesting further pages once at least this
                many categories were collected; ``-1`` means no bound. The
                bound is only checked between pages, so the result may hold
                more entries than requested.

        Returns:
            dict mapping each category URL to its ``Category``. Duplicate
            URLs collapse to one entry.
        """
        listing_url = "https://en.wikinews.org/w/index.php?title=Special:Categories&limit={}&offset={}"
        # The offset parameter is the category href without this prefix.
        prefix_length = len('/wiki/Category:')

        category_detector = re.compile(r'/wiki/Category:.+')
        number_detector = re.compile(r'\(([0-9,]+) member')
        categories = []
        previous_offset = None

        while num_categories == -1 or len(categories) < num_categories:
            offset = categories[-1].url[prefix_length:] if categories else ''
            if offset == previous_offset:
                # No progress since the last request: asking for the same
                # page again would repeat forever.
                break
            previous_offset = offset

            page_response = requests.get(listing_url.format(limit, offset), timeout=5)
            page_content = BeautifulSoup(page_response.content, "html.parser").find('div', {"class": "mw-spcontent"})

            for candidate in page_content.find_all('a', {"href": category_detector}):
                # The member count is read from the text of the link's parent.
                count_match = number_detector.search(candidate.parent.text)
                if count_match is None:
                    # No "(N member" text next to this link; skip it rather
                    # than crash on a missing match.
                    continue
                num_childs = int(count_match.group(1).replace(',', ''))
                if num_childs > 0:
                    categories.append(Category(url=candidate['href'], name=candidate.text))

            if page_content.find('a', {'class': 'mw-nextlink'}) is None:
                break

        return {category.url: category for category in categories}

    def __str__(self):
        return '"{}":\n\tArticles: {}\n\tSubcategories: {}\n'.format(self.name, self.articles, self.sub_categories)

In [None]:
def load_categories():
    """Fetch the Wikinews categories and filter out unwanted ones by name.

    Filtering happens in stages, printing the remaining count after each:
      1. names starting with a month name, and names that are a 4-digit year;
      2. names containing ``wiki`` or ``Wiki``;
      3. names containing one of the noise keywords (case-insensitive).

    Returns:
        dict mapping category URL to Category for the categories that
        survive every stage.
    """
    # Raw strings throughout: the previous non-raw literals relied on the
    # invalid escape sequence `\/`.
    date_regex = re.compile(r'^((January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December))')
    year_regex = re.compile(r'^[0-9]{4}$')
    # `[wW]`, not `[w|W]`: inside a character class `|` is a literal pipe.
    wiki_regex = re.compile(r'.*[wW]iki.*')
    # NOTE(review): `Categor[y|(ies)]` is a character class (one of the
    # characters y | ( i e s ) must follow "Categor"), not the alternation
    # "Category|Categories". It is kept as-is because tightening it would
    # stop matching names such as "Uncategorized" and change the result.
    # `Pages*` likewise matches "Page" followed by any number of "s".
    wiki_other_noise = re.compile(
        r'.*('
        r'(Dialog )|(Files)|(Sockpuppets)|(Sources/)|(Requests for)|(Peer reviewed)'
        r'|(Non-)|(News articles)|(Media)|(Categor[y|(ies)])|(CC[- ]BY)|(Assistant)'
        r'|(Pages*)|(Checkuser)|(Template)|(WWC)|(Failed)|(Abandoned)|(Local)|(Live)'
        r'|(Wikinewsie)|(User)|(UoW)'
        r').*',
        re.IGNORECASE,
    )

    def without(candidates, pattern):
        # Keep only the entries whose category name does not match pattern.
        return {
            key: value
            for key, value in candidates.items()
            if not pattern.match(value.name)
        }

    categories = Category.from_wikinews()
    print("Complete:\t\t", len(categories))

    filtered = without(without(categories, date_regex), year_regex)
    print("Without dates:\t\t", len(filtered))

    filtered = without(filtered, wiki_regex)
    print("Without 'wiki':\t\t", len(filtered))

    filtered = without(filtered, wiki_other_noise)
    print("Without noise:\t\t", len(filtered))

    # TO BE REMOVED (presumably name patterns not yet filtered -- confirm):
    # Move To Commons
    # Module documentation
    # No publish
    # Disputed
    # Archive[s|d]

    return filtered

# On-disk cache holding the pickled (categories, articles) pair.
FILE_CACHE = Path("wikinews.pickle")

if FILE_CACHE.exists():
    # Cache hit: restore both dictionaries without touching the network.
    # NOTE: unpickling runs code embedded in the file, so this cache must
    # only ever be a file written by this script.
    print("Loading from cache...")
    with FILE_CACHE.open("rb") as file:
        categories, articles = pickle.load(file)
else:
    # Cache miss: crawl the category listing, then every category page.
    print("Loading from Wikinews...")
    categories = load_categories()
    articles = {}

    # parse() fills `articles` in place and records children on each category.
    for category in categories.values():
        category.parse(articles, categories)

    # Persist the result only after the whole crawl has finished.
    with FILE_CACHE.open("wb") as file:
        pickle.dump((categories, articles), file)