In [None]:
import re
import requests
from bs4 import BeautifulSoup

class Element:
    """A named Wikinews page referenced by its site-relative URL.

    ``name`` is the display text of the page and ``url`` is the path
    relative to the site root (for example ``/wiki/Category:Foo``).
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url

    def get_url(self):
        """Return the absolute URL of this element."""
        return Element.convert_url(self.url)

    @staticmethod
    def convert_url(relative_url):
        """Return the absolute en.wikinews.org URL for ``relative_url``."""
        # Format the argument itself: there is no instance in scope here.
        return 'https://en.wikinews.org{}'.format(relative_url)

class Article(Element):
    """A single article page; it stores only the name and URL handled by Element."""

    def __init__(self, name, url):
        # No article-specific state: delegate straight to the base initialiser.
        super().__init__(name=name, url=url)
        
class Category(Element):
    """A category page together with the hrefs of its members.

    ``sub_categories`` and ``articles`` are lists of site-relative hrefs;
    they are filled in by :meth:`parse`.
    """

    def __init__(self, name, url, sub_categories=None, articles=None):
        super().__init__(name, url)
        # Build a fresh list per instance; a literal ``[]`` default would be
        # shared by every Category created without explicit arguments.
        self.sub_categories = [] if sub_categories is None else sub_categories
        self.articles = [] if articles is None else articles

    def parse(self, articles_cache, categories_cache):
        """Download this category's page and record its members.

        Args:
            articles_cache: dict updated in place with ``href -> Article``
                for every article link found on the page.
            categories_cache: container of known category hrefs; only
                sub-category links present in it are kept.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        page_html = requests.get(self.get_url(), timeout=5)
        page_html.raise_for_status()
        page_content = BeautifulSoup(page_html.content, "html.parser")

        # Read and add articles. The section is absent when the category
        # lists no pages, so treat a missing div as "no articles".
        pages_section = page_content.find('div', {"id": "mw-pages"})
        article_links = pages_section.find_all('a', href=True) if pages_section is not None else []
        articles = {
            link['href']: Article(name=link.text, url=link['href'])
            for link in article_links
        }
        articles_cache.update(articles)
        self.articles = list(articles)

        # Read all sub-categories, keeping only those already known.
        subcategories_section = page_content.find('div', {"id": "mw-subcategories"})
        subcategory_links = (
            subcategories_section.find_all('a', href=True)
            if subcategories_section is not None else []
        )
        self.sub_categories = [
            link['href']
            for link in subcategory_links
            if link['href'] in categories_cache
        ]

    @staticmethod
    def from_wikinews(limit=2000, num_categories=-1):
        """Crawl Special:Categories and return the non-empty categories.

        Args:
            limit: number of entries requested per listing page.
            num_categories: stop requesting further pages once at least this
                many categories were collected; ``-1`` crawls the whole list.

        Returns:
            dict mapping each category href to its ``Category`` object.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        WIKINEWS = "https://en.wikinews.org/w/index.php?title=Special:Categories&limit={}&offset={}"
        href_prefix_length = len('/wiki/Category:')

        category_detector = re.compile(r'/wiki/Category:.+')
        number_detector = re.compile(r'\(([0-9,]+) member')
        categories = []
        offset = ''

        while num_categories == -1 or len(categories) < num_categories:
            page_response = requests.get(WIKINEWS.format(limit, offset), timeout=5)
            page_response.raise_for_status()
            page_content = BeautifulSoup(page_response.content, "html.parser").find('div', {"class": "mw-spcontent"})
            if page_content is None:
                break

            candidates = page_content.find_all('a', {"href": category_detector})
            for candidate in candidates:
                member_count = number_detector.search(candidate.parent.text)
                if member_count is None:
                    # No "(N members)" annotation next to the link: skip it.
                    continue
                if int(member_count.group(1).replace(',', '')) > 0:
                    categories.append(Category(url=candidate['href'], name=candidate.text))

            if not candidates or page_content.find('a', {'class': 'mw-nextlink'}) is None:
                break

            # Continue after the last entry *listed*, not the last one kept;
            # otherwise a page holding only empty categories never advances.
            next_offset = candidates[-1]['href'][href_prefix_length:]
            if next_offset == offset:
                break
            offset = next_offset

        return {category.url: category for category in categories}


In [None]:
# Fetch every non-empty category, then strip the ones that are not news topics.
categories = Category.from_wikinews()
print("Complete:\t\t", len(categories))

# Date categories: names starting with a month name, or consisting of a bare year.
# The trailing \b keeps names such as "Mayotte" or "Augusta" from being
# mistaken for dates.
date_regex = re.compile(r'^(January|February|March|April|May|June|July|August|September|October|November|December)\b')
year_regex = re.compile(r'^[0-9]{4}$')
filtered = {key: value for key, value in categories.items() if not date_regex.match(value.name)}
filtered = {key: value for key, value in filtered.items() if not year_regex.match(value.name)}
print("Without dates:\t\t", len(filtered))

# Anything mentioning "wiki"/"Wiki" anywhere in its name.
wiki_regex = re.compile(r'[wW]iki')
filtered = {key: value for key, value in filtered.items() if not wiki_regex.search(value.name)}
print("Without 'wiki':\t\t", len(filtered))

# Remaining noise keywords, matched case-insensitively anywhere in the name.
wiki_other_noise = re.compile(
    r'Dialog |Files|Sockpuppets|Sources/|Requests for|Peer reviewed|Non-|News articles|Media'
    r'|Categor(?:y|ies)|CC[- ]BY|Assistant|Page|Checkuser|Template|WWC|Failed|Abandoned'
    r'|Local|Live|Wikinewsie|User|UoW',
    re.IGNORECASE,
)
filtered = {key: value for key, value in filtered.items() if not wiki_other_noise.search(value.name)}
print("Without noise:\t\t", len(filtered))

# TODO: patterns that still need to be filtered out:
# Move To Commons
# Module documentation
# No publish
# Disputed
# Archive[sd]

print()
for category in filtered.values():
    print(category.name)