In [6]:
import re
import pickle
from pathlib import Path
from multiprocessing import Pool
import json

import requests
from bs4 import BeautifulSoup

class Element:
    def __init__(self, url):
        self.url = url
    
    def get_url(self):
        return Element.convert_url(self.url)
        
    def convert_url(relative_url):
        return 'https://en.wikinews.org{}'.format(relative_url)

    def __repr__(self):
        return self.__str__()
    
    def __str__(self):
        return '"{}"'.format(self.url)
    
class Article(Element):
    def __init__(self, url, name=None, categories=[]):
        super().__init__(url)
        
        if name is not None:
            self.name = name.strip()
            self.categories = categories
        else:
            self.name, self.categories = Article.parse(url)
    
    def is_meaningfull(self):
        if not self.url.startswith('/wiki/'):
            return False

        if self.name.startswith('Template') \
        or self.name.startswith('Portal') \
        or self.name.startswith('User') \
        or self.name.startswith('Talk') \
        or self.name.startswith('Help') \
        or self.name.startswith('TEST') \
        or self.name.startswith('Category') \
        or self.name.startswith('Module') \
        or self.name.startswith('News'):
            return False

        if 'Wikinews' in self.name:
            return False

        return True

    def parse(url):    
        json_url = "https://en.wikinews.org/w/api.php?action=query&titles={}&prop=categories&format=json".format(
            url[6:]
        )
        
        response = json.loads(requests.get(json_url, timeout=5).content)
        pages = response['query']['pages']
        if len(pages) != 1:
            raise ""
        page = next(iter(pages.values()))
        
        categories = page['categories'] if 'categories' in page else []
        return page['title'], [category['title'][9:] for category in categories]
            
    def __str__(self):
        return '"{}"'.format(self.name)
        
class Category(Element):
    def __init__(self, url, name=None, sub_categories=[], articles=[]):
        super().__init__(url)
        self.name = name
        self.sub_categories = sub_categories
        self.articles = articles
    
    def parse(self, articles_cache, categories_cache, check_article=True, check_categories=True):
        page_html = requests.get(self.get_url(), timeout=5)
        page_content = BeautifulSoup(page_html.content, "html.parser")

        if self.name is None:
            title_element = page_content.find('h1')
            if title_element is not None:
                self.name = title_element.text[9:].strip()
                
        # Parse and add unknown articles
        mw_pages = page_content.find('div', {"id": "mw-pages"})
        if mw_pages is not None:
            article_candidates = [Article(name=article.text, url=article['href']) for article in mw_pages.findAll('a')]
            articles = {
                article.url: article
                for article in article_candidates
                if (not check_article) or article.is_meaningfull()
            }
            articles_cache.update(articles)
        
            # Add childs
            self.articles = list(articles.keys())
        
        mw_subcategories = page_content.find('div', {"id": "mw-subcategories"})
        if mw_subcategories is not None:
            self.sub_categories = [
                category['href']
                for category in mw_subcategories.findAll('a')
                if (not check_categories) or category['href'] in categories_cache
            ]
            
            if not check_categories:
                new_categories = [
                    Category(category_url) for category_url in self.sub_categories 
                    if category_url not in categories_cache
                ]
                
                categories_cache.update({
                    category.url: category 
                    for category in new_categories 
                    if category.parse(articles_cache, categories_cache, check_article, check_categories)
                })
            
        return True
    
    def from_urls(categories):
        articles_tmp = {}
        categories_tmp = {}
        return [
            category 
            for category in (Category(category) for category in categories)
            if category.parse(articles_tmp, categories_tmp, False, False)
        ], categories_tmp, articles_tmp
            
    def from_wikinews(limit = 2000, num_categories=-1):
        WIKINEWS = "http://en.wikinews.org/w/index.php?title=Special:Categories&limit={}&offset={}"

        category_detector = re.compile('\/wiki\/Category:.+')
        number_detector = re.compile('\(([0-9,]+) ')
        categories = []

        while (num_categories == -1 or len(categories) < num_categories):
            url = WIKINEWS.format(limit, categories[-1].url[15:] if len(categories) > 0 else '')
            page_response = requests.get(url, timeout=5)
            page_content = BeautifulSoup(page_response.content, "html.parser").find('div', {"class": "mw-spcontent"})

            for candidate in page_content.find_all('a', {"href": category_detector}):
                num_childs = int(number_detector.search(candidate.parent.text).group(1).replace(',', ''))
                if num_childs > 0:
                    categories.append(Category(url=candidate['href']))

            if page_content.find('a', {'class':'mw-nextlink'}) is None:
                break

        return { category.url : category for category in categories }

    def filter_categories(categories, min_articles):
        date_regex = re.compile('^((January)|(February)|(March)|(April)|(May)|(June)|(July)|(August)|(September)|(October)|(November)|(December))')
        year_regex = re.compile('^[0-9]{4}$')
        wiki_regex = re.compile('.*[w|W]iki.*')
        wiki_other_noise = re.compile(""".*(
        (Archived)|
        (Dialog )|
        (Files)|
        (Sockpuppets)|
        (Sources\/)|
        (Requests for)|
        (Peer reviewed)|
        (Non-)|
        (articles?)|
        (Media)|
        (Categor[y|(ies)])|
        (CC[-| ]BY)|
        (Assistant)|
        (Pages*)|
        (Checkuser)|
        (Template)|
        (WWC)|
        (Failed)|
        (Abandoned)|
        (Local)|
        (Live)|
        (Wikinewsie)|
        (User)|
        (UoW)|
        (No publish)|
        (Published)|
        (AutoArchived)|
        (Original reporting)|
        (Audio reports)|
        (Out of date stories)|
        (News of the World)|
        (Writing Contests)
        ).*""", re.IGNORECASE | re.VERBOSE)
        
        filtered = {
            key: value for key, value in categories.items() 
            if not date_regex.match(key) 
            and not year_regex.match(key) 
            and not wiki_regex.match(key) 
            and not wiki_other_noise.match(key)
        }

        filtered = {key: value for key, value in filtered.items() if len(value) >= min_articles}
        return filtered

    def __str__(self):
        return '"{}":\n\tArticles: {}\n\tSubcategories: {}\n'.format(self.name, self.articles, self.sub_categories)

In [10]:
from knowledgestore import ks
from collections import defaultdict

def articles_to_categories(articles):
    categories = defaultdict(set)
    for article in articles:
        for category in article.categories:
            categories[category].add(article)
    return categories

def create_article(link):
    return Article(link[22:])

FILE_CACHE = Path("articles.pickle")
if not FILE_CACHE.exists():
    print("Generating articles for KnowledgeStore")
    with Pool(6) as pool:
        articles = pool.map(create_article, ks.get_all_resource_uris())
    with open(FILE_CACHE, "wb") as file:
        pickle.dump(articles, file)
else:
    with open(FILE_CACHE, "rb") as file:
        articles = pickle.load(file)
    
print("{} articles loaded".format(len(articles)))

categories = articles_to_categories(articles)
print("Categories before filtering: \t", len(categories))
categories = Category.filter_categories(categories, 5)
print("Categories after filtering: \t", len(categories))

19737 articles loaded
Categories before filtering: 	 5833
Categories after filtering: 	 1612


In [25]:
import random

class User:
    def __init__(self, categories, num_interests):
        self.__categories = categories
        self.interests = random.sample(categories.keys(), num_interests)
    
    def get_positive_sample(self, num_articles_per_interest):
        interests_articles = [
            random.sample(self.__categories[interest], num_articles_per_interest + 1) 
            for interest in self.interests
        ]
        
        input_data = []
        true_labels = []
        for articles in interests_articles:
            true_labels.append(articles[0])
            input_data.extend(articles[1:])
            
        random.shuffle(input_data)
        return input_data, random.choice(true_labels)
    
    def get_negative_sample(self):
        candidates = set()
        while len(candidates) == 0:
            negative_category = random.choice(tuple(categories.keys()))
            if negative_category not in self.interests:
                candidates = categories[negative_category]
                for interest in self.interests:
                    candidates = candidates.difference(categories[interest])
        return random.choice(tuple(candidates))
    
user = User(categories, 2)
interesting_articles, positive_sample = user.get_positive_sample(3)
print("Interests:\t\t", user.interests)
for interesting_article in interesting_articles:
    print("\t\t\t", interesting_article)
print("Positive sample:\t", positive_sample)
print("Negative sample:\t", user.get_negative_sample())

Interests:		 ['Sony', 'Nairobi']
			 "Samsung to sell dual-standard DVD player"
			 "American console sales continue to decline throughout April"
			 "Sudanese parties sign peace pledge"
			 "Mob attack on church in Kenya leaves 30 dead"
			 "Robot goes to preschool"
			 "Bombing in Kenya's capital city Nairobi"
Positive sample:	 "Sony may cut PS3 prices"
Negative sample:	 "A weak spot in HIV spotted"
