# Wikipedia

In [94]:
from urllib.parse import urlencode
import requests
from pymongo import MongoClient
import time

## MongoDB Setup

In [95]:
# Creates client with default server and port
client = MongoClient()

# Get database "wikipedia"
db = client.wikipedia

# Get references to *pages* and *categories* collections
mongo_pages = db.pages
mongo_categories = db.categories

## Download categories and pages from Wikipedia

In [62]:
# https://pt.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AArgentina&cmlimit=200
url_base = 'https://pt.wikipedia.org/w/api.php'
f = { 
    'action': 'query',
    'format': 'json',
    'list': 'categorymembers',
     'cmlimit': '200',
     'cmprop': 'ids|title|type'
}
def parameters_category_members(cmtitle = None, cmpageid = -1):
    fc = f.copy()
    if cmpageid != -1:
        fc['cmpageid'] = cmpageid
    else:
        fc['cmtitle'] = cmtitle
    return fc

def get_url(parameters):
    return f'{url_base}{parameters}'

def get_url_category(cmtitle):
    parameters_argentina = parameters_category_members('Category:Argentina')
    return get_url(urlencode(parameters_argentina))

def get_category(cmtitle = None, cmpageid = -1):
    params_category = parameters_category_members(cmtitle, cmpageid)
    category_r = requests.get(url_base, params=params_category)
    category_content = []
    if category_r.status_code == 200:
        category_content = category_r.json()
    return category_content

def get_category_members(cmtitle = None, cmpageid = -1):
    return get_category(cmtitle=cmtitle, cmpageid=cmpageid)['query']['categorymembers']

## Insert and retrieve data from MongoDB

- Categories status
    * Check: waiting for someone to check if it has to be downloaded
    * Waiting: waiting for dowload
    * Skeep: skeep download of this category
    * Done: downloaded
- Pages status
    * Waiting: waiting for download

In [136]:
def get_categories_for_download():
    waiting_criteria = { 'Download': 'Waiting' }
    return list(mongo_categories.find(waiting_criteria))

def add_categories_for_check(categories_for_check):
    add_categories_for_status(categories_for_check, 'Check')

def add_categories_for_download(categories_for_download):
    add_categories_for_status(categories_for_download, 'Waiting')

def add_categories_for_status(categories_for_status, status):
    if len(categories_for_status) > 0:
        for category_for_status in categories_for_status:
            category_for_status['Download'] = status
        mongo_categories.insert_many(categories_for_status)
    
def category_exists(cmpageid):
    return mongo_categories.count_documents({'pageid': cmpageid}) > 0

## Algorithm

In [137]:
def insert_pages(pages, cmpageid, cmtitle):
    if len(pages) > 0:
        for page in pages:
            page['category_id'] = cmpageid
            page['category_title'] = cmtitle
            page['download'] = 'Waiting'
        mongo_pages.insert_many(pages)

def download_subcategories(subcategories):
    if len(subcategories) > 0:
        for subcategory in subcategories:
            download_category_tree(subcategory)

def download_category_tree(category):
    # Sleep for 1 second before starting download one category tree
    time.sleep(1)

    cmpageid = category['pageid']
    cmtitle = category["title"]
    print(f'Retrieving pages for category {cmtitle} [{cmpageid}]')
    
    members = get_category_members(cmpageid=cmpageid)
    pages = [m for m in members if m['type'] == 'page']
    insert_pages(pages, cmpageid, cmtitle)
    subcats = [m for m in members if m['type'] == 'subcat']
    subcats_for_check = [s for s in subcats if not category_exists(s['pageid'])]
    add_categories_for_check(subcats_for_check)

### Teste para categoria *Geografia da Argentina*

In [132]:
members_categories_argentina = get_category_members(cmtitle='Category:Argentina')
pages_argentina = [m for m in members_categories_argentina if m['type'] == 'page']
subcategories_argentina = [m for m in members_categories_argentina if m['type'] == 'subcat']


In [135]:
add_categories_for_download(subcategories_argentina)

In [131]:
add_categories_for_download([{ 'pageid': 0, 'title': 'Categoria:Argentina', 'type': 'subcat'}])
download_subcategories(get_categories_for_download())

Retrieving pages for category Categoria:Argentina [0]


KeyError: 'query'