# Wikipedia

In [54]:
from urllib.parse import urlencode
import requests
from pymongo import MongoClient
import time
import pandas as pd

## MongoDB Setup

In [2]:
# Connect to the MongoDB server using the default host and port
client = MongoClient()

# Select the "wikipedia" database
db = client['wikipedia']

# Handles to the *pages* and *categories* collections
mongo_pages = db['pages']
mongo_categories = db['categories']

## Download categories and pages from Wikipedia

In [4]:
# Example request:
# https://pt.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AArgentina&cmlimit=200
url_base = 'https://pt.wikipedia.org/w/api.php'

# Query parameters shared by every category-members request
f = dict(
    action='query',
    format='json',
    list='categorymembers',
    cmlimit='200',
    cmprop='ids|title|type',
)
def parameters_category_members(cmtitle = None, cmpageid = -1):
    """Build the query parameters for one category-members request.

    Starts from a copy of the shared parameter dict ``f`` (which is left
    untouched) and adds exactly one category selector: ``cmpageid`` when a
    page id other than ``-1`` is given, otherwise ``cmtitle``.
    """
    if cmpageid == -1:
        selector = {'cmtitle': cmtitle}
    else:
        selector = {'cmpageid': cmpageid}
    return {**f, **selector}

def get_url(parameters):
    """Return the API URL for an already URL-encoded query string.

    ``parameters`` is appended to ``url_base`` after a ``?`` separator.
    When ``parameters`` is empty, the bare ``url_base`` is returned.
    """
    if not parameters:
        return url_base
    # The '?' separates the endpoint path from the query string
    return f'{url_base}?{parameters}'

def get_url_category(cmtitle):
    """Return the URL, as built by ``get_url``, for listing the members of ``cmtitle``.

    ``cmtitle`` is the category title, e.g. ``'Category:Argentina'``. The
    query parameters come from ``parameters_category_members`` and are
    URL-encoded before being handed to ``get_url``.
    """
    # Use the caller's category rather than a fixed one
    category_parameters = parameters_category_members(cmtitle)
    return get_url(urlencode(category_parameters))

def get_category(cmtitle = None, cmpageid = -1, continue_params = None, timeout = 30):
    """Fetch one batch of category members from the API at ``url_base``.

    cmtitle, cmpageid: category selector, forwarded to
        ``parameters_category_members``.
    continue_params: optional dict holding the ``continue`` object of a
        previous response; its entries are added to the request so the API
        returns the next batch.
    timeout: seconds passed to ``requests.get`` so a stalled connection
        raises instead of blocking forever.

    Returns the decoded JSON body when the response status is 200, and an
    empty list otherwise.
    """
    params_category = parameters_category_members(cmtitle, cmpageid)
    if continue_params:
        params_category.update(continue_params)
    category_r = requests.get(url_base, params=params_category, timeout=timeout)
    category_content = []
    if category_r.status_code == 200:
        category_content = category_r.json()
    return category_content

def get_category_members(cmtitle = None, cmpageid = -1):
    """Return all members of a category as a list of dicts.

    Requests batches through ``get_category`` and follows the ``continue``
    object of each response until the API stops returning one, so categories
    with more members than one batch (``cmlimit``) are returned in full.

    Raises RuntimeError when a batch could not be retrieved (non-200 status
    or a body without a ``query`` entry).
    """
    members = []
    continue_params = None
    while True:
        content = get_category(
            cmtitle=cmtitle, cmpageid=cmpageid, continue_params=continue_params
        )
        if not isinstance(content, dict) or 'query' not in content:
            # get_category returns [] on a non-200 status; an API-level
            # failure arrives as a body carrying 'error' instead of 'query'
            error = content.get('error') if isinstance(content, dict) else None
            raise RuntimeError(
                f'Could not retrieve members of category {cmtitle!r} [{cmpageid}]: {error}'
            )
        members.extend(content['query']['categorymembers'])
        # No 'continue' object means this was the last batch
        continue_params = content.get('continue')
        if not continue_params:
            return members

## Insert and retrieve data from MongoDB

- Categories status
    * Check: waiting for someone to check if it has to be downloaded
    * Waiting: waiting for download
    * Skeep: skip the download of this category
    * Done: downloaded
- Pages status
    * Waiting: waiting for download

In [29]:
def get_categories_for_download():
    """Return, as a list, every category document whose ``download`` field is ``'Waiting'``."""
    return list(mongo_categories.find({'download': 'Waiting'}))

def add_categories_for_check(categories_for_check):
    """Store the given categories with the ``'Check'`` status.

    Thin wrapper that delegates to ``add_categories_for_status``.
    """
    add_categories_for_status(categories_for_check, status='Check')

def add_category_for_download(pageid):
    """Set ``download`` to ``'Waiting'`` on one category document matching ``pageid``."""
    selector = {'pageid': pageid}
    change = {'$set': {'download': 'Waiting'}}
    mongo_categories.update_one(selector, change)

def set_category_done(pageid):
    """Set ``download`` to ``'Done'`` on one category document matching ``pageid``."""
    selector = {'pageid': pageid}
    change = {'$set': {'download': 'Done'}}
    mongo_categories.update_one(selector, change)

def add_categories_for_status(categories_for_status, status):
    """Tag each category with ``status`` and insert them all into the categories collection.

    Every item of ``categories_for_status`` is modified in place: its
    ``download`` key is set to ``status``. Nothing is inserted when the
    collection passed in is empty.
    """
    if len(categories_for_status) == 0:
        return
    for category in categories_for_status:
        category['download'] = status
    mongo_categories.insert_many(categories_for_status)
    
def category_exists(cmpageid):
    """Tell whether at least one category document has ``pageid`` equal to ``cmpageid``."""
    matches = mongo_categories.count_documents({'pageid': cmpageid})
    return matches > 0

## Algorithm

In [26]:
def insert_pages(pages, cmpageid, cmtitle):
    """Insert the pages of one category into the pages collection.

    Each page dict is modified in place before the insert: it receives the
    owning category's id (``category_id``) and title (``category_title``),
    and its ``download`` status is set to ``'Waiting'``. Nothing is inserted
    when ``pages`` is empty.
    """
    if len(pages) == 0:
        return
    for page in pages:
        page.update(
            category_id=cmpageid,
            category_title=cmtitle,
            download='Waiting',
        )
    mongo_pages.insert_many(pages)

def download_subcategories(subcategories):
    """Call ``download_category_tree`` on each subcategory, in the order given."""
    if len(subcategories) == 0:
        return
    for child in subcategories:
        download_category_tree(child)

def download_category_tree(category):
    """Download the direct members of one category and record them in MongoDB.

    ``category`` is a dict with at least ``pageid`` and ``title``. Members
    of type ``page`` are stored through ``insert_pages``. Members of type
    ``subcat`` get a ``parent`` key holding this category's page id, and
    those not yet present in the categories collection are stored with the
    ``'Check'`` status. Subcategories are not downloaded here. Finally the
    category itself is marked as done.
    """
    # Throttle: wait one second before each category request
    time.sleep(1)

    cmpageid, cmtitle = category['pageid'], category["title"]
    print(f'Retrieving pages for category {cmtitle} [{cmpageid}]')

    members = get_category_members(cmpageid=cmpageid)

    def of_type(kind):
        # Members whose 'type' field equals `kind`, in API order
        return [member for member in members if member['type'] == kind]

    insert_pages(of_type('page'), cmpageid, cmtitle)

    subcats = of_type('subcat')
    for subcat in subcats:
        subcat['parent'] = cmpageid
    unseen_subcats = [
        subcat for subcat in subcats if not category_exists(subcat['pageid'])
    ]
    add_categories_for_check(unseen_subcats)

    set_category_done(cmpageid)

### Inicializa todas as subcategorias da principal *Argentina*

In [7]:
# Fetch the direct members of the root category and split them by type
members_categories_argentina = get_category_members(cmtitle='Category:Argentina')
pages_argentina = [m for m in members_categories_argentina if m['type'] == 'page']
subcategories_argentina = [m for m in members_categories_argentina if m['type'] == 'subcat']

# Register only subcategories not stored yet, so that re-running this cell
# does not insert duplicate category documents
new_subcategories_argentina = [
    s for s in subcategories_argentina if not category_exists(s['pageid'])
]
add_categories_for_check(new_subcategories_argentina)

### Marca categorias para download

In [49]:
# Page ids of the categories selected for download
ids = [2547449, 148415, 178774, 1861964, 199887, 439525, 2037050, 2464777, 5359760, 2039114, 5079160, 1498369, 4152823, 65861, 5412037]
# The loop variable is not called `id`, which would shadow the builtin
# for every later cell in the kernel session
for category_pageid in ids:
    add_category_for_download(pageid=category_pageid)

### Executa download das páginas das categorias marcadas para Download

In [50]:
# Busca categorias marcadas para download
categories_for_download = get_categories_for_download()
print(f'{len(categories_for_download)} categoria(s) marcadas para download')

# Faz download de cada categoria
for c in categories_for_download:
    download_category_tree(c)

15 categoria(s) marcadas para download
Retrieving pages for category Categoria:História da Argentina [148415]
Retrieving pages for category Categoria:Política da Argentina [178774]
Retrieving pages for category Categoria:Sociedade da Argentina [1861964]
Retrieving pages for category Categoria:!Esboços sobre geografia da Argentina [199887]
Retrieving pages for category Categoria:Demografia da Argentina [439525]
Retrieving pages for category Categoria:Fronteiras da Argentina [2037050]
Retrieving pages for category Categoria:Geologia da Argentina [2464777]
Retrieving pages for category Categoria:Listas de geografia da Argentina [5359760]
Retrieving pages for category Categoria:Localidades da Argentina [2039114]
Retrieving pages for category Categoria:Meio ambiente da Argentina [2547449]
Retrieving pages for category Categoria:Pradarias da Argentina [5079160]
Retrieving pages for category Categoria:Subdivisões da Argentina [1498369]
Retrieving pages for category Categoria:Corpos de água da

### Analisar categorias para Check

In [51]:
# Load every category still in the 'Check' status, i.e. waiting for someone
# to decide whether it has to be downloaded
for_check = list(mongo_categories.find({ 'download': 'Check'}))

### Backup do que já foi feito download

In [52]:
# Categories: dump the whole categories collection to categories.csv
categories = list(mongo_categories.find())
categories_df = pd.DataFrame(categories)
categories_df.to_csv('categories.csv')

In [62]:
# Pages: dump the whole pages collection to pages.csv
pages = list(mongo_pages.find())
pages_df = pd.DataFrame(pages)
pages_df.to_csv('pages.csv')

In [61]:
# Size of the pages backup as (rows, columns)
pages_df.shape

(498, 8)