# Wikipedia

In [1]:
from urllib.parse import urlencode
import requests
from pymongo import MongoClient
import time
import pandas as pd

## MongoDB Setup

In [2]:
# Connect to MongoDB using the driver's default host and port
client = MongoClient()

# Select the "wikipedia" database
db = client['wikipedia']

# Handles to the *pages* and *categories* collections
mongo_pages = db['pages']
mongo_categories = db['categories']

## Download categories and pages from Wikipedia

In [3]:
# Example request:
# https://pt.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AArgentina&cmlimit=200
url_base = 'https://pt.wikipedia.org/w/api.php'

# Query parameters shared by every categorymembers request
f = dict(
    action='query',
    format='json',
    list='categorymembers',
    cmlimit='200',
    cmprop='ids|title|type',
)
def parameters_category_members(cmtitle = None, cmpageid = -1):
    """Build the query parameters for one categorymembers request.

    Starts from a copy of the shared defaults in ``f`` and adds exactly one
    category selector: ``cmpageid`` when it differs from the ``-1`` sentinel,
    otherwise ``cmtitle``. The shared ``f`` dict is never modified.
    """
    if cmpageid == -1:
        selector = {'cmtitle': cmtitle}
    else:
        selector = {'cmpageid': cmpageid}
    return {**f, **selector}

def get_url(parameters):
    """Return the full API URL for an already URL-encoded query string.

    Parameters
    ----------
    parameters : str
        Query string without the leading ``?`` (e.g. the output of
        ``urlencode``).

    Returns
    -------
    str
        ``url_base`` followed by ``?`` and the query string, or just
        ``url_base`` when the query string is empty.
    """
    if not parameters:
        return url_base
    # The '?' separator is required between the endpoint and the query string;
    # without it the parameters are glued onto the path ('.../api.phpaction=...').
    return f'{url_base}?{parameters}'

def get_url_category(cmtitle):
    """Return the categorymembers API URL for the category titled ``cmtitle``.

    Parameters
    ----------
    cmtitle : str
        Category title, e.g. ``'Category:Argentina'``.

    Returns
    -------
    str
        The URL produced by ``get_url`` for the URL-encoded parameters.
    """
    # Use the requested title; it was previously hard-coded to 'Category:Argentina'.
    parameters_category = parameters_category_members(cmtitle)
    return get_url(urlencode(parameters_category))

def get_category(cmtitle = None, cmpageid = -1, continuation = None):
    """Fetch one batch of category members from the Wikipedia API.

    Parameters
    ----------
    cmtitle : str, optional
        Category title; used only when ``cmpageid`` is left at ``-1``.
    cmpageid : int, optional
        Category page id; when given it is used instead of ``cmtitle``.
    continuation : dict, optional
        The ``continue`` object of a previous response. Its entries are merged
        into the request parameters so the API returns the next batch.

    Returns
    -------
    dict or list
        The decoded JSON response on HTTP 200, otherwise an empty list.
    """
    params_category = parameters_category_members(cmtitle, cmpageid)
    if continuation:
        params_category.update(continuation)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    category_r = requests.get(url_base, params=params_category, timeout=30)
    category_content = []
    if category_r.status_code == 200:
        category_content = category_r.json()
    return category_content

def get_category_members(cmtitle = None, cmpageid = -1):
    """Return all members of a category, following API continuation.

    A single request returns at most ``cmlimit`` members; while the response
    carries a ``continue`` object, further batches are requested and appended.

    Parameters
    ----------
    cmtitle : str, optional
        Category title; used only when ``cmpageid`` is left at ``-1``.
    cmpageid : int, optional
        Category page id; when given it is used instead of ``cmtitle``.

    Returns
    -------
    list of dict
        The concatenated ``query.categorymembers`` entries of every batch.

    Raises
    ------
    RuntimeError
        If any batch fails (non-200 status) or has no ``query`` section, so
        a partial or empty member list is never mistaken for a real result.
    """
    members = []
    continuation = None
    while True:
        content = get_category(cmtitle=cmtitle, cmpageid=cmpageid, continuation=continuation)
        if not isinstance(content, dict) or 'query' not in content:
            raise RuntimeError(
                f'Could not list members of category {cmtitle!r} [{cmpageid}]: {content!r}'
            )
        members.extend(content['query']['categorymembers'])
        # The API omits 'continue' once the last batch has been delivered.
        continuation = content.get('continue')
        if not continuation:
            return members

## Insert and retrieve data from MongoDB

- Categories status
    * Check: waiting for someone to decide whether it should be downloaded
    * Waiting: waiting for download
    * Skeep: skip the download of this category (the stored status value is spelled `Skeep`)
    * Done: downloaded
- Pages status
    * Waiting: waiting for download

In [24]:
def get_categories_for_download():
    """Return every category document whose ``download`` status is 'Waiting'."""
    cursor = mongo_categories.find({ 'download': 'Waiting' })
    return [category for category in cursor]

def add_categories_for_check(categories_for_check):
    """Insert the given category documents with ``download`` status 'Check'."""
    add_categories_for_status(
        categories_for_status=categories_for_check,
        status='Check',
    )

def add_category_for_download(pageid):
    """Set the ``download`` status of the category with ``pageid`` to 'Waiting'."""
    set_category_status(pageid, category_status='Waiting')

def set_category_done(pageid):
    """Set the ``download`` status of the category with ``pageid`` to 'Done'."""
    set_category_status(pageid, category_status='Done')

def set_category_skeep(pageid):
    """Set the ``download`` status of the category with ``pageid`` to 'Skeep'."""
    skeep_status = 'Skeep'
    set_category_status(pageid, skeep_status)

def set_category_status(pageid, category_status):
    """Write ``category_status`` into the ``download`` field of one category.

    The first category document matching ``pageid`` is updated.
    """
    selector = { "pageid": pageid }
    change = { "$set": { 'download': category_status } }
    mongo_categories.update_one(selector, change)

def add_categories_for_status(categories_for_status, status):
    """Tag each category document with ``status`` and insert them all.

    The documents are modified in place (their ``download`` key is set).
    Nothing is inserted when the collection of documents is empty.
    """
    if len(categories_for_status) == 0:
        return
    for category in categories_for_status:
        category.update(download=status)
    mongo_categories.insert_many(categories_for_status)
    
def category_exists(cmpageid):
    """Return True when at least one stored category has ``pageid == cmpageid``."""
    matching = mongo_categories.count_documents({'pageid': cmpageid})
    return matching > 0

## Algorithm

In [13]:
def insert_pages(pages, cmpageid, cmtitle):
    """Insert page documents, linking each one to its source category.

    Every page is modified in place: it receives the category id and title
    and a ``download`` status of 'Waiting'. Nothing is inserted when ``pages``
    is empty.
    """
    if len(pages) == 0:
        return
    for page in pages:
        page.update({
            'category_id': cmpageid,
            'category_title': cmtitle,
            'download': 'Waiting',
        })
    mongo_pages.insert_many(pages)

def download_subcategories(subcategories, country = None):
    """Download the category tree of every given subcategory.

    Parameters
    ----------
    subcategories : list of dict
        Category documents, each passed to ``download_category_tree``.
    country : str, optional
        Country label forwarded to ``download_category_tree``. When omitted,
        each subcategory's own ``country`` entry is used (``None`` if absent).
    """
    for subcategory in subcategories:
        # download_category_tree requires a country; it was previously called
        # with the category alone, which raised TypeError.
        subcategory_country = country if country is not None else subcategory.get('country')
        download_category_tree(subcategory, subcategory_country)

def download_category_tree(category, country):
    """Download the direct members of one category and store them in MongoDB.

    Member pages are tagged with ``country`` and inserted via ``insert_pages``.
    Member subcategories are tagged with their parent id and ``country``; those
    not yet stored are inserted with status 'Check'. Finally the category
    itself is marked 'Done'.

    Parameters
    ----------
    category : dict
        Category document; its ``pageid`` and ``title`` entries are read.
    country : str
        Country label written onto every page and subcategory.
    """
    # Pause for one second before each category download
    time.sleep(1)

    cmpageid, cmtitle = category['pageid'], category["title"]
    print(f'Retrieving pages for category {cmtitle} [{cmpageid}]')

    members = get_category_members(cmpageid=cmpageid)

    # Pages: label with the country, then store
    pages = list(filter(lambda member: member['type'] == 'page', members))
    for page in pages:
        page.update({'country': country})
    insert_pages(pages, cmpageid, cmtitle)

    # Subcategories: record the parent and country, then queue the unseen ones
    subcats = list(filter(lambda member: member['type'] == 'subcat', members))
    for subcat in subcats:
        subcat.update({'parent': cmpageid, 'country': country})
    unseen_subcats = [subcat for subcat in subcats if not category_exists(subcat['pageid'])]
    add_categories_for_check(unseen_subcats)

    set_category_done(cmpageid)

### Inicializa todas as subcategorias da principal *Argentina*

In [7]:
members_categories_argentina = get_category_members(cmtitle='Category:Argentina')
# Tag every member with its country so the stored categories can be found by
# country-filtered queries; this cell previously stored them without the field.
for mca in members_categories_argentina:
    mca['country'] = 'Argentina'
pages_argentina = [m for m in members_categories_argentina if m['type'] == 'page']
subcategories_argentina = [m for m in members_categories_argentina if m['type'] == 'subcat']
# Only the subcategories are queued, with status 'Check'
add_categories_for_check(subcategories_argentina)

### Inicializa todas as subcategorias da principal Chile

In [8]:
members_categories_chile = get_category_members(cmtitle='Category:Chile')
# Tag every returned member (pages and subcategories alike) with its country
for member_chile in members_categories_chile:
    member_chile.update(country='Chile')
# Only the subcategories are queued, with status 'Check'
subcategories_chile = list(
    filter(lambda member: member['type'] == 'subcat', members_categories_chile)
)
add_categories_for_check(subcategories_chile)

### Inicializa todas as subcategorias da Nova Zelândia

In [46]:
members_categories_nz = get_category_members(cmtitle='Categoria:Nova_Zelândia')
# Tag every returned member (pages and subcategories alike) with its country
for member_nz in members_categories_nz:
    member_nz.update(country='Nova Zelandia')
# Only the subcategories are queued, with status 'Check'
subcategories_nz = list(
    filter(lambda member: member['type'] == 'subcat', members_categories_nz)
)
add_categories_for_check(subcategories_nz)

In [None]:
# Categoria:Estados_Unidos

### Marca categorias para download

In [51]:
# Page ids of the categories to flag as 'Waiting' for download
ids = [994848, 134030, 924189, 3177252, 4131772, 3455506, 3206668]
# The loop variable is not called `id`: at notebook top level that would
# shadow the builtin id() for every later cell.
for category_pageid in ids:
    add_category_for_download(pageid=category_pageid)

In [58]:
# Category ids whose already-stored pages get the country label set
ids_fix = [994848, 134030, 924189, 3177252, 4131772, 3455506, 3206668]
country_patch = {"$set": { "country": "Nova Zelandia"}}
for fixed_category_id in ids_fix:
    # Disabled: the equivalent patch for child categories, matched by parent id
    # mongo_categories.update_many({"parent": fixed_category_id}, country_patch)
    mongo_pages.update_many({"category_id": fixed_category_id}, country_patch)

### Executa download das páginas das categorias marcadas para Download

In [52]:
# Fetch the categories flagged for download
categories_for_download = get_categories_for_download()
print(f'{len(categories_for_download)} categoria(s) marcadas para download')

# Download each flagged category in turn
for flagged_category in categories_for_download:
    download_category_tree(flagged_category, country='Nova Zelandia')

7 categoria(s) marcadas para download
Retrieving pages for category Categoria:Cultura da Nova Zelândia [994848]
Retrieving pages for category Categoria:Geografia da Nova Zelândia [134030]
Retrieving pages for category Categoria:História da Nova Zelândia [924189]
Retrieving pages for category Categoria:Listas da Nova Zelândia [3177252]
Retrieving pages for category Categoria:Meio ambiente da Nova Zelândia [4131772]
Retrieving pages for category Categoria:!Esboços sobre a Nova Zelândia [3455506]
Retrieving pages for category Categoria:!Portal Nova Zelândia [3206668]


### Analisar categorias para Check

In [55]:
# Page ids of categories to mark as 'Skeep' (none listed here)
for_skeep = []
for skeep_pageid in for_skeep:
    set_category_skeep(skeep_pageid)

# Categories of this country still awaiting a manual decision
check_filter = { 'download': 'Check', 'country': 'Nova Zelandia'}
for_check = list(mongo_categories.find(check_filter))

In [56]:
# Ids already flagged for download, for reference:
# [994848, 134030, 924189, 3177252, 4131772, 3455506, 3206668]
[(category['pageid'], category['title']) for category in for_check]

[(4562929, 'Categoria:Ciência e tecnologia na Nova Zelândia'),
 (486820, 'Categoria:Economia da Nova Zelândia'),
 (2039047, 'Categoria:Educação na Nova Zelândia'),
 (1646640, 'Categoria:Forças armadas da Nova Zelândia'),
 (6080643, 'Categoria:Fósseis da Nova Zelândia'),
 (893777, 'Categoria:Mídia da Nova Zelândia'),
 (563782, 'Categoria:Niue'),
 (423663, 'Categoria:Política da Nova Zelândia'),
 (3922680, 'Categoria:Sociedade da Nova Zelândia'),
 (2766537, 'Categoria:Toquelau'),
 (425125, 'Categoria:!Predefinições sobre a Nova Zelândia')]

### Backup do que já foi feito download

In [59]:
# Categories: dump the whole collection to a CSV file
categories = [category_doc for category_doc in mongo_categories.find()]
categories_df = pd.DataFrame(categories)
categories_df.to_csv(path_or_buf='categories.csv')

In [60]:
# Pages: dump the whole collection to a CSV file
pages = [page_doc for page_doc in mongo_pages.find()]
pages_df = pd.DataFrame(pages)
pages_df.to_csv(path_or_buf='pages.csv')

In [61]:
# (rows, columns) of the pages backup frame
pages_df.shape

(498, 8)

### Verificar categorias e páginas sem country

In [37]:
# Matches documents that have no "country" field at all
missing_country = {"country": {"$exists": False }}

# Pages
pages_no_country = [page_doc for page_doc in db.pages.find(missing_country)]
print(f'{len(pages_no_country)} pages found with no country')

# Categories
categories_no_country = [category_doc for category_doc in db.categories.find(missing_country)]
print(f'{len(categories_no_country)} categories found with no country')

0 pages found with no country
0 categories found with no country
