# Wikipedia

In [1]:
from urllib.parse import urlencode
import requests
from pymongo import MongoClient
import time
import pandas as pd

## MongoDB Setup

In [2]:
# Connect to MongoDB using the default server and port.
client = MongoClient()

# Database that holds everything downloaded from Wikipedia.
db = client['wikipedia']

# Collection handles: pages, categories and downloaded page content.
mongo_pages = db['pages']
mongo_categories = db['categories']
mongo_content = db['pages_content']

## Download categories and pages from Wikipedia

In [3]:
# Example request:
# https://pt.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3AArgentina&cmlimit=200
url_base = 'https://pt.wikipedia.org/w/api.php'

# Query parameters shared by every "categorymembers" request; the category
# selector (cmtitle or cmpageid) is added per request.
f = dict(
    action='query',
    format='json',
    list='categorymembers',
    cmlimit='200',
    cmprop='ids|title|type',
)
def parameters_category_members(cmtitle = None, cmpageid = -1):
    """Build the query parameters for a "categorymembers" request.

    The category is selected by ``cmpageid`` when one is given (any value
    other than -1); otherwise it is selected by ``cmtitle``.

    Returns a new dict, so the shared ``f`` template is never modified.
    """
    if cmpageid != -1:
        selector = {'cmpageid': cmpageid}
    else:
        selector = {'cmtitle': cmtitle}
    return {**f, **selector}

def get_url(parameters):
    """Return the API URL for an already URL-encoded query string.

    ``parameters`` is appended to ``url_base`` after a ``?`` separator.
    A leading ``?`` in ``parameters`` is tolerated, and an empty query
    string yields ``url_base`` unchanged.
    """
    # Without the separator the query would be glued onto the path
    # ("...api.phpaction=query..."), which is not a usable URL.
    query = str(parameters).lstrip('?')
    return f'{url_base}?{query}' if query else url_base

def get_url_category(cmtitle):
    """Return the URL produced by ``get_url`` for the members query of ``cmtitle``.

    ``cmtitle`` is the category title, e.g. 'Category:Argentina'.
    """
    # Use the requested title rather than a fixed category.
    parameters = parameters_category_members(cmtitle)
    return get_url(urlencode(parameters))

def get_category(cmtitle = None, cmpageid = -1):
    """Request the members of one category from the API at ``url_base``.

    The category is selected by ``cmpageid`` when given, else by ``cmtitle``
    (see ``parameters_category_members``).

    Returns the decoded JSON response when the server answers with HTTP 200.
    Any other status prints an error message and returns an empty dict, the
    same convention used by ``get_wikitext`` and ``get_text``.
    """
    params_category = parameters_category_members(cmtitle, cmpageid)
    category_r = requests.get(url_base, params=params_category)
    if category_r.status_code != 200:
        print(f'Erro ao fazer download de categoria {cmtitle} [{cmpageid}]')
        # Empty dict (not a list) so the failure value has the same type
        # as a successful response.
        return {}
    return category_r.json()

def get_category_members(cmtitle = None, cmpageid = -1):
    """Return the ``query.categorymembers`` list of a category response.

    The category is selected by ``cmpageid`` when given, else by ``cmtitle``.

    Raises:
        RuntimeError: when the request failed or the response does not
            contain ``query.categorymembers`` (for example an API error
            payload). The original lookup error is chained as the cause.

    NOTE(review): only the members contained in the single response are
    returned; the request template uses cmlimit=200 and no continuation is
    followed here, so larger categories appear to be truncated — confirm.
    """
    response = get_category(cmtitle=cmtitle, cmpageid=cmpageid)
    try:
        return response['query']['categorymembers']
    except (TypeError, KeyError) as exc:
        # Surface a readable message instead of a bare lookup error so the
        # failing category can be identified.
        detail = response.get('error') if isinstance(response, dict) else None
        reason = detail if detail else 'request failed or unexpected response'
        raise RuntimeError(
            f'Could not retrieve members of category {cmtitle!r} [{cmpageid}]: {reason}'
        ) from exc

## Insert and retrieve data from MongoDB

- Categories status
    * Check: waiting for someone to check if it has to be downloaded
    * Waiting: waiting for download
    * Skeep: skip the download of this category
    * Done: downloaded
- Pages status
    * Waiting: waiting for download

In [4]:
def get_categories_for_download():
    """Return, as a list, every stored category whose ``download`` status is 'Waiting'."""
    cursor = mongo_categories.find({ 'download': 'Waiting' })
    return [category for category in cursor]

def add_categories_for_check(categories_for_check):
    """Store the given categories with the 'Check' status.

    'Check' marks categories that still wait for someone to decide whether
    they have to be downloaded.
    """
    add_categories_for_status(categories_for_status=categories_for_check, status='Check')

def add_category_for_download(pageid):
    """Flag the stored category with this page id as 'Waiting' (queued for download)."""
    set_category_status(pageid, category_status='Waiting')

def set_category_done(pageid):
    """Flag the stored category with this page id as 'Done' (downloaded)."""
    set_category_status(pageid, category_status='Done')

def set_category_skeep(pageid):
    """Flag the stored category with this page id as 'Skeep' so that it is not downloaded."""
    set_category_status(pageid, 'Skeep')

def set_category_status(pageid, category_status):
    """Set the ``download`` field of one category document.

    pageid: page id used to select the category document.
    category_status: value written to the ``download`` field.
    """
    selector = { "pageid": pageid }
    change = { "$set": { 'download': category_status } }
    mongo_categories.update_one(selector, change)

def add_categories_for_status(categories_for_status, status):
    """Insert the given category dicts with ``download`` set to ``status``.

    The dicts are modified in place before being inserted. Nothing is
    inserted when the collection of categories is empty.
    """
    if len(categories_for_status) == 0:
        return
    for entry in categories_for_status:
        entry['download'] = status
    mongo_categories.insert_many(categories_for_status)
    
def category_exists(cmpageid):
    """Tell whether at least one stored category has the given page id."""
    matches = mongo_categories.count_documents({'pageid': cmpageid})
    return matches > 0

## Algorithm

In [10]:
def insert_pages(pages, cmpageid, cmtitle):
    """Insert page dicts into the pages collection, queued for download.

    Each dict is extended in place with the id and title of the category it
    was found in and with ``download`` set to 'Waiting'. An empty ``pages``
    collection results in no insert.
    """
    if len(pages) == 0:
        return
    extra_fields = {
        'category_id': cmpageid,
        'category_title': cmtitle,
        'download': 'Waiting',
    }
    for entry in pages:
        entry.update(extra_fields)
    mongo_pages.insert_many(pages)

def download_subcategories(subcategories, country=None):
    """Run ``download_category_tree`` for each of the given subcategories.

    subcategories: iterable of category dicts.
    country: country passed on to ``download_category_tree``. When omitted,
        each subcategory's own 'country' entry is used (None if it has none).
    """
    for subcategory in subcategories:
        # download_category_tree requires a country; fall back to the value
        # stored on the subcategory itself when none is given explicitly.
        if country is not None:
            subcategory_country = country
        else:
            subcategory_country = subcategory.get('country')
        download_category_tree(subcategory, subcategory_country)

def download_category_tree(category, country):
    """Download the members of one category and store them.

    Pages are tagged with ``country`` and inserted as waiting for download.
    Subcategories are tagged with their parent id and ``country``; those not
    yet stored are inserted with the 'Check' status rather than being
    downloaded here. Finally the category itself is marked as 'Done'.

    category: dict with at least 'pageid' and 'title'.
    country: value written to the 'country' field of pages and subcategories.
    """
    # Wait one second before each category download.
    time.sleep(1)

    cmpageid, cmtitle = category['pageid'], category["title"]
    print(f'Retrieving pages for category {cmtitle} [{cmpageid}]')

    members = get_category_members(cmpageid=cmpageid)

    # Plain pages of this category.
    pages = list(filter(lambda member: member['type'] == 'page', members))
    for page_member in pages:
        page_member['country'] = country
    insert_pages(pages, cmpageid, cmtitle)

    # Subcategories of this category.
    subcats = list(filter(lambda member: member['type'] == 'subcat', members))
    for subcat in subcats:
        subcat.update(parent=cmpageid, country=country)
    subcats_for_check = list(
        filter(lambda subcat: not category_exists(subcat['pageid']), subcats)
    )
    add_categories_for_check(subcats_for_check)

    set_category_done(cmpageid)

### Inicializa todas as subcategorias da principal *Argentina*

In [5]:
# Fetch the direct members of the root category for Argentina.
members_categories_argentina = get_category_members(cmtitle='Category:Argentina')
# Tag every member with its country, as the cells for the other root
# categories in this notebook do.
for mca in members_categories_argentina:
    mca['country'] = 'Argentina'
pages_argentina = [m for m in members_categories_argentina if m['type'] == 'page']
subcategories_argentina = [m for m in members_categories_argentina if m['type'] == 'subcat']
# add_categories_for_check(subcategories_argentina)

### Inicializa todas as subcategorias da principal Chile

In [8]:
# Fetch the direct members of the root category for Chile.
members_categories_chile = get_category_members(cmtitle='Category:Chile')

# Tag every member with its country.
for mcc in members_categories_chile:
    mcc.update(country='Chile')

# Keep only the subcategories.
subcategories_chile = list(
    filter(lambda m: m['type'] == 'subcat', members_categories_chile)
)
# add_categories_for_check(subcategories_chile)

### Inicializa todas as subcategorias da Nova Zelândia

In [46]:
# Fetch the direct members of the root category for New Zealand.
members_categories_nz = get_category_members(cmtitle='Categoria:Nova_Zelândia')

# Tag every member with its country.
for mcnz in members_categories_nz:
    mcnz.update(country='Nova Zelandia')

# Keep only the subcategories.
subcategories_nz = list(
    filter(lambda m: m['type'] == 'subcat', members_categories_nz)
)
# add_categories_for_check(subcategories_nz)

### Inicializa todas as subcategorias dos Estados Unidos

In [16]:
# Categoria:Estados_Unidos
# Fetch the direct members of the root category for the United States.
members_categories_usa = get_category_members(cmtitle='Categoria:Estados_Unidos')
# Tag every member with its country.
for mcusa in members_categories_usa:
    mcusa['country'] = 'Estados Unidos'
subcategories_usa = [m for m in members_categories_usa if m['type'] == 'subcat']
# Only insert subcategories that are not stored yet, so that re-running this
# cell does not create duplicate category documents.
add_categories_for_check(
    [s for s in subcategories_usa if not category_exists(s['pageid'])]
)

### Marca categorias para download

In [120]:
# Page ids of the categories selected for download.
ids = [
    5568913, 5600226, 2976735, 3229009, 3228959, 5543207, 206176,
    3228965, 1935902, 3266151, 3572256, 3544176, 1715183, 4852814,
    4814502, 2178241, 5580178, 3472896, 4814499, 5577558, 2201392,
]
# The loop variable is not called `id` so the builtin of that name stays
# usable in the rest of the notebook.
for category_pageid in ids:
    add_category_for_download(pageid=category_pageid)

### Executa download das páginas das categorias marcadas para Download

In [121]:
# Fetch the categories flagged for download.
categories_for_download = get_categories_for_download()
print(f'{len(categories_for_download)} categoria(s) marcadas para download')

# Download each category. The country stored on the category itself is used,
# so waiting categories of different countries are not all tagged with the
# same one; 'Estados Unidos' is only the fallback for categories that have
# no country field.
for c in categories_for_download:
    download_category_tree(c, c.get('country', 'Estados Unidos'))

21 categoria(s) marcadas para download
Retrieving pages for category Categoria:Corpos de água do Alasca [5568913]
Retrieving pages for category Categoria:Arquipélagos do Alasca [5600226]
Retrieving pages for category Categoria:Baías do Alasca [2976735]
Retrieving pages for category Categoria:Cabos do Alasca [3229009]
Retrieving pages for category Categoria:Estreitos do Alasca [3228959]
Retrieving pages for category Categoria:Golfos e baías do Alasca [5543207]
Retrieving pages for category Categoria:Ilhas do Alasca [206176]
Retrieving pages for category Categoria:Penínsulas do Alasca [3228965]
Retrieving pages for category Categoria:Rios do Alasca [1935902]
Retrieving pages for category Categoria:Vulcões do Alasca [3266151]
Retrieving pages for category Categoria:California Trail [3572256]
Retrieving pages for category Categoria:Oregon Trail [3544176]
Retrieving pages for category Categoria:Vale da Morte [1715183]
Retrieving pages for category Categoria:Parque Nacional da Sequoia [48528

### Analisar categorias para Check

In [122]:
# Page ids of the categories to flag as 'Skeep'; the list is empty here.
for_skeep = []
for sk in for_skeep:
    set_category_skeep(sk)

# United States categories that still have the 'Check' status.
for_check = [*mongo_categories.find({ 'download': 'Check', 'country': 'Estados Unidos'})]

# usa = [5577536, 3203291, 2403918, 2671885, 4914662, 4400845, 3439705, 4149018,2739967, 4133805, 2739980, 5589663, 4133798]
# (page id, title) pairs of the categories above.
usa_4_check = list(map(lambda fc: (fc['pageid'], fc['title']), for_check))

In [19]:
# NOTE(review): hand-written list of numeric ids, presumably category page
# ids for the United States; it is not referenced by any other visible cell
# — confirm its purpose.
usa = [3003786, 2966383, 2511655, 2541119, 3166610]

### Backup do que já foi feito download

In [131]:
# Categories: dump the whole collection to categories.csv.
categories = [*mongo_categories.find()]
categories_df = pd.DataFrame(data=categories)
categories_df.to_csv('categories.csv')

In [132]:
# Pages: dump the whole collection to pages.csv.
pages = [*mongo_pages.find()]
pages_df = pd.DataFrame(data=pages)
pages_df.to_csv('pages.csv')

In [61]:
# (rows, columns) of the pages backup DataFrame.
pages_df.shape

(498, 8)

### Verificar categorias e páginas sem country

In [37]:
# Pages that have no "country" field.
pages_no_country = [*db.pages.find({"country": {"$exists": False }})]
print(f'{len(pages_no_country)} pages found with no country')

# Categories that have no "country" field.
categories_no_country = [*db.categories.find({"country": {"$exists": False }})]
print(f'{len(categories_no_country)} categories found with no country')

0 pages found with no country
0 categories found with no country


### Corrigir categorias e paginas dos Estados Unidos marcadas como Nova Zelandia

In [26]:
# Categories currently tagged as New Zealand.
nz = [*mongo_categories.find({ 'country': 'Nova Zelandia'})]

# Those whose title mentions the United States are re-tagged one by one.
nz_with_usa = list(filter(lambda c: "Estados Unidos" in c['title'], nz))
for cat_fix in nz_with_usa:
    selector = { '_id': cat_fix['_id']}
    mongo_categories.update_one(selector, { '$set': { 'country': 'Estados Unidos'}})

In [41]:
# Pages currently tagged as New Zealand.
nz = [*mongo_pages.find({ 'country': 'Nova Zelandia'})]

# Those whose own title mentions the United States are re-tagged one by one.
nz_with_usa = list(filter(lambda c: "Estados Unidos" in c['title'], nz))
for pag_fix in nz_with_usa:
    selector = { '_id': pag_fix['_id']}
    mongo_pages.update_one(selector, { '$set': { 'country': 'Estados Unidos'}})

In [129]:
# Pages currently tagged as New Zealand.
nz = [*mongo_pages.find({ 'country': 'Nova Zelandia'})]

# Those whose category title mentions the United States are re-tagged one by one.
nz_with_usa = list(filter(lambda c: "Estados Unidos" in c['category_title'], nz))
for pag_fix in nz_with_usa:
    selector = { '_id': pag_fix['_id']}
    mongo_pages.update_one(selector, { '$set': { 'country': 'Estados Unidos'}})

## Download de páginas

In [57]:
# API endpoint used by the page download helpers below (same value as the
# earlier definition of url_base in this notebook).
url_base = 'https://pt.wikipedia.org/w/api.php'

def get_pages_for_download(limite=100):
    """Return, as a list, up to ``limite`` stored pages whose ``download`` status is 'Waiting'."""
    waiting_pages = mongo_pages.find({ 'download': 'Waiting' }).limit(limite)
    return [waiting_page for waiting_page in waiting_pages]

def get_pages_downloads_counts():
    """Return the number of stored pages per ``download`` status.

    The result is a list of dicts with the status under '_id' and the
    number of pages under 'total'.
    """
    group_by_status = { '$group': { '_id': '$download', 'total': { '$sum': 1 } } }
    return list(mongo_pages.aggregate([group_by_status]))

# Example request:
# https://en.wikipedia.org/w/api.php?action=parse&format=json&pageid=3276454&prop=wikitext&formatversion=2
# Query parameters shared by every "parse" request; pageid and prop are
# added per request.
f_page = dict(
    action='parse',
    format='json',
    formatversion='2',
)

def parameters_wikitext(pageid):
    """Build the "parse" request parameters that ask for the wikitext of ``pageid``.

    Returns a new dict; the shared ``f_page`` template is not modified.
    """
    return {**f_page, 'pageid': pageid, 'prop': 'wikitext'}

def parameters_text(pageid):
    """Build the "parse" request parameters that ask for the text of ``pageid``.

    Returns a new dict; the shared ``f_page`` template is not modified.
    """
    return {**f_page, 'pageid': pageid, 'prop': 'text'}

def get_wikitext(pageid):
    """Request the wikitext of a page from the API at ``url_base``.

    Returns the decoded JSON response on HTTP 200. Any other status prints
    an error message and returns an empty dict.
    """
    response = requests.get(url_base, params=parameters_wikitext(pageid))
    if response.status_code != 200:
        print(f'Erro ao fazer download de wikitext de pagina {pageid}')
        return {}
    return response.json()

def get_text(pageid):
    """Request the text of a page from the API at ``url_base``.

    Returns the decoded JSON response on HTTP 200. Any other status prints
    an error message and returns an empty dict.
    """
    response = requests.get(url_base, params=parameters_text(pageid))
    if response.status_code != 200:
        print(f'Erro ao fazer download de text de pagina {pageid}')
        return {}
    return response.json()

def copy_page_with_wikitext(page):
    """Return a copy of ``page`` with its wikitext stored under 'wikitext'.

    The wikitext is read from ``parse.wikitext`` of the ``get_wikitext``
    response for ``page['pageid']``; ``page`` itself is not modified.
    """
    enriched = page.copy()
    response = get_wikitext(page['pageid'])
    enriched['wikitext'] = response['parse']['wikitext']
    return enriched

# get_pages_for_download()[0]

### Download page wikitexts

In [58]:
# Upper bound on the number of pages fetched in one run of this cell.
max_pages = 1000
pages_for_download = get_pages_for_download(max_pages)
# Progress is reported against the number of pages actually selected, which
# can be lower than the limit.
total = len(pages_for_download)
for current, page in enumerate(pages_for_download, start=1):
    page_wikitext = copy_page_with_wikitext(page)
    # The copy must not carry the _id of the pages document into the
    # content collection.
    del page_wikitext['_id']
    mongo_content.insert_one(page_wikitext)
    # Select by _id, not by pageid: the same pageid can be stored several
    # times (one document per category it was found in), and a pageid filter
    # would keep updating the first match while the others stay 'Waiting'.
    mongo_pages.update_one(
        { '_id': page['_id'] },
        { '$set': { 'download': 'Done' }}
    )
    print(f'Page [{page["pageid"]}]({page["title"]}) downloaded successfuly ({current} of {total})')
    # Pause between requests.
    time.sleep(2)

063037](Deserto de Kaʻū) downloaded successfuly (744 of 1000)
Page [3055210](Deserto de Maine) downloaded successfuly (745 of 1000)
Page [2252364](Jornada del Muerto) downloaded successfuly (746 of 1000)
Page [3055074](Planalto do Colorado) downloaded successfuly (747 of 1000)
Page [25753](Grandes Lagos da América do Norte) downloaded successfuly (748 of 1000)
Page [5679132](Lago Elmer Thomas) downloaded successfuly (749 of 1000)
Page [6347162](Lagos Haynach) downloaded successfuly (750 of 1000)
Page [3293272](Lista de estados dos Estados Unidos por altitude) downloaded successfuly (751 of 1000)
Page [4201012](Lista de picos ultraproeminentes dos Estados Unidos) downloaded successfuly (752 of 1000)
Page [1846884](Planalto Apalache) downloaded successfuly (753 of 1000)
Page [3055074](Planalto do Colorado) downloaded successfuly (754 of 1000)
Page [2126108](Planalto de Edwards) downloaded successfuly (755 of 1000)
Page [1846894](Planalto do Missouri) downloaded successfuly (756 of 1000)


In [59]:
# Number of pages per download status.
pdc = get_pages_downloads_counts()
print(f'Page download counts: {pdc}')

# Number of documents in the content collection.
pages_wikitext_counts = mongo_content.count_documents(filter={})
print(f'Pages with wikitext: {pages_wikitext_counts}')


Page download counts: [{'_id': 'Waiting', 'total': 737}, {'_id': 'Done', 'total': 1877}]
Pages with wikitext: 2360


### Download page texts

In [None]:
# Upper bound on the number of content documents processed in one run.
max_pages = 100
# Select only documents that have no text yet, so the cell can be re-run to
# continue, and read the cursor into a list before updating the collection.
pages_contents = list(
    mongo_content.find({ 'text': { '$exists': False } }).limit(max_pages)
)
total = len(pages_contents)
for current, pc in enumerate(pages_contents, start=1):
    pageid = pc['pageid']
    # get_text returns the whole API response; store only parse.text, the
    # same way the wikitext is taken from parse.wikitext.
    text = get_text(pageid).get('parse', {}).get('text')
    if text is None:
        # Failed request or a response without parse.text: leave the
        # document untouched so it is selected again on the next run.
        print(f'Page text [{pageid}] not available, skipped ({current} of {total})')
        continue
    # The update is issued on the collection (a cursor has no update_many).
    mongo_content.update_many(
        { 'pageid': pageid },
        { '$set': { 'text': text } }
    )
    print(f'Page text [{pc["pageid"]}]({pc["title"]}) downloaded successfuly ({current} of {total})')
    # Pause between requests, as the wikitext download loop does.
    time.sleep(2)
