### Pipeline 1 - download

In [5]:
import pandas as pd

In [1]:
import os
# Move up to the project root so that the `lib` package and `data/` directory
# used by the following cells resolve from the working directory.
# The guard keeps this cell idempotent: a bare os.chdir('../') climbs one more
# directory every time the cell is re-run.
if not os.path.isdir('lib'):
    os.chdir('../')

In [2]:
import yaml
import lib.database_module as dm
import lib.encoding_module as enc
import lib.wiki_module as wiki

### Acquire: Get categories to query from a YAML file

In [3]:
# Read the categories to query from the project's YAML config file.
# safe_load only builds plain Python objects (dicts, lists, strings, ...);
# yaml.load without an explicit Loader can construct arbitrary objects and is
# rejected by recent PyYAML releases.
with open('data/categories.yml') as f:
    categories = yaml.safe_load(f)

In [4]:
# Display the mapping parsed from data/categories.yml.
categories

{'categories': ['desserts', 'sports cars', 'health care']}

### Acquire: Query Wikipedia by category for pages

In [14]:
# The category names to query, stored under the 'categories' key.
categories['categories']

['desserts', 'sports cars', 'health care']

In [15]:
# Query Wikipedia once per configured category name and keep every response,
# in the same order as the names appear in the config.
responses = [
    wiki.query_category(category_name)
    for category_name in categories['categories']
]

In [16]:
# Display the raw responses collected from wiki.query_category (one per category).
responses

[{'categoryid': '691015',
  'pages': [{'pageid': 44760, 'title': 'Confectionery'},
   {'pageid': 7976, 'title': 'Dessert'},
   {'pageid': 41535911, 'title': 'List of desserts'},
   {'pageid': 4513949, 'title': 'Afikoman'},
   {'pageid': 45618991, 'title': 'Aish as-Saraya'},
   {'pageid': 40931081, 'title': 'Alcohol-infused whipped cream'},
   {'pageid': 704765, 'title': 'Alfajor'},
   {'pageid': 50347020, 'title': 'Apfelkuchle'},
   {'pageid': 1510907, 'title': 'Apple crisp'},
   {'pageid': 15139867, 'title': 'Qatayef'},
   {'pageid': 231451, 'title': 'Artificial cream'},
   {'pageid': 11692282, 'title': 'Asham (dessert)'},
   {'pageid': 18707192, 'title': 'Baklava'},
   {'pageid': 45402203, 'title': 'Banana custard'},
   {'pageid': 37569562, 'title': 'Banh com'},
   {'pageid': 37569808, 'title': 'Banh mat'},
   {'pageid': 42277357, 'title': 'Bint al-sahn'},
   {'pageid': 5286585, 'title': 'Bizcocho'},
   {'pageid': 19736180, 'title': 'Blachinda'},
   {'pageid': 26349802, 'title': 'Bol

### Acquire: Write category info to database

In [87]:
# Open a database connection and cursor through the project helper.
# NOTE(review): none of the cells shown here use conn/cursor directly, and each
# dm.create_or_update_* call prints its own "Connected to server" message, so
# those helpers presumably open their own connections — confirm whether this
# cell is still needed.
conn, cursor = dm.connect_to_postgres()

Connected to server joshuacook.me.


In [17]:
# Pull the Wikipedia category id out of each category response, keeping order.
cat_ids = [category['categoryid'] for category in responses]

In [18]:
# Display the collected category ids.
cat_ids

['691015', '695196', '35880954']

In [25]:
# Pair each category id with its category name as (id, name) tuples.
# On Python 3, zip() returns a one-shot iterator: it does not display as a
# list and is exhausted after a single loop, so re-running the cell that
# iterates over `zipped` would silently do nothing. Materialize it as a list
# so it can be inspected and iterated repeatedly.
zipped = list(zip(cat_ids, categories['categories']))
zipped

[('691015', 'desserts'),
 ('695196', 'sports cars'),
 ('35880954', 'health care')]

In [26]:
# Write every (category id, category name) pair to the database, echoing each
# pair as it is processed.
for cat_id, cat_name in zipped:
    print((cat_id, cat_name))
    dm.create_or_update_category_in_database(cat_id, cat_name)

('691015', 'desserts')
Connected to server joshuacook.me.
('695196', 'sports cars')
Connected to server joshuacook.me.
('35880954', 'health care')
Connected to server joshuacook.me.


In [27]:
# pd.DataFrame(responses[0]['pages'])

In [88]:
# Remember to release the database handles once the query/database work is
# done: close the cursor first, then the connection.
cursor.close()
conn.close()

### Acquire: Query Wikipedia by page ID for content, then write page content to database

In [85]:
# For every category response, fetch the content of each listed page from
# Wikipedia and hand it to dm.create_or_update_page_in_database together with
# the page's id, title and owning category id.
for category in responses:
    category_id = category['categoryid']

    for page in category['pages']:
        page_id, page_title = page['pageid'], page['title']

        # wiki.query_page is given the page id as a string; only the 'text'
        # field of its response is stored.
        page_text = wiki.query_page(str(page_id))['text']

        dm.create_or_update_page_in_database(
            page_id,
            category_id,
            page_title,
            page_text,
            location='remote',
        )

Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server 