In [1]:
import pandas as pd
import numpy as np
import csv
import json

from numpy.testing import assert_equal, assert_array_equal
from xml.etree import ElementTree as ET

In [2]:
%load_ext pycodestyle_magic
%pycodestyle_on

## Problem 1
Create a function `list_recipes` that returns the list of ingredients in `recipe_xml` in order of appearance.

In [66]:
def list_recipes(recipe_xml):
    """Collect the text of each top-level ``ingredient`` element.

    Parameters
    ----------
    recipe_xml : str
        XML document whose root element directly contains the
        ``ingredient`` elements.

    Returns
    -------
    list
        Text of every ``ingredient`` child of the root, in document order.
    """
    root = ET.fromstring(recipe_xml)
    names = []
    # A bare tag name matches direct children of the root only.
    for node in root.findall('ingredient'):
        names.append(node.text)
    return names

In [4]:
# sample xml from 
# https://www.arangodb.com/docs/stable/drivers/java-examples-xml-data.html
recipe_xml = """
<recipe _id="RawDocument/6834407522" _key="6834407522" _rev="6834407522"
         cook_time="3 hours" name="bread" prep_time="5 mins">
  <title>Basic bread</title>
  <ingredient amount="8" unit="dL">Flour</ingredient>
  <ingredient amount="10" unit="grams">Yeast</ingredient>
  <ingredient amount="4" state="warm" unit="dL">Water</ingredient>
  <ingredient amount="1" unit="teaspoon">Salt</ingredient>
  <instructions>
    <step>Mix all ingredients together.</step>
    <step>Knead thoroughly.</step>
    <step>Cover with a cloth, and leave for one hour in warm room.</step>
    <step>Knead again.</step>
    <step>Place in a bread baking tin.</step>
    <step>Cover with a cloth, and leave for one hour in warm room.</step>
    <step>Bake in the oven at 180(degrees)C for 30 minutes.</step>
  </instructions>
</recipe>
"""
assert_equal(list_recipes(recipe_xml), ['Flour', 'Yeast', 'Water', 'Salt'])

1:18: W291 trailing whitespace


## Problem 2
Create a function `catalog_sizes` that returns the unique catalog sizes in `catalog_xml`.

In [64]:
def catalog_sizes(catalog_xml):
    """Collect the distinct ``description`` values of all ``size`` elements.

    Parameters
    ----------
    catalog_xml : str
        XML document to search.

    Returns
    -------
    list
        Unique ``description`` attribute values of the ``size`` elements
        found anywhere below the root, in no particular order.
    """
    root = ET.fromstring(catalog_xml)
    seen = set()
    # './/size' searches every level below the root.
    for node in root.findall('.//size'):
        seen.add(node.get('description'))
    return list(seen)

In [65]:
# sample xml from 
# https://www.service-architecture.com/articles/object-oriented-databases/xml_file_for_complex_data.html
catalog_xml = """<?xml version="1.0"?>
<?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
<!DOCTYPE catalog SYSTEM "catalog.dtd">
<catalog>
   <product description="Cardigan Sweater" product_image="cardigan.jpg">
      <catalog_item gender="Men's">
         <item_number>QWZ5671</item_number>
         <price>39.95</price>
         <size description="Medium">
            <color_swatch image="red_cardigan.jpg">Red</color_swatch>
            <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
         </size>
         <size description="Large">
            <color_swatch image="red_cardigan.jpg">Red</color_swatch>
            <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
         </size>
      </catalog_item>
      <catalog_item gender="Women's">
         <item_number>RRX9856</item_number>
         <price>42.50</price>
         <size description="Small">
            <color_swatch image="red_cardigan.jpg">Red</color_swatch>
            <color_swatch image="navy_cardigan.jpg">Navy</color_swatch>
            <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
         </size>
         <size description="Medium">
            <color_swatch image="red_cardigan.jpg">Red</color_swatch>
            <color_swatch image="navy_cardigan.jpg">Navy</color_swatch>
            <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
            <color_swatch image="black_cardigan.jpg">Black</color_swatch>
         </size>
         <size description="Large">
            <color_swatch image="navy_cardigan.jpg">Navy</color_swatch>
            <color_swatch image="black_cardigan.jpg">Black</color_swatch>
         </size>
         <size description="Extra Large">
            <color_swatch image="burgundy_cardigan.jpg">Burgundy</color_swatch>
            <color_swatch image="black_cardigan.jpg">Black</color_swatch>
         </size>
      </catalog_item>
   </product>
</catalog>
"""
assert_equal(sorted(catalog_sizes(catalog_xml)), 
             ['Extra Large', 'Large', 'Medium', 'Small'])

1:18: W291 trailing whitespace
46:49: W291 trailing whitespace


## Problem 3
Create a function `find_fantasy` that returns the list of book titles having `Fantasy` as `genre` in `books_xml`, in order of appearance.

In [35]:
def find_fantasy(books_xml):
    """Return the titles of the Fantasy books in books_xml.

    Parameters
    ----------
    books_xml : str
        XML document whose root element directly contains ``book``
        elements with ``title`` and ``genre`` children.

    Returns
    -------
    list
        Titles of the books whose ``genre`` text is exactly ``Fantasy``,
        in order of appearance. Books without a ``genre`` or ``title``
        element are skipped.
    """
    root = ET.fromstring(books_xml)
    titles = []
    for book in root.findall('./book'):
        # findtext returns None for a missing element, so a book without
        # a genre is skipped instead of raising AttributeError.
        if book.findtext('genre') != 'Fantasy':
            continue
        title = book.find('title')
        if title is not None:
            titles.append(title.text)
    return titles

In [36]:
# sample xml from
# https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms762271(v=vs.85)
books_xml = '''<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications 
      with XML.</description>
   </book>
   <book id="bk102">
      <author>Ralls, Kim</author>
      <title>Midnight Rain</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2000-12-16</publish_date>
      <description>A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.</description>
   </book>
   <book id="bk103">
      <author>Corets, Eva</author>
      <title>Maeve Ascendant</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2000-11-17</publish_date>
      <description>After the collapse of a nanotechnology 
      society in England, the young survivors lay the 
      foundation for a new society.</description>
   </book>
   <book id="bk104">
      <author>Corets, Eva</author>
      <title>Oberon's Legacy</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2001-03-10</publish_date>
      <description>In post-apocalypse England, the mysterious 
      agent known only as Oberon helps to create a new life 
      for the inhabitants of London. Sequel to Maeve 
      Ascendant.</description>
   </book>
   <book id="bk105">
      <author>Corets, Eva</author>
      <title>The Sundered Grail</title>
      <genre>Fantasy</genre>
      <price>5.95</price>
      <publish_date>2001-09-10</publish_date>
      <description>The two daughters of Maeve, half-sisters, 
      battle one another for control of England. Sequel to 
      Oberon's Legacy.</description>
   </book>
   <book id="bk106">
      <author>Randall, Cynthia</author>
      <title>Lover Birds</title>
      <genre>Romance</genre>
      <price>4.95</price>
      <publish_date>2000-09-02</publish_date>
      <description>When Carla meets Paul at an ornithology 
      conference, tempers fly as feathers get ruffled.</description>
   </book>
   <book id="bk107">
      <author>Thurman, Paula</author>
      <title>Splish Splash</title>
      <genre>Romance</genre>
      <price>4.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>A deep sea diver finds true love twenty 
      thousand leagues beneath the sea.</description>
   </book>
   <book id="bk108">
      <author>Knorr, Stefan</author>
      <title>Creepy Crawlies</title>
      <genre>Horror</genre>
      <price>4.95</price>
      <publish_date>2000-12-06</publish_date>
      <description>An anthology of horror stories about roaches,
      centipedes, scorpions  and other insects.</description>
   </book>
   <book id="bk109">
      <author>Kress, Peter</author>
      <title>Paradox Lost</title>
      <genre>Science Fiction</genre>
      <price>6.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>After an inadvertant trip through a Heisenberg
      Uncertainty Device, James Salway discovers the problems 
      of being quantum.</description>
   </book>
   <book id="bk110">
      <author>O'Brien, Tim</author>
      <title>Microsoft .NET: The Programming Bible</title>
      <genre>Computer</genre>
      <price>36.95</price>
      <publish_date>2000-12-09</publish_date>
      <description>Microsoft's .NET initiative is explored in 
      detail in this deep programmer's reference.</description>
   </book>
   <book id="bk111">
      <author>O'Brien, Tim</author>
      <title>MSXML3: A Comprehensive Guide</title>
      <genre>Computer</genre>
      <price>36.95</price>
      <publish_date>2000-12-01</publish_date>
      <description>The Microsoft MSXML3 parser is covered in 
      detail, with attention to XML DOM interfaces, XSLT processing, 
      SAX and more.</description>
   </book>
   <book id="bk112">
      <author>Galos, Mike</author>
      <title>Visual Studio 7: A Comprehensive Guide</title>
      <genre>Computer</genre>
      <price>49.95</price>
      <publish_date>2001-04-16</publish_date>
      <description>Microsoft Visual Studio 7 is explored in depth,
      looking at how Visual Basic, Visual C++, C#, and ASP+ are 
      integrated into a comprehensive development 
      environment.</description>
   </book>
</catalog>'''
assert_equal(
    find_fantasy(books_xml), 
    ['Midnight Rain', 'Maeve Ascendant', 
     "Oberon's Legacy", 'The Sundered Grail'])

11:61: W291 trailing whitespace
20:65: W291 trailing whitespace
21:63: W291 trailing whitespace
30:58: W291 trailing whitespace
31:54: W291 trailing whitespace
40:62: W291 trailing whitespace
41:60: W291 trailing whitespace
42:53: W291 trailing whitespace
51:61: W291 trailing whitespace
52:59: W291 trailing whitespace
61:59: W291 trailing whitespace
70:59: W291 trailing whitespace
89:62: W291 trailing whitespace
98:62: W291 trailing whitespace
107:61: W291 trailing whitespace
108:69: W291 trailing whitespace
118:64: W291 trailing whitespace
119:50: W291 trailing whitespace
124:29: W291 trailing whitespace
125:41: W291 trailing whitespace


## Problem 4
Create a function `dct2str` that accepts `dct` and returns it as a JSON string. Create a function `dct2file` that accepts `dct` and saves it into a JSON file `dct.json`.

In [48]:
def dct2str(dct):
    """Serialize a dictionary to JSON text.

    Parameters
    ----------
    dct : dict
        Dictionary to serialize.

    Returns
    -------
    str
        JSON representation produced by ``json.dumps`` with its default
        settings.
    """
    serialized = json.dumps(dct)
    return serialized


def dct2file(dct):
    """Write a dictionary to the file ``dct.json`` as JSON.

    The file is created in the current working directory and overwritten
    if it already exists.

    Parameters
    ----------
    dct : dict
        Dictionary to serialize.
    """
    with open('dct.json', mode='w') as handle:
        json.dump(dct, fp=handle)

In [11]:
dct = {
    'j': 1,
    's': '''Multi-line
String''',
    'o': {
        'a': 1.2,
        'b': [1, 2, 3]
    },
    'n': None
}

assert_equal(dct2str(dct),
             '{"j": 1, "s": "Multi-line\\nString", '
             '"o": {"a": 1.2, "b": [1, 2, 3]}, "n": null}')

!rm -f dct.json
dct2file(dct)
# Read the file back inside a context manager so the handle is closed
# once the comparison is done.
with open('dct.json') as fp:
    assert_equal(json.load(fp), dct)

16:6: E225 missing whitespace around operator


## Problem 5
Create a function `count_journals` that reads `/mnt/data/public/covid19-lake/alleninstitute/CORD19/json/metadata/part-00000-81803174-7752-4489-8eeb-081318af9653-c000.json` and returns the list of journals along with their count sorted by decreasing count then by ascending lexicographic order of the journal name.

In [67]:
def count_journals():
    """Read the CORD19 metadata file and count the entries per journal.

    Returns
    -------
    list
        List of ``(journal, count)`` tuples sorted by decreasing count,
        then by ascending lexicographic order of the journal name.
        Missing journal values are not counted.
    """
    metadata = pd.read_json(
        '/mnt/data/public/covid19-lake/alleninstitute/CORD19/'
        'json/metadata/part-00000-81803174-7752-4489-8eeb-081'
        '318af9653-c000.json', lines=True)
    counts = metadata['journal'].value_counts()
    # Iterate the counts Series directly: the column names produced by
    # reset_index() differ between pandas versions.
    pairs = [(name, int(count)) for name, count in counts.items()]
    # value_counts only orders by count; break ties by journal name.
    pairs.sort(key=lambda pair: (-pair[1], str(pair[0])))
    return pairs

In [13]:
journal_count = count_journals()
assert_equal(len(journal_count), 4886)
assert_equal(
    journal_count[:10],
    [('', 4067),
     ('Journal of Virology', 1740),
     ('PLoS One', 1567),
     ('Virology', 865),
     ('Emerg Infect Dis', 749),
     ('The Lancet', 597),
     ('Viruses', 569),
     ('Arch Virol', 504),
     ('Virus Research', 495),
     ('Sci Rep', 491)]
)

## Problem 6
Create a function `business_labels` that reads `/mnt/data/public/yelp/challenge12/yelp_dataset/yelp_academic_dataset_photo.json` and returns a `pandas` `Series` where the index is the `business_id` and the values are the set of `label`s posted for that business.

In [68]:
def business_labels():
    """Map each business to the set of photo labels posted for it.

    Returns
    -------
    pandas.Series
        Series where the index is the business_id and the values are
        the sets of labels posted for that business.
    """
    photos = pd.read_json(
        '/mnt/data/public/yelp/challenge12/yelp_dataset/'
        'yelp_academic_dataset_photo.json',
        lines=True,
    )
    labels_by_business = photos.groupby('business_id')['label']
    return labels_by_business.apply(set)

In [15]:
bl = business_labels()
assert isinstance(bl, pd.Series)
assert_equal(len(bl), 32976)
assert_array_equal(
    bl.index[:10],
    ['--1UhMGODdWsrMastO9DZw', '--6MefnULPED_I942VcFNA',
     '--9e1ONYQuAa-CB_Rrw7Tw', '--DaPTJW3-tB1vP-PfdTEg',
     '--FBCX-N37CMYDfs790Bnw', '--KCl2FvVQpvjzmZSPyviA',
     '--Ni3oJ4VOqfOEu7Sj2Vzg', '--S62v0QgkqQaVUhFnNHrw',
     '--SrzpvFLwP_YFwB_Cetow', '--cZ6Hhc9F7VkKXxHMVZSQ']
)
assert_equal(
    bl[:10].tolist(),
    [{'inside'},
     {'food'},
     {'drink', 'food', 'inside', 'outside'},
     {'outside'},
     {'food', 'outside'},
     {'food'},
     {'food'},
     {'drink', 'food'},
     {'food', 'inside'},
     {'drink', 'food', 'inside', 'outside'}]
)

## Problem 7
Create a function `get_businesses` that reads the first 10000 lines of `/mnt/data/public/yelp/challenge12/yelp_dataset/yelp_academic_dataset_business.json` and returns a `pandas` `DataFrame`. The columns of the data frame correspond to the keys in all of the lines, with nested keys separated by a dot (`.`). The index of the data frame corresponds to the `business_id`.

In [69]:
def get_businesses():
    """Load the first 10000 Yelp businesses into a flattened data frame.

    Returns
    -------
    pandas.DataFrame
        Data frame indexed by business_id. Nested keys are flattened
        into columns whose names join the key path with a dot (.).
    """
    raw = pd.read_json(
        '/mnt/data/public/yelp/challenge12/yelp_dataset/'
        'yelp_academic_dataset_business.json',
        lines=True,
        nrows=10000,
    )
    # Round-trip through JSON text to get a list of plain nested dicts,
    # which json_normalize then flattens.
    records = json.loads(raw.to_json(orient='records'))
    flattened = pd.json_normalize(records)
    return flattened.set_index('business_id')

In [17]:
df_businesses = get_businesses()
df_businesses = df_businesses[df_businesses.columns.sort_values()]
assert_equal(df_businesses.shape, (10000, 60))
assert_array_equal(
    df_businesses.columns[:10],
    ['address', 'attributes', 'attributes.AcceptsInsurance',
     'attributes.AgesAllowed', 'attributes.Alcohol', 'attributes.Ambience',
     'attributes.BYOB', 'attributes.BYOBCorkage', 'attributes.BestNights',
     'attributes.BikeParking']
)
assert_array_equal(
    df_businesses.index[:10],
    ['Apn5Q_b6Nz61Tq4XzPdf9A', 'AjEbIBw6ZFfln7ePHha9PA',
     'O8S5hYJ1SMc8fA4QBtVujA', 'bFzdJJ3wp3PZssNEsyU23g',
     '8USyCYqpScwiNEb58Bt6CA', '45bWSZtniwPRiqlivpS8Og',
     '9A2quhZLyWk0akUetBd8hQ', '6OuOZAok8ikONMS_T3EzXg',
     '8-NRKkPY1UiFXW20WXKiXg', 'UTm5QZThPQlT35mkAcGOjg']
)
assert_equal(
    df_businesses.iloc[:10,:10].to_numpy().tolist(),
    [['1314 44 Avenue NE', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, 'False'],
     ['', np.nan, np.nan, np.nan, 'none', np.nan, np.nan, np.nan, np.nan, 
      'False'],
     ['1335 rue Beaubien E', np.nan, np.nan, np.nan, 'beer_and_wine',
      "{'romantic': False, 'intimate': False, 'classy': False, "
      "'hipster': False, 'touristy': False, 'trendy': False, "
      "'upscale': False, 'casual': False}", np.nan, np.nan, np.nan, 'True'],
     ['211 W Monroe St', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan],
     ['2005 Alyth Place SE', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan],
     ['20235 N Cave Creek Rd, Ste 1115', np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan, np.nan, 'True'],
     ['631 Bloor St W', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan],
     ['3417 Derry Road E, Unit 103', np.nan, np.nan, np.nan, 'none', np.nan, 
      np.nan, np.nan, np.nan, np.nan],
     ['1440 N. Dysart Ave', np.nan, np.nan, np.nan, 'none',
      "{'romantic': False, 'intimate': False, 'classy': False, "
      "'hipster': False, 'divey': False, 'touristy': False, "
      "'trendy': False, 'upscale': False, 'casual': True}",
      np.nan, np.nan, np.nan, 'True'],
     ['209 Oakland Ave', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, 'True']] 
)
assert_equal(
    df_businesses.iloc[-10:,:10].to_numpy().tolist(),
    [['3302 E Main St', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan],
     ['2714 E University Dr', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, 'True'],
     ['', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan],
     ['340 Eagle Street, Unit 3', np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan, np.nan],
     ['86 Gerrard Street E', np.nan, np.nan, np.nan, 'none', np.nan, np.nan, 
      np.nan, np.nan, 'True'],
     ['15215 N Kierland Blvd, Ste 165B1A', np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, np.nan, np.nan, np.nan],
     ['7950 S Rainbow Blvd', np.nan, np.nan, np.nan, 'full_bar',
      "{'romantic': False, 'intimate': False, 'classy': False, "
      "'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, "
      "'upscale': False, 'casual': False}", np.nan, np.nan, np.nan, 'False'],
     ['375 Queen Street W', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, 'True'],
     ['1409 Potomac Ave', np.nan, np.nan, np.nan, 'none',
      "{'romantic': False, 'intimate': False, 'classy': False, "
      "'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, "
      "'upscale': False, 'casual': True}",
      np.nan, np.nan, np.nan, 'True'],
     ['1353 E McKellips Rd', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 
      np.nan, np.nan, 'True']]
)

20:27: E231 missing whitespace after ','
21:75: W291 trailing whitespace
23:74: W291 trailing whitespace
29:73: W291 trailing whitespace
31:77: W291 trailing whitespace
33:73: W291 trailing whitespace
35:72: W291 trailing whitespace
37:77: W291 trailing whitespace
44:73: W291 trailing whitespace
45:31: W291 trailing whitespace
48:28: E231 missing whitespace after ','
49:72: W291 trailing whitespace
51:78: W291 trailing whitespace
53:74: W291 trailing whitespace
55:74: W291 trailing whitespace
57:77: W291 trailing whitespace
59:75: W291 trailing whitespace
65:76: W291 trailing whitespace
72:77: W291 trailing whitespace


## Problem 8
Create a function `pop_ncr` that reads `NCR.xlsx` and returns a `pandas` `DataFrame` with two columns, `City, Municipality, and Barangay` and `Total Population`. Empty cells should be excluded from the DataFrame.

In [55]:
def pop_ncr():
    """Read NCR.xlsx and return its place names with their populations.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``City, Municipality, and Barangay`` and
        ``Total Population``. Rows with an empty cell in either column
        are excluded.
    """
    labels = ['City, Municipality, and Barangay', 'Total Population']
    # Skip the first five rows and keep only the second and third
    # spreadsheet columns.
    sheet = pd.read_excel('NCR.xlsx', skiprows=5, usecols=[1, 2])
    populated = sheet.dropna()
    populated.columns = labels
    return populated

In [19]:
df_ncr = pop_ncr()
assert_equal(df_ncr.shape, (1738, 2))
assert_equal(df_ncr.iloc[0].values.tolist(), 
             ['NATIONAL CAPITAL REGION', 12877253.0])
assert_equal(df_ncr.iloc[100].values.tolist(), 
             ['Barangay 104', 5334.0])
assert_equal(df_ncr.iloc[1000].values.tolist(), 
             ['Bagumbayan', 13832.0])
assert_equal(df_ncr.iloc[-1].values.tolist(), 
             ['South Signal Village ', 39214.0])


3:45: W291 trailing whitespace
5:47: W291 trailing whitespace
7:48: W291 trailing whitespace
9:46: W291 trailing whitespace
11:1: W391 blank line at end of file


## Problem 9
Create a function `dump_airbnb_beds` that reads `/mnt/data/public/insideairbnb/data.insideairbnb.com/united-kingdom/england/london/2015-04-06/data/listings.csv.gz` then creates the Excel file `airbnb.xlsx`. Each sheet of the file corresponds to the `bed_type` and contains the `host_location` and `price` only.

In [70]:
def dump_airbnb_beds():
    """Create airbnb.xlsx with one sheet per bed type.

    Reads the London listings file and writes ``airbnb.xlsx`` to the
    current working directory. Each sheet is named after a ``bed_type``
    and holds the ``host_location`` and ``price`` of the listings with
    that bed type, without the index column. Sheets are written in
    sorted ``bed_type`` order; listings with a missing ``bed_type`` are
    left out.
    """
    listings = pd.read_csv(
        '/mnt/data/public/insideairbnb/data.insideairbnb.com/'
        'united-kingdom/england/london/2015-04-06/data/'
        'listings.csv.gz',
        compression='gzip',
        usecols=['bed_type', 'host_location', 'price'])
    with pd.ExcelWriter('airbnb.xlsx') as writer:
        # Group by bed type so each sheet receives its own rows; groupby
        # sorts the keys and drops missing ones by default.
        for bed_type, rows in listings.groupby('bed_type'):
            (rows[['host_location', 'price']]
             .to_excel(writer, index=False, sheet_name=bed_type))

In [58]:
!rm -f airbnb.xlsx
dump_airbnb_beds()
airbnb_sheets = pd.read_excel('airbnb.xlsx', None)
assert_equal(list(airbnb_sheets.keys()), 
             ['Airbed', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed'])
for df in airbnb_sheets.values():
    assert_equal(df.columns.tolist(), ['host_location', 'price'])

## Problem 10
Create a function `age_counts` that will read `/mnt/data/public/census/_PHILIPPINES_Statistical Tables.xls` and return a `pandas` `DataFrame` whose index is the single-year age and whose columns are the `Total` and `Household` population counts for `Both Sexes`, `Male`, and `Female`.

In [59]:
# import os

In [24]:
# os.getcwd()

In [25]:
###!cp '/mnt/data/public/census/_PHILIPPINES_Statistical Tables.xls' /home/msds2022/jgacal/dmw2022/fileformats-exercises

1:1: E265 block comment should start with '# '
1:80: E501 line too long (120 > 79 characters)


In [71]:
def age_counts():
    """Combine total and household population counts by single-year age.

    Reads sheets ``T2`` and ``T3`` of the census workbook and joins them
    on the age column.

    Returns
    -------
    pandas.DataFrame
        Data frame indexed by single-year age (``SYA``) with the columns
        Both Sexes, Male, and Female for the Total population followed
        by the same three for the Household population.
    """
    workbook = ('/mnt/data/public/census/_PHILIPPINES_Statistical '
                'Tables.xls')
    total = pd.read_excel(
        workbook,
        sheet_name='T2',
        skiprows=6,
        nrows=81,
        usecols='A:B, D:E',
        names=['SYA', 'Both Sexes (Total)',
               'Male (Total)', 'Female (Total)'],
    ).dropna()
    household = pd.read_excel(
        workbook,
        sheet_name='T3',
        skiprows=6,
        nrows=81,
        usecols='A:D',
        names=['SYA', 'Both Sexes (Household)',
               'Male (Household)', 'Female (Household)'],
    ).dropna()
    # Left join: every age row from the total sheet is kept.
    combined = total.merge(household, how='left', on='SYA')
    return combined.set_index('SYA')

In [63]:
df_age_counts = age_counts()
assert_equal(df_age_counts.shape, (81, 6))
assert_array_equal(
    df_age_counts.columns,
    ['Both Sexes (Total)', 'Male (Total)', 'Female (Total)',
     'Both Sexes (Household)', 'Male (Household)', 'Female (Household)'])
assert_equal(
    df_age_counts.index[:10].tolist(),
    ['Under  1', 1, 2, 3, 4, 5, 6, 7, 8, 9]
)
assert_equal(
    df_age_counts.iloc[:10].to_numpy().tolist(),
    [[2076015.0, 1073401.0, 1002614.0, 2075441.0, 1073092.0, 1002349.0],
     [2090987.0, 1079503.0, 1011484.0, 2090349.0, 1079136.0, 1011213.0],
     [2191622.0, 1131865.0, 1059757.0, 2191020.0, 1131539.0, 1059481.0],
     [2235349.0, 1155396.0, 1079953.0, 2234812.0, 1155121.0, 1079691.0],
     [2224958.0, 1150320.0, 1074638.0, 2224376.0, 1150008.0, 1074368.0],
     [2181326.0, 1127218.0, 1054108.0, 2180700.0, 1126910.0, 1053790.0],
     [2173491.0, 1120396.0, 1053095.0, 2172796.0, 1120019.0, 1052777.0],
     [2237482.0, 1156157.0, 1081325.0, 2236650.0, 1155729.0, 1080921.0],
     [2140435.0, 1104671.0, 1035764.0, 2139569.0, 1104226.0, 1035343.0],
     [2110186.0, 1088395.0, 1021791.0, 2109160.0, 1087838.0, 1021322.0]]
)