In [1]:

import json
import sqlite3
import re
import requests
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from numpy.testing import assert_equal

For Problems 1 to 4, refer to the SQLite database at `/mnt/data/public/imdb.db`.

# Problem 1

Create a function `count_keywords` that will return a SQL statement and a connection string such that the resulting table has columns `title` and `keywords`, and the rows are `movies` with their number of unique `keywords`. Rows should be sorted by decreasing number of `keywords` then increasing lexicographic order of `title`.

In [2]:
def count_keywords():
    """Return a SQL statement and a connection string such that the resulting 
    table has columns title and keywords, and the rows are movies with their 
    number of unique keywords
    """
    with sqlite3.connect('/mnt/data/public/imdb.db') as conn:
        query = """
        SELECT m.title, COUNT(DISTINCT(mk.idkeywords)) as keywords
        FROM movies as m
        JOIN movies_keywords as mk
        ON m.idmovies = mk.idmovies
        GROUP BY m.idmovies
        ORDER BY keywords DESC, m.title ASC
        """
    return (query, conn)

In [3]:
# conn = sqlite3.connect('/mnt/data/public/imdb.db')
# cursor = conn.cursor()

# table_list = [a for a in cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
# print(table_list)
# conn.close()

[('movies',), ('movies_genres',), ('genres',), ('series',), ('movies_keywords',), ('keywords',), ('aka_titles',)]


In [4]:
sql, conn = count_keywords()
df_keywords = pd.read_sql(sql, conn)
assert_equal(df_keywords.shape, (61383, 2))
assert_equal(df_keywords.columns.tolist(), ['title', 'keywords'])
assert_equal(
    df_keywords.iloc[:10].to_numpy().tolist(),
    [['The Stand', 435],
     ['Rose Red', 333],
     ['Girls Bravo', 322],
     ['Life', 311],
     ['Kenp denki beruseruku', 247],
     ['Going Postal', 231],
     ['Aika', 229],
     ['Anne of Green Gables', 229],
     ['Ikki tsen', 226],
     ['Empire Falls', 220]]
)

# Problem 2

Create a function `has_keyword` that will return a SQL statement and a connection string such that the resulting table has column `title` and the rows are unique `title`s (not movies) that have the given case-insensitive keyword. Rows should be sorted by increasing lexicographic order of `title`.

In [5]:
def has_keyword():
    """Return a SQL statement and a connection string such that the resulting 
    table has column title and the rows are unique titles (not movies) that 
    have the given case-insensitive keyword.
    """
    with sqlite3.connect('/mnt/data/public/imdb.db') as conn:
        query = """
            SELECT DISTINCT m.title
            FROM movies_keywords as mk
            INNER JOIN keywords as k ON mk.idkeywords = k.idkeywords
            INNER JOIN movies as m ON mk.idmovies = m.idmovies
            WHERE k.keyword LIKE LOWER(?)
            ORDER BY m.title ASC
    """
    return (query, conn)

In [6]:
sql, conn = has_keyword()
df_keyword = pd.read_sql(sql, conn, params=['data'])
assert_equal(df_keyword.columns.tolist(), ['title'])
assert_equal(
    df_keyword.to_numpy().tolist(),
    [['24'],
     ['Cyberchase'],
     ['Person of Interest'],
     ['Pine Gap'],
     ['The Raising of America'],
     ['Unnatural Causes']]
)
df_keyword = pd.read_sql(sql, conn, params=['science'])
assert_equal(df_keyword.shape, (424, 1))
assert_equal(df_keyword.columns.tolist(), ['title'])
assert_equal(
    df_keyword.iloc[:10].to_numpy().tolist(),
    [['100 Greatest Discoveries'],
     ['2025: Webisodes'],
     ['2057'],
     ['3-2-1 Contact'],
     ['5 kulmaa kosmologiaan'],
     ['@discovery.ca'],
     ['A Window Looking In'],
     ['Abenteuer Forschung'],
     ['After the Warming'],
     ['Against the Elements']]
)

# Problem 3

Create a function `aka_phils` that will return a SQL statement and a connection string such that the resulting table has columns `year` and `title`, and the rows are the year and the number of movies with an alternate title for the `location` `Philippines: English title`. Return only years that have at least 10 movies. Sort by decreasing `count`.

In [7]:
def aka_phils():
    """Return a SQL statement and a connection string such that the resulting
    table has columns year and title, and the rows are the year and the number
    of movies with an alternate title for the location Philippines: English
    title. Return only years that have at least 10 movies. Sort by decreasing 
    count.
    """
    with sqlite3.connect('/mnt/data/public/imdb.db') as conn:
        query = """SELECT a.year, COUNT(a.title) as title
            FROM aka_titles AS a
            WHERE a.location = 'Philippines: English title'
            GROUP BY a.year
            HAVING COUNT(a.title) >= 10
            ORDER BY title DESC"""
    return (query, conn)

In [8]:
sql, conn = aka_phils()
df_aka = pd.read_sql(sql, conn)
assert_equal(df_aka.shape, (5, 2))
assert_equal(
    df_aka.iloc[:3].to_numpy().tolist(),
    [[2010.0, 30.0],
     [2009.0, 27.0],
     [2008.0, 23.0]]
)

# Problem 4

Create a function `convert_twitter` that returns a SQLite connection to an in-memory database. It should contain a table with the following columns:
 - `id` (integer)
 - `text` (text)
 - `is_quote_status` (boolean)
 - `favorite_count` (integer)
 - `created_at` (text)
 - `timestamp_ms` (integer)

Each row corresponds to a tweet in `data_twitter_sample.json`.

In [9]:
def convert_twitter():
    """Returns a SQLite connection to an in-memory database."""
    df = (pd.read_json('data_twitter_sample.json',
                      lines=True,
                      convert_dates=False)[['id',
                                            'text',
                                            'is_quote_status',
                                            'favorite_count',
                                            'created_at',
                                            'timestamp_ms']])
          
    df['timestamp_ms'] = df['timestamp_ms'].astype(int)
    
    conn = sqlite3.connect(':memory:')
    df.to_sql('twitter', conn, index=False)
    conn.commit()
    return conn

In [10]:
conn = convert_twitter()
conn.rollback()
assert isinstance(conn, sqlite3.Connection)
assert_equal(
    tuple(conn.execute('SELECT count(*) FROM twitter')),
    ((8908,),)
)
assert_equal(
    tuple(conn.execute(
        'SELECT * FROM twitter WHERE id = 1014298721671139328')),
    ((1014298721671139328, 'get ready for thos shadow bans', 0, 0,
      'Wed Jul 04 00:03:54 +0000 2018', 1530662634662),)
)

For Problems 5 to 7, you will be scraping the website at `http://jojie.accesslab.aim.edu:9095/dmw_scraping/ASIA.html` which contains links to the 2016 Philippine senatorial elections results for each Overseas Absentee Voting (OAV) precint in Asia as well as links to the other continents where OAV was conducted. On Jojie, you can access it at `http://192.168.212.2:9095/dmw_scraping/ASIA.html`. Do not escape spaces in the URLs.

# Problem 5

Create a function `get_continent_urls` that will return a dictionary with continent name as key and absolute url to that OAV page as value.

In [11]:
def get_continent_urls():
    """Return a dictionary with continent name as key and absolute url to
    that OAV page as value.
    """
    url = 'http://192.168.212.2:9095/dmw_scraping/'
    asia = 'http://192.168.212.2:9095/dmw_scraping/ASIA.html'
    page = requests.get(asia)
    soup = BeautifulSoup(page.text)
    dict_ = {}
    for i in soup.find_all('h3'):
        dict_[i.text] = (url + i.find('a')['href'])
    dict_['ASIA'] = asia
    return dict_

In [12]:
continent_url = get_continent_urls()
assert_equal(len(continent_url), 4)
assert_equal(
    continent_url['MIDDLE EAST AND AFRICAS'], 
    'http://192.168.212.2:9095/dmw_scraping/MIDDLE EAST AND AFRICAS.html'
)

# Problem 6

Create a function `get_fsp_urls` that accepts a `continent` and returns a dictionary with the foreign service posts in that `continent` as key and the list of urls to the precinct JSON files as values.

In [13]:
def get_fsp_urls(continent):
    """Accept a continent and returns a dictionary with the foreign service
    posts in that continent as key and the list of urls to the precinct JSON
    files as values.
    
    Parameters:
    ------------
    continent: str
    
    Return:
    ------------
    dict6: dictionary
    """
    def get_continent_urls():
        """Return a dictionary with continent name as key and absolute url to
        that OAV page as value.
        """
        url = 'http://192.168.212.2:9095/dmw_scraping/'
        asia = 'http://192.168.212.2:9095/dmw_scraping/ASIA.html'
        page = requests.get(asia)
        soup = BeautifulSoup(page.text)
        dict_ = {}
        for i in soup.find_all('h3'):
            dict_[i.text] = (url + i.find('a')['href'])
        dict_['ASIA'] = asia
        return dict_

    base_url = 'http://192.168.212.2:9095/dmw_scraping/'
    url = get_continent_urls()[continent]
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    dict6 = {}
    keys = pd.Series(
        [" ".join(i.text.split()[:-2]) for i in soup.find_all('a')[4:]]
    ).unique()
    for key in keys:
        li = [i['href'] for i in soup.find_all('a') if key in i.text]
        dict6[key] = [base_url + i for i in li]
    return dict6

In [14]:
fsp_urls = get_fsp_urls('ASIA')
assert_equal(
    sorted(fsp_urls.keys()),
    ['HONGKONG PCG',
     'KUALA LUMPUR PE',
     'OSAKA PCG',
     'SEOUL PE',
     'SINGAPORE PE',
     'TOKYO PE']
)
assert_equal(
    sorted(fsp_urls['TOKYO PE']),
    ['http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160021/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160023/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160024/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160033/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160035/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160038/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160040/'
     'SENATOR PHILIPPINES.json',
     'http://192.168.212.2:9095/dmw_scraping/ASIA/JAPAN/TOKYO PE/90160041/'
     'SENATOR PHILIPPINES.json']
)

# Problem 7
Create a function `senator_votes` that returns a dictionary with the candidate name as key and the total votes received in the given `continent` and `fsp` as values.

In [15]:
def senator_votes(continent, fsp):
    """Returns a dictionary with the candidate name as key and the total votes
    received in the given continent and fsp as values.
    
    Parameters:
    ------------
    continent: str
    fsp: str
    
    Return:
    ------------
    int
    """
    def get_fsp_urls(continent):
        """Accept a continent and returns a dictionary with the foreign service
        posts in that continent as key and the list of urls to the precinct JSON
        files as values.

        Parameters:
        ------------
        continent: str

        Return:
        ------------
        dict6: dictionary
        """
        def get_continent_urls():
            """Return a dictionary with continent name as key and absolute url to
            that OAV page as value.
            """
            url = 'http://192.168.212.2:9095/dmw_scraping/'
            asia = 'http://192.168.212.2:9095/dmw_scraping/ASIA.html'
            page = requests.get(asia)
            soup = BeautifulSoup(page.text)
            dict_ = {}
            for i in soup.find_all('h3'):
                dict_[i.text] = (url + i.find('a')['href'])
            dict_['ASIA'] = asia
            return dict_

        base_url = 'http://192.168.212.2:9095/dmw_scraping/'
        url = get_continent_urls()[continent]
        page = requests.get(url)
        soup = BeautifulSoup(page.text)
        dict6 = {}
        keys = pd.Series(
            [" ".join(i.text.split()[:-2]) for i in soup.find_all('a')[4:]]
        ).unique()
        for key in keys:
            li = [i['href'] for i in soup.find_all('a') if key in i.text]
            dict6[key] = [base_url + i for i in li]
        return dict6

    df7 = pd.DataFrame()

    for i in get_fsp_urls(continent)[fsp]:


        df_sen = pd.DataFrame.from_dict(
            json.loads(requests.get(i).text)['results']
        )


        df7 = df7.append(df_sen)

    df7['votes'] = df7['votes'].astype(int)
    return dict(df7.groupby('bName')['votes'].sum())

In [16]:
can_votes = senator_votes('ASIA', 'TOKYO PE')
assert_equal(len(can_votes), 50)
assert_equal(can_votes['PACQUIAO, MANNY (UNA)'], 1595)
assert_equal(can_votes['SOTTO, VICENTE (NPC)'], 1545)
assert_equal(can_votes['GADON, LARRY (KBL)'], 105)

# Problem 8
Create a function `get_books` that returns a list of tuples with book title as the first element and paperback selling price as the second element for all the books in `data_wrangling.html`. Set price to `None` if not there's no price indicated. Sort them in order of appearance.

In [17]:
def get_books():
    """Return a list of tuples with book title as the first element and
    paperback selling price as the second element for all the books in
    data_wrangling.html. Set price to None if not there's no price indicated.
    Sort them in order of appearance.
    """
    with open('data_wrangling.html') as f:
        soup = BeautifulSoup(f)
    div = soup.find_all('div', class_='sg-col-4-of-12 sg-col-8-of-16 sg-col-'
                        '16-of-24 sg-col-12-of-20 sg-col-24-of-32 sg-col '
                        'sg-col-28-of-36 sg-col-20-of-28')

    li = []

    for elem in div:
        title = (elem.find('span', 
                           class_='a-size-medium a-color-base a-text-normal')
                 .text)

        try:
            price = (elem.find('span', class_='a-size-base a-color-base')
                     .findNext('span').text)
        except AttributeError:
            whole = elem.find('span', class_='a-offscreen')

            try:
                price = whole.text
            except AttributeError:
                price = None

        li.append((title, price))
    return li


In [18]:
title_price = get_books()
assert_equal(len(title_price), 16)
assert_equal(
    title_price[:10],
    [('Python for Data Analysis: Data Wrangling with Pandas, NumPy, and '
      'IPython', '$35.65'),
     ('Data Wrangling with Python: Tips and Tools to Make Your Life Easier',
      '$32.59'),
     ('Data Wrangling with R (Use R!)', '$67.49'),
     ('Data Wrangling with Python: Creating actionable data from raw sources',
      '$39.99'),
     ('Practical Data Wrangling: Expert techniques for transforming your raw '
      'data into a valuable source for analytics', '$29.99'),
     ('Python Data Science Handbook: Essential Tools for Working with Data',
      '$28.12'),
     ('Think Like a Data Scientist: Tackle the data science process '
      'step-by-step', '$27.92'),
     ('Principles of Data Wrangling: Practical Techniques for Data '
      'Preparation', '$27.83'),
     ('Data Wrangling with JavaScript', '$40.78'),
     ('Data Wrangling: Munging in R with SQL and MongoDB for Financial '
      'Applications', '$45.39')]
)

# Problem 9

Create a function `get_revisions_timeseries` that returns a `pandas` `Series` with index equal to months in `YYYY-MM` format and values corresponding to the number of revisions made in the English Wikipedia article `Data science` for that month. Include months since creation of the article until before July 2021 but exclude those months where no revision was made. Sort by chronological order.

In [19]:
def get_revisions_timeseries():
    """Return a pandas Series with index equal to months in YYYY-MM format and
    values corresponding to the number of revisions made in the English
    Wikipedia article Data science for that month. Include months since
    creation of the article until before July 2021 but exclude those months
    where no revision was made. Sort by chronological order.
    """
    params = {
        'action': 'query',
        'prop': 'revisions',
        'titles': 'Data science',
        'format': 'json',
        'rvlimit': 'max',
        'rvprop': 'ids|timestamp',
        'continue': '||',
        'rvend': '2021-07-01T00:00:00',
        'rvdir': 'newer'                   
    }

    df9 = pd.DataFrame()
    looper = True
    while looper:
        res = requests.get('https://en.wikipedia.org/w/api.php',
                          params=params)
        data = res.json()
        display(data.keys())
        query = data['query']
        page_index = list(data['query']['pages'].keys())[0]
        x = query['pages'][page_index]['revisions']
        df = pd.DataFrame(x)
        df9 = df9.append(df)
        if 'continue' in data.keys():
            print('continue in data keys')
            params['rvcontinue'] = data['continue']['rvcontinue']
            print(params['rvcontinue'])
        else:
            looper = False        

    def time(x):
        year = str(x.year)
        month = str(x.month)

        if len(month) != 2:
            month = '0'+month
        return year + '-' + month

    df9['timestamp'] = pd.to_datetime(df9['timestamp'])
    df9['timestamp'] = df9['timestamp'].apply(time)
    return df9.groupby('timestamp')['revid'].count()

In [20]:
rev_ts = get_revisions_timeseries()
assert_equal(rev_ts.shape, (109,))
assert_equal(
    rev_ts.index[:10].tolist(),
    ['2012-04',
     '2012-05',
     '2012-06',
     '2012-07',
     '2012-08',
     '2012-10',
     '2012-11',
     '2012-12',
     '2013-01',
     '2013-02']
)
assert_equal(
    rev_ts[:10].tolist(),
    [12, 6, 13, 10, 9, 16, 16, 9, 28, 11]
)

dict_keys(['continue', 'query', 'limits'])

continue in data keys
20160725205502|731513124


dict_keys(['continue', 'query', 'limits'])

continue in data keys
20200130053259|938280158


dict_keys(['batchcomplete', 'query', 'limits'])

# Problem 10

Create a function `get_datasci_link_revs_asof` that returns the list of the revision ID, as of 1 July 2021 UTC, of each linked page in revision id `1027747892` of the English Wikipedia.

In [21]:
def get_datasci_link_revs_asof():
    """Return the list of the revision ID, as of 1 July 2021 UTC, of each
    linked page in revision id 1027747892 of the English Wikipedia.
    """
    params = {
        'action': 'query',
        'prop': 'revisions',
        'revids': '1027747892',
        'format': 'json',
        'generator': 'links',
        'gpllimit': 'max'
    }

    res = requests.get('https://en.wikipedia.org/w/api.php',
                      params = params)
    data = res.json()
    df_data = pd.DataFrame()
    for i in data['query']['pages']:
        df = pd.json_normalize(data['query']['pages'][i],
                               'revisions',
                               ['pageid', 'ns', 'title'])
        df_data = df_data.append(df)
    df_data = df_data.reset_index(drop=True)
    df_data['timestamp'] = pd.to_datetime(df_data.timestamp)
    df_data = df_data[df_data['ns']==0]
    df_data = df_data[['title', 'revid', 'timestamp', 'ns']]
    df_correct = pd.DataFrame()
    for title in df_data.title:
        params = {
            'action': 'query',
            'prop': 'revisions',
            'rvprop': 'ids|timestamp',
            'titles': f'{title}',
            'format': 'json',
            'rvstart': '2021-07-01T00:00:00',
            'rvlimit': '1'
        }

        res_wrong = requests.get('https://en.wikipedia.org/w/api.php',
                                 params = params)
        data_wrong = res_wrong.json()
        page_id = list(data_wrong['query']['pages'].keys())[0]
        df = pd.DataFrame(data_wrong['query']['pages'][page_id]['revisions'], index=[title])
        df.index.name = 'title'
        df = df.reset_index().drop(columns=['parentid'], axis=1)

        df_correct = df_correct.append(df)
    ser = df_correct.reset_index(drop=True).sort_values(by='timestamp').revid
    return ser.tolist() 



In [22]:
revs_asof = get_datasci_link_revs_asof()
assert_equal(len(revs_asof), 50)
assert_equal(
    revs_asof[:10],
    [285436293,
     433201973,
     650889829,
     775426682,
     778233465,
     889336491,
     932609783,
     937074297,
     946577178,
     960265088])