In [1]:
import requests
import json
import gzip
import bz2
import csv
import re
import sqlite3
import numpy as np
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
from numpy.testing import (assert_equal, assert_array_equal, 
                           assert_array_almost_equal)

# Problem 1 [25 pts]

## Problem 1a [10 pts]

Create a function `count_foreignlang` that will read `/mnt/data/public/mtgjson/AllIdentifiers.json` and return a list of tuples where the first element is a `foreignData` `language` and the second element is the number of times that `language` is found in a `foreignData`. Return only the 10 most frequent `language` sorted by decreasing number of occurrences.

In [54]:
def count_foreignlang():
    with open('/mnt/data/public/mtgjson/AllIdentifiers.json') as f:
        j = json.load(f)
    count_list = []
    for key in j['data']:
        try:
            language_list = j['data'][key]['foreignData']

        except:
            continue
        else:
            language_list = j['data'][key]['foreignData']
            for i in language_list:
                count_list.append(i['language'])
    ser = pd.Series(count_list).value_counts()[:10]
    return [(a,b) for a,b in zip(ser.index, ser)]


In [55]:
cfl = count_foreignlang()
assert_equal(
    cfl[:5],
    [('Japanese', 34191),
     ('French', 32260),
     ('German', 31849),
     ('Italian', 31660),
     ('Spanish', 31595)]
)

## Problem 1b [10 pts]

Create a function `count_masters` that reads `/mnt/data/public/discogs/2011/discogs_20111110_masters.xml.gz` and returns a list of tuples where the first element is a year and the second element is the number of masters released on that year. Return only the 10 most frequent years sorted by decreasing frequency.

Hint: you may create a file object for a gzipped file `foo.gz` using `gzip.open('foo.gz')`

genre = pop

In [56]:
from xml.etree import ElementTree as ET

In [57]:

def count_masters():
    with gzip.open('/mnt/data/public/discogs/2011/discogs_20111110_masters.xml.gz', 'rb') as f:
        tree = ET.parse(f)
        root = tree.getroot()
    ser = pd.Series([i.text for i in root.findall('.//genre[.="Pop"]/../..//year')]).value_counts()
    return [(a,b) for a,b in zip(ser.index, ser)][:10]  

In [58]:
cm = count_masters()
assert_equal(
    cm[:5],
    [('1988', 1317),
     ('1989', 1279),
     ('1990', 1222),
     ('2009', 1147),
     ('1987', 1134)]
)

## Problem 1c [5 pts]
Create a function `barangay_population` that reads sheet `NCR by barangay` of `/mnt/data/public/census/2020/NCR.xlsx` and returns a `pandas` `DataFrame` with columns `Barangay` and `Population` sorted by decreasing population.

In [59]:
def barangay_population():
    df = pd.read_excel('/mnt/data/public/census/2020/NCR.xlsx',
                  sheet_name='NCR by barangay',
                 skiprows=10,
                 nrows=1769,
                 usecols='C:D',
                 names=['Barangay','Population'])
    df = df.dropna()
    df = df[~(df['Barangay'].str.contains('CITY|PATEROS|TONDO|BINONDO|QUIAPO|SAN NICOLAS|SANTA CRUZ|SAMPALOC|SAN MIGUEL|ERMITA|INTRAMUROS|MALATE|PACO|PANDACAN|PORT|SANTA ANA', regex=True))]
    return df.sort_values(by='Population', ascending=False)

In [60]:
df_bp = barangay_population()
assert_equal(df_bp.shape, (1710, 2))
assert_equal(df_bp.columns.tolist(), ['Barangay', 'Population'])
assert_equal(
    df_bp.iloc[:10].to_numpy().tolist(),
    [['Barangay 176', 261729.0],
     ['Commonwealth', 213229.0],
     ['Batasan Hills', 166572.0],
     ['Pinagbuhatan', 163598.0],
     ['Payatas', 139740.0],
     ['Poblacion', 120115.0],
     ['Holy Spirit', 111901.0],
     ['Barangay 171', 111713.0],
     ['Pasong Tamo', 110738.0],
     ['Barangay 178', 110224.0]]
)

# Problem 2

## Problem 2a [15 pts]

Create a function `find_streets` that reads `'/mnt/data/public/gutenberg/1/3/135/135-0.txt` and returns a list of tuples. The first element are street names in the form of `Rue de ___` (case-sensitive) and the second element corresponds to the number of times they occurred in the text. Don't concatenate lines. Return only the 10 most frequent streets sorted by decreasing frequency then name.

Hint: look at the resulting captured text then clean them further as required

In [278]:
def find_streets():
    with open('/mnt/data/public/gutenberg/1/3/135/135-0.txt', 'r') as f:
        text = f.read()
    reg = re.findall('Rue de (?:[\w\'\"]+\s*[\w\'\"]*)', text)
    reg
    return pd.Series(reg).value_counts()

In [37]:
with open('/mnt/data/public/gutenberg/1/3/135/135-0.txt', 'r') as f:
    text = f.read()
reg = re.findall('Rue de (?:[\w\'\"]+\s*[\w\'\"]*)', text)
reg

['Rue de Chaffaut',
 'Rue de Chaffaut',
 'Rue de La Harpe',
 'Rue de Rivoli',
 'Rue de Paris',
 "Rue de l'Ourcine",
 'Rue de Pontoise',
 "Rue de l'Épée",
 "Rue de l'Arbalète",
 'Rue de la Clef',
 'Rue de Pontoise',
 'Rue de Pontoise',
 'Rue de Sèvres',
 'Rue de Vaugirard',
 "Rue de l'Université",
 'Rue de Normandie and',
 'Rue de Saintonge there',
 "Rue de l'Ouest\nside",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 'Rue de la Tour',
 'Rue de Pontoise',
 'Rue de Pontoise',
 'Rue de la Barrière',
 'Rue de Charonne were',
 'Rue de Cotte',
 'Rue de Reuilly\nfound',
 'Rue de la\nPaix',
 'Rue de Grenelle',
 'Rue de Grès',
 'Rue de Vaugirard',
 'Rue de la\nVerrerie',
 'Rue de la Santé',
 'Rue de Babylone was',
 'Rue de Babylone',
 "Rue de l'Ouest",
 "Rue de l'Homme Armé",
 "Rue de l'Ouest",
 'Rue de Babylone and',
 'Rue de Babylone',
 'Rue de Babylone',
 'Rue de Varennes a',
 'Rue de Bourgogne and',
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Ru

In [48]:
reg = re.findall('(Rue de (?:[\w\'\"]+\s*[\w\'\"]*))', text)
reg

['Rue de Chaffaut',
 'Rue de Chaffaut',
 'Rue de La Harpe',
 'Rue de Rivoli',
 'Rue de Paris',
 "Rue de l'Ourcine",
 'Rue de Pontoise',
 "Rue de l'Épée",
 "Rue de l'Arbalète",
 'Rue de la Clef',
 'Rue de Pontoise',
 'Rue de Pontoise',
 'Rue de Sèvres',
 'Rue de Vaugirard',
 "Rue de l'Université",
 'Rue de Normandie and',
 'Rue de Saintonge there',
 "Rue de l'Ouest\nside",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 'Rue de la Tour',
 'Rue de Pontoise',
 'Rue de Pontoise',
 'Rue de la Barrière',
 'Rue de Charonne were',
 'Rue de Cotte',
 'Rue de Reuilly\nfound',
 'Rue de la\nPaix',
 'Rue de Grenelle',
 'Rue de Grès',
 'Rue de Vaugirard',
 'Rue de la\nVerrerie',
 'Rue de la Santé',
 'Rue de Babylone was',
 'Rue de Babylone',
 "Rue de l'Ouest",
 "Rue de l'Homme Armé",
 "Rue de l'Ouest",
 'Rue de Babylone and',
 'Rue de Babylone',
 'Rue de Babylone',
 'Rue de Varennes a',
 'Rue de Bourgogne and',
 "Rue de l'Ouest",
 "Rue de l'Ouest",
 "Ru

In [None]:
fs = find_streets()
assert_equal(
    fs[:5],
    [('Rue de la Chanvrerie', 27),
     ("Rue de l'Homme Armé", 26),
     ("Rue de l'Ouest", 12),
     ('Rue de la', 11),
     ('Rue de Babylone', 9)]
)

## Problem 2b [10 pts]

Create a function `find_anti` that reads `/mnt/data/public/agora/Agora.csv` and returns a pandas `Series` with index corresponding to the first case-insensitive alphabetic word in `Item Description` that begins with `anti` or `anti-`, and value corresponding to the number of rows that word was identified. Return only the 10 most frequent words sorted by decreasing frequency then by ascending name.

sensistive dapat

In [53]:
def find_anti():
    df10 = pd.read_csv('/mnt/data/public/agora/Agora.csv').dropna(subset=[' Item Description'])
    df_anti = df10[df10[' Item Description'].str.contains('[^\w]\s?anti-?', regex=True)]
    df_anti['anti'] = df_anti[' Item Description'].apply(lambda x: re.findall(r'(anti-?\w+)', x))
    df_anti['anti'] = df_anti['anti'].astype(str)
    df_anti['anti'] = df_anti['anti'].apply(lambda x: x[1:-1])
    df_anti['anti'] = df_anti['anti'].apply(lambda x: x[1:-1])
    return df_anti.groupby('anti')[' Item'].count().sort_values(ascending=False)

In [87]:
df10 = pd.read_csv('/mnt/data/public/agora/Agora.csv').dropna(subset=[' Item Description'])
# df10 = pd.read_csv('/mnt/data/public/agora/Agora.csv')

In [61]:
df10.isna().sum()

Vendor                   0
 Category                0
 Item                    0
 Item Description        0
 Price                   2
 Origin               9875
 Destination         49134
 Rating                 11
 Remarks             97058
dtype: int64

In [73]:
df10['anti-'] = df10[' Item Description'].apply(lambda x: re.match('\S?anti-?',x))

Unnamed: 0,Vendor,Category,Item,Item Description,Price,Origin,Destination,Rating,Remarks
1394,KryptykOG,Data/Software,Portable USB APPS MEGAPACK *Huge*,Great for business trips! This listing is a pe...,0.007499999999999978 BTC,Torland,,4.93/5,
1456,sereal,Data/Software,Malwarebytes Anti-Malware Premium : license,Malwarebytes Anti-Malware Premium license. Ful...,0.05078811515873015 BTC,Torland,,4.97/5,
2168,captainkirk,Information/eBooks,eBay Listings That Sell For Dummies,No matter what you want to sell on eBay—auto p...,0.003456964111111111 BTC,Internet,Everywhere,4.988/5,
2554,stiffstyles,Drugs/Steroids,100iu HGH- Kigtropin human growth hormone,NO AUSSIE ORDERS 100iu HGH- Kigtropin human g...,2.334650155 BTC,Torland,,4.92/5,Average price may be skewed outliar > .5 BTC f...
2878,theben,Drugs/Steroids,Tamoxifen Citrate (Genesis) 100 tabs x 10mg,Nolvadex® a trade name for the drug tamoxife...,0.16246157464285724 BTC,EU,,5.00/5,
...,...,...,...,...,...,...,...,...,...
105285,hammau21,Drugs/Benzos,10x Xanax 2mg,2mg Xanax Bars. Xanax (alprazolam) is a benzod...,0.07319572 BTC,USA,,~5/5,
105292,hammau21,Drugs/Benzos,25x Xanax 2mg,2mg Xanax Bars. Xanax (alprazolam) is a benzod...,0.15684798 BTC,USA,USA,~5/5,
105604,RushDelivery,Drugs/Cannabis/Synthetics,Cesamet (nabilone) 0.5mg capsules (brand) [x5],This listing is for 5 Canadian brand name Cesa...,0.15332472 BTC,Canada,Worldwide,[0 deals],
109219,fake,Counterfeits/Electronics,Telefonul anti interceptare,Afla acum daca telefonul iti este ascultat. T...,1.2678413350000004 BTC,,,4.782/5,Average price may be skewed outliar > .5 BTC f...


In [138]:
antis = find_anti()
assert_equal(
    antis.index[:5].tolist(),
    ['anti-inflammatory', 'anti-depressant', 'antidepressant', 'antibiotic',
     'anti-magnetic']
)
assert_equal(
    antis[:5].tolist(),
    [31, 29, 26, 24, 21]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anti['anti'] = df_anti[' Item Description'].apply(lambda x: re.findall(r'(anti-?\w+)', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anti['anti'] = df_anti['anti'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anti['anti'] = df_anti['anti'].apply(lambda x: x[1:-1])


AssertionError: 
Items are not equal:
item=0

 ACTUAL: "'anti-inflammatory'"
 DESIRED: 'anti-inflammatory'

# Problem 3 [25 pts]

For this problem, you will work on the sqlite database `/mnt/data/public/nle2019.db`.

The schema of the tables within the database is shown below.

<img src="election_schema.png" width="600px" />

## Problem 3a [10 pts]

Create a function `get_num_members` that returns a pandas data framee of all party names and the number of candidates in each party running for a national position. The list should be sorted first in descending number of members then in ascending party names.

In [61]:
def get_num_members():
    with sqlite3.connect('/mnt/data/public/nle2019.db') as conn:
        query = """
        SELECT party, COUNT(name) as count
        FROM candidate
        WHERE level = 'national'
        GROUP BY party
        ORDER BY count DESC, party ASC
        """
    df = pd.read_sql(query, conn)
    return df

In [62]:
conn = sqlite3.connect('/mnt/data/public/nle2019.db')
cursor = conn.cursor()

table_list = [a for a in cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
df_cand = pd.read_sql('select * from candidate', conn)
df_result = pd.read_sql('select * from result', conn)
print(table_list)
conn.close()

KeyboardInterrupt: 

In [63]:
df_parties = get_num_members()
assert_equal(df_parties.shape, (153, 2))
assert_equal(
    df_parties.iloc[:10].to_numpy().tolist(), 
    [('LGBTQ PARTY', 17),
     ('LABOR PARTY PHILIPPINES', 8),
     ('LIBERAL PARTY', 6),
     ('KATIPUNAN NG DEMOKRATIKONG PILIPINO(KDP)', 5),
     ('PARTIDO DEMOKRATIKO PILIPINO LAKAS NG BAYAN', 5),
     ('NACIONALISTA PARTY', 3),
     ('LAKAS CHRISTIAN  MUSLIM DEMOCRATS', 2),
     ("NATIONALIST PEOPLE'S COALITION", 2),
     ('PARTIDO FEDERAL NG PILIPINAS', 2),
     ('PWERSA NG MASANG  PILIPINO', 2)])

## Problem 3b [15 pts]

Create a function `count_votes` that accepts a case-insensitive search key `name` and returns a pandas data frame of matched candidate names and the total vote for the candidate/s. The input search key can match anywhere in the candidate name and the output should be sorted in descending number of votes then in ascending candidate name.

In [64]:
def count_votes(name):
    name = name.replace('_', '\_').replace('%','\%')
    name = f"'%{name}%'"
    with sqlite3.connect('/mnt/data/public/nle2019.db') as conn:
        query = f"""
        SELECT c.name, SUM(r.votes) as sum_v
        FROM candidate as c
        JOIN result as r
        ON c.id = r.candidate_id
        WHERE LOWER(c.name) LIKE LOWER({name})
        GROUP BY c.name
        ORDER BY sum_v DESC, c.name ASC
        """
    df = pd.read_sql(query, conn)
    return df

In [30]:
# def count_votes(name):
#     name = name.replace('_', '\_').replace('%','\%')
#     name = f"'%{name}%'"
#     with sqlite3.connect('/mnt/data/public/nle2019.db') as conn:
#         query = f"""
#         SELECT c.name, SUM(r.votes) as sum_v
#         FROM candidate as c
#         JOIN result as r
#         ON c.id = r.candidate_id
#         WHERE LOWER(c.name) LIKE LOWER({name})
#         GROUP BY c.name
#         ORDER BY sum_v DESC, c.name ASC
#         """
#     df = pd.read_sql(query, conn)
#     return df

In [31]:
count_votes('vIlLaR').iloc[:10]

Unnamed: 0,name,sum_v
0,"VILLAR, CYNTHIA (NP)",25128563
1,"VILLARICA,ATORNI HENRY(PDPLBN)",191992
2,"VILLAR, CAMILLE (NP)",173917
3,"VILLAROSA, FLORA (NP)",88638
4,"VILLAROSA, JOSE (UNA)",65862
5,"VILLAROSA, JASON (LAKAS)",61440
6,"VILLARICA, LINABELLE (PDPLBN)",44774
7,"VILLARANTE, LITO (NPC)",43786
8,"VILLARANTE, ROMEO (NPC)",32175
9,"VILLARICA, WILLIAM (PDPLBN)",31355


In [65]:
df_votes = count_votes('vIlLaR')
assert_equal(df_votes.shape, (94, 2))
assert_array_equal(
    df_votes.iloc[:10].to_numpy().tolist(), 
    [['VILLAR, CYNTHIA (NP)', 25128563],
     ['VILLARICA,ATORNI HENRY(PDPLBN)', 191992],
     ['VILLAR, CAMILLE (NP)', 173917],
     ['VILLAROSA, FLORA (NP)', 88638],
     ['VILLAROSA, JOSE (UNA)', 65862],
     ['VILLAROSA, JASON (LAKAS)', 61440],
     ['VILLARICA, LINABELLE (PDPLBN)', 44774],
     ['VILLARANTE, LITO (NPC)', 43786],
     ['VILLARANTE, ROMEO (NPC)', 32175],
     ['VILLARICA, WILLIAM (PDPLBN)', 31355]])
assert_array_equal(count_votes('%hi').shape, (0, 2))
assert_array_equal(count_votes('hi_').shape, (0, 2))

# Problem 4 [25 pts]

## Problem 4a [15 pts]

Write a function `get_results` that would return a list of tuples containing page titles and page urls from the search engine results page in https://server.accesslab.aim.edu/exam1/search.htm. Ignore sponsored, feedback, and video results.

In [231]:
def get_results():
    page = requests.get('https://server.accesslab.aim.edu/exam1/search.htm')
    page
    soup = BeautifulSoup(page.text)
    li=[]
    for i in soup.find_all('div', class_="r")[4:]:
        li.append((i.find('h3').text,i.find('a')['href']))
    return li

In [232]:
search_res = get_results()
assert_equal(len(search_res), 9)
assert_equal(len(search_res[0]), 2)
assert_equal(
    search_res[:5],
    [('Data mining - Wikipedia', 'https://en.wikipedia.org/wiki/Data_mining'),
     ('What is data mining? | SAS',
      'https://www.sas.com/en_ph/insights/analytics/data-mining.html'),
     ('Data Mining Definition - Investopedia',
      'https://www.investopedia.com/terms/d/datamining.asp'),
     ('What is Data Mining? Definition of Data Mining, Data Mining ...',
      'https://economictimes.indiatimes.com/definition/data-mining'),
     ('Data Mining Explained | MicroStrategy',
      'https://www.microstrategy.com/us/resources/introductory-guides/'
      'data-mining-explained')]
)

# Problem 4b [10 pts]

Create a function `crawl_page` that crawls the API responses from https://server.accesslab.aim.edu/exam1/scraping.html and returns the text message of the last API page.

In [295]:
def crawl_page():
    page = requests.get('https://server.accesslab.aim.edu/exam1/scraping.html')
    return page

In [296]:
requests.get('https://server.accesslab.aim.edu/exam1/scraping.html')

<Response [200]>

In [None]:
output = crawl_page()
assert_equal(type(output), str)

# Lab mini-project 1 contribution

Please answer the form [here](https://forms.office.com/Pages/ResponsePage.aspx?id=Zz3KImWBbke5GEsuMeBHur1KLEXNIepKmjRDCXWIriRUNEVXUkdFRkxWNFpFOEJRSjBTUFQzWlk2TC4u).