In [1]:
# stdin and stdout
# egrep.py
import sys, re

# sys.argv is the list of command-line arguments
# sys.argv[0] is the name of the program itself
# sys.argv[1] will be the regex specified at the command line
regex = sys.argv[1]

# for every line passed into the script
for line in sys.stdin:
    # if it matches the regex, write it to stdout
    if re.search(regex, line):
        sys.stdout.write(line)

In [2]:
# line_count.py
import sys

count = 0
for line in sys.stdin:
    count += 1
    
# print goes to sys.stdout
print(count)

0


In [None]:
# most_common_words.py
import sys
from collections import Counter

# pass in number of words as first argument
try:
    num_words = int(sys.argv[1])
except:
    print("usage: most_common_words.py num_words")
    sys.exit(1) # nonzero exit code indicates error
    
counter = Counter(word.lower()
                  for line in sys.stdin
                  for word in line.strip().split()
                  if word)

for word, count in counter.most_common(num_words):
    sys.stdout.write(str(count))
    sys.stdout.write("\t")
    sys.stdout.write(word)
    sys.stdout.write("\n")

In [None]:
# The Basic of Text Files
file_for_reading = open('reading_file.txt', 'r')
file_for_reading2 = open('reading_file.txt')

file_for_writing = open('writing_file.txt')

file_for_appending = open('appending_file.txt')

file_for_writing.close()

In [None]:
# files will be closed automatic using with block
with open(filename) as f:
    data = function_that_gets_data_from(f)
    
# don't try to use f
process(data)

In [None]:
starts_with_hash = 0

with open('input.txt') as f:
    for line in f:
        if re.match("^#", line):
            starts_with_hash += 1

In [9]:
def get_domain(email_address: str) -> str:
    return email_address.lower().split("@")[-1]

assert get_domain("joelgrus@gmail.com") == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

In [None]:
from collections import Counter

with open('email_addresses.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip())
                           for line in f
                           if "@" in line)
    

In [None]:
import csv

with open('tab_delimited_stock_prices.txt') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)

In [None]:
with open('colon_delimited_stock_prices.txt') as f:
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row['date']
        symbol = dict_row['symbol']
        closing_price = float(dict_row['closing-price'])
        process(date, symbol, closing_price)

In [None]:
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

with open('comma_delimited_stock_prices.txt', 'w') as f:
    csv_writer = csv.writer(f, delimiter=',')
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

In [None]:
results = [["test1", "success", "Monday"],
           ["test2", "success, kind of", "Tuesday"],
           ["test3", "failure, kind of", "Wednesday"],
           ["test4", "failure, utter", "Thursday"]]

# don't do this
with open('bad_csv.txt', 'w') as f:
    for row in results:
        f.write(",".join(map(str, row)))
        f.write("\n")

In [1]:
# Scraping the Web
from bs4 import BeautifulSoup
import requests

url = ("https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html")
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

In [2]:
first_paragraph = soup.find('p') # or just soup.p

In [3]:
first_paragraph

<p id="p1">This is the first paragraph.</p>

In [4]:
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

In [5]:
print(first_paragraph_text)
print(first_paragraph_words)

This is the first paragraph.
['This', 'is', 'the', 'first', 'paragraph.']


In [6]:
first_paragraph_id = soup.p['id'] # raises KeyError if no 'id'
first_paragraph_id2 = soup.p.get('id') # returns None if no 'id'

In [7]:
print(first_paragraph_id)
print(first_paragraph_id2)

p1
p1


In [8]:
# You can get multiple tags at once
all_paragraphs = soup.find_all('p') # or just soup('p')
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]

In [9]:
print(all_paragraphs)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]


In [10]:
print(paragraphs_with_ids)

[<p id="p1">This is the first paragraph.</p>]


In [11]:
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class', [])]

In [12]:
spans_inside_divs = [span for div in soup('div')
                     for span in div('span')]

In [13]:
spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

In [15]:
# Example: Keeping Tabs on Congress
from bs4 import BeautifulSoup
import requests

url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, 'html5lib')

all_urls = [a['href'] for a in soup('a') if a.has_attr('href')]

print(len(all_urls))

964


In [18]:
import re

regex = r"^https?://.*\.house\.gov/?$"

assert re.match(regex, "http://joel.house.gov")
assert re.match(regex, "https://joel.house.gov")
assert re.match(regex, "http://joel.house.gov/")
assert re.match(regex, "https://joel.house.gov/")
assert not re.match(regex, "joel.house.gov")
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.gov/biography")

good_urls = [url for url in all_urls if re.match(regex, url)]

print(len(good_urls))

876


In [19]:
# Only 435, duplicates
good_urls = list(set(good_urls))

print(len(good_urls))

438


In [21]:
html = requests.get("https://jayapal.house.gov").text
soup = BeautifulSoup(html, 'html5lib')

# Use a set because the links might appear multiple times
links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
print(links)

{'https://jayapal.house.gov/category/press-releases/'}


In [22]:
from typing import Dict, Set

press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://wilson.house.gov/: set()
https://arrington.house.gov: {'https://arrington.house.gov/press-releases/'}
https://lacyclay.house.gov: {'/media-center/press-releases'}
https://kelly.house.gov: {'/press-releases'}
https://tipton.house.gov/: {'/media/press-releases'}
https://hoyer.house.gov/: set()
https://payne.house.gov: {'/press-releases'}
https://gomez.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://bergman.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://escobar.house.gov: {'/media/press-releases'}
https://correa.house.gov: {'/news'}
https://susanwbrooks.house.gov: {'/media-center/press-releases'}
https://roby.house.gov/: {'/newsroom/press-releases'}
https://nunes.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2133'}
https://bass.house.gov/: {'/media-center/press-releases'}
https://luetkemeyer.house.gov/: {'/news/DocumentQuery.aspx?DocumentTypeID=2270', '#tabb-2'}
https://jayapal.house.gov: {'https://jayapal.house.gov/category/press-rele

https://aguilar.house.gov/: {'/media-center/press-releases'}
https://case.house.gov/: {'/media/press-releases'}
https://watsoncoleman.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://delauro.house.gov/: {'/media-center/press-releases'}
https://allred.house.gov/: {'/media/press-releases'}
https://stauber.house.gov: {'/media/press-releases'}
https://torres.house.gov/: {'/media-center/press-releases'}
https://algreen.house.gov: {'/press-releases'}
https://panetta.house.gov: {'/media/press-releases'}
https://andylevin.house.gov/: {'/media/press-releases'}
https://pingree.house.gov/: set()
https://engel.house.gov: set()
https://biggs.house.gov: {'/media/press-releases'}
https://mccollum.house.gov: {'/media/press-releases'}
https://swalwell.house.gov: {'/media-center/press-releases'}
https://schakowsky.house.gov: {'/press-releases/'}
https://gohmert.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=1954'}
https://stanton.house.gov/: {'/media/press-releases'}
https://cun

https://babin.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://torressmall.house.gov/: {'/media/press-releases'}
https://soto.house.gov: {'/media/press-releases'}
https://robbishop.house.gov: {'/media/press-releases'}
https://timmons.house.gov/: {'/media/press-releases'}
https://gonzalez.house.gov: {'/media/press-releases'}
https://dennyheck.house.gov: {'/media-center/press-releases'}
https://titus.house.gov/: set()
https://huffman.house.gov: set()
https://steil.house.gov: {'/media/press-releases'}
https://crawford.house.gov/: {'/News/DocumentQuery.aspx?DocumentTypeID=2080'}
https://delbene.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://mcgovern.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=2472'}
https://suozzi.house.gov: {'/media/press-releases'}
https://smucker.house.gov/: {'/media/press-releases'}
https://luria.house.gov: {'/media/press-releases'}
https://johnrose.house.gov/: {'/media/press-releases'}
https://cohen.house.gov/: {'/media-ce

https://dustyjohnson.house.gov/: {'/media/press-releases'}
https://mccaul.house.gov: {'/media-center/press-releases', '/frontpage?qt-home_page_tabs=1#qt-home_page_tabs'}
https://riggleman.house.gov: {'/media/press-releases'}
https://crow.house.gov/: {'/media/press-releases'}
https://lawrence.house.gov/: {'/media-center/press-releases'}
https://jeffduncan.house.gov/: {'/media/press-releases'}
https://stevens.house.gov/: {'/media/press-releases'}
https://walberg.house.gov/: {'/media/press-releases'}
https://pelosi.house.gov/: {'/news/press-releases'}
https://turner.house.gov/: {'/media-center/press-releases', '/frontpage?qt-home_page_tabs=0#qt-home_page_tabs'}
https://spanberger.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://buddycarter.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://foster.house.gov: {'/media/press-releases'}
https://woodall.house.gov/: {'/media-center/press-releases'}
https://porter.house.gov/: {'/media/press-releases'}
https://spe

In [23]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """Returns True if a <p> inside the text mentions {keyword}"""
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [p.get_text() for p in soup('p')]
    return any(keyword.lower() in paragraph.lower()
               for paragraph in paragraphs)

In [24]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
assert paragraph_mentions(text, "twitter")
assert not paragraph_mentions(text, "facebook")

In [None]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f"{house_url}/{pr_link}"
        text = requests.get(url).text
        
        if paragraph_mentions(text, "data"):
            print(f"{house_url}")
            break # done with this house_url

In [2]:
# Using APIs (application programming interfaces). 
# To explictly request data in a structured format.
# This saves you the trouble of having to scrape them.
import json

serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2019, 
                  "topics": ["data", "science", "data science"]}"""

# parse the JSON to create a Python dict
deserialized = json.loads(serialized)
assert deserialized["publicationYear"] == 2019
assert "data science" in deserialized["topics"]

In [3]:
# Using an Unauthenticated API
# GitHub API first
import requests, json
github_user = "ibacaraujo"
endpoint = f"https://api.github.com/users/{github_user}/repos"

repos = json.loads(requests.get(endpoint).text)

In [5]:
from collections import Counter
from dateutil.parser import parse

dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)

In [6]:
print(month_counts)
print(weekday_counts)

Counter({7: 7, 1: 4, 10: 4, 5: 3, 9: 3, 11: 3, 12: 3, 8: 2, 3: 1})
Counter({3: 9, 4: 7, 5: 5, 1: 4, 0: 3, 2: 2})


In [8]:
last_5_repositories = sorted(repos, key=lambda r: r["pushed_at"], reverse=True)[:5]
print([repo['name'] for repo in last_5_repositories])

last_5_languages = [repo["language"] for repo in last_5_repositories]
print(last_5_languages)

['data-structures-and-algorithms-in-python', 'data-science-from-scratch', 'fastai-deep-learning-from-the-foundations', 'flask-microblog', 'cyclegan-seed']
['Python', 'Jupyter Notebook', 'Jupyter Notebook', 'Python', 'Jupyter Notebook']
