## Basics of Text File

    'r' means read only
    'w' means write/overwrite
    'a' means append, at end

In [3]:
# Create a small tab-separated sample file in the working directory.
# Each record is (date, ticker symbol, closing price); prices are kept as
# text so they are written exactly as spelled here.
stock_rows = [
    ("6/20/2014", "AAPL", "90.91"),
    ("6/20/2014", "MSFT", "41.68"),
    ("6/20/2014", "FB", "64.5"),
    ("6/19/2014", "AAPL", "91.86"),
    ("6/19/2014", "MSFT", "41.51"),
    ("6/19/2014", "FB", "64.34"),
]

with open('tab_delimited_stock_prices.txt', 'w') as f:
    # Single write of the whole payload; every record ends with a newline.
    f.write("".join("\t".join(fields) + "\n" for fields in stock_rows))

In [6]:
def process(date: str, symbol: str, closing_price: float) -> None:
    """Sanity-check one parsed stock record.

    ``date`` and ``symbol`` are accepted but not inspected; the only check
    performed is that the closing price is strictly positive.

    Raises:
        AssertionError: if ``closing_price`` is not greater than zero.
    """
    assert 0.0 < closing_price

In [7]:
import csv

# newline='' hands the raw line endings to the csv module, which is how its
# documentation says files passed to a reader should be opened.
with open('tab_delimited_stock_prices.txt', newline='') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than fail with IndexError on row[0].
        if not row:
            continue
        # Columns are positional: date, ticker symbol, closing price.
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])  # the field arrives as text
        process(date, symbol, closing_price)


In [8]:
# Create a small colon-delimited sample file whose first line is a header row.
colon_lines = [
    "date:symbol:closing_price",
    "6/20/2014:AAPL:90.91",
    "6/20/2014:MSFT:41.68",
    "6/20/2014:FB:64.5",
]

with open('colon_delimited_stock_prices.txt', 'w') as f:
    # Lines are newline-separated and the file ends with a final newline.
    f.write("\n".join(colon_lines) + "\n")


# newline='' is how the csv module documentation says files passed to a
# reader should be opened.
with open('colon_delimited_stock_prices.txt', newline='') as f:
    # DictReader takes the first line as the field names, so every row is a
    # mapping keyed by "date", "symbol" and "closing_price".
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])  # the field arrives as text
        process(date, symbol, closing_price)


# Scraping the Web

In [12]:
from bs4 import BeautifulSoup
import requests

# Sample HTML page that the parsing examples operate on.
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"

# Download the page, then parse the markup with the html5lib parser.
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html5lib')

## Find the first tag (and its contents)

In [14]:
# find('p') returns the first <p> tag in the document, markup included.
first_paragraph = soup.find('p')     # or just soup.p
first_paragraph

<p id="p1">This is the first paragraph.</p>

In [15]:
first_paragraph_text = soup.p.text
# .text is the tag's text content without the markup; split() then breaks it into whitespace-separated words

first_paragraph_words = soup.p.text.split()

first_paragraph_text, first_paragraph_words

('This is the first paragraph.', ['This', 'is', 'the', 'first', 'paragraph.'])

In [6]:
# Extracting a tag's attribute: two spellings that differ only when the attribute is missing

first_paragraph_id = soup.p['id']       # raises KeyError if no 'id'
first_paragraph_id2 = soup.p.get('id')  # returns None if no 'id'

first_paragraph_id, first_paragraph_id2

('p1', 'p1')

    Multiple tags at once

In [17]:
# find_all('p') returns every <p> tag; calling the soup directly, soup('p'),
# is the shorthand spelling of the same lookup.
all_paragraphs = soup.find_all('p')
# Keep only the paragraphs whose id attribute is present and non-empty.
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]

all_paragraphs,paragraphs_with_ids

([<p id="p1">This is the first paragraph.</p>,
  <p class="important">This is the second paragraph.</p>],
 [<p id="p1">This is the first paragraph.</p>])

### Tags with specified classes

In [18]:
# Three ways of selecting the <p> tags whose class is "important".
# 1) Filter with an explicit attribute dictionary.
important_paragraphs = soup('p', {'class':'important'})
# 2) Pass a bare string as the second argument; BeautifulSoup matches it against the class.
important_paragraphs2 = soup('p', 'important')
# 3) Filter by hand; the [] default covers tags that have no class attribute.
important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class',[])]

important_paragraphs,important_paragraphs2, important_paragraphs3

([<p class="important">This is the second paragraph.</p>],
 [<p class="important">This is the second paragraph.</p>],
 [<p class="important">This is the second paragraph.</p>])

## Congress Votes DataSet

In [19]:
from bs4 import BeautifulSoup
import requests

# Fetch and parse the house.gov representatives page.
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, 'html5lib')

# Gather the link target of every anchor that actually has an href attribute.
all_urls = [
    anchor['href']
    for anchor in soup('a')
    if anchor.has_attr('href')
]

print(len(all_urls))

967


If you look at them, the ones we want start with either **http://** or **https://**, have some kind of name, and end with either .house.gov or .house.gov/.

In [21]:
import re
    
    # Must start with http:// or https://
    # Must end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"

# Self-check the pattern on known examples before applying it to real data.
should_match = [
    "http://joel.house.gov",
    "https://joel.house.gov",
    "http://joel.house.gov/",
    "https://joel.house.gov/",
]
should_not_match = [
    "joel.house.gov",                    # no scheme
    "http://joel.house.com",             # wrong top-level domain
    "https://joel.house.gov/biography",  # has a path after the host
]
assert all(re.match(regex, candidate) for candidate in should_match)
assert not any(re.match(regex, candidate) for candidate in should_not_match)

# Keep only the links that pass the pattern.
good_urls = [link for link in all_urls if re.match(regex, link)]

print(len(good_urls))  # the count depends on the live page
    

878


    Eliminating duplicate elements by putting them in a set

In [22]:
# Try the approach on a single representative's site first.
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html5lib')

# Use a set because the links might appear multiple times.
# Anchors without an href are skipped: a['href'] would raise KeyError on them.
# This mirrors the has_attr('href') guard used when collecting all_urls.
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}

print(links)  # the exact links depend on the live site
    
    

{'https://jayapal.house.gov/category/press-releases/', 'https://jayapal.house.gov/category/news/'}


In [23]:
    # I don't want this file to scrape all 400+ websites every time it runs.
    # So I'm going to randomly throw out most of the urls.
    # The code in the book doesn't do this.
import random
# Cap the sample size at the number of available URLs: random.sample raises
# ValueError when asked for more items than the population holds.
SAMPLE_SIZE = 5
good_urls = random.sample(good_urls, min(SAMPLE_SIZE, len(good_urls)))
print(f"after sampling, left with {good_urls}")
    

after sampling, left with ['https://degette.house.gov', 'https://bush.house.gov', 'https://newman.house.gov', 'https://wassermanschultz.house.gov/', 'https://bera.house.gov']


In [24]:
from typing import Dict, Set
    
# Maps each representative's site URL to the set of press-release links found on it.
press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    # Skip anchors that have no href: a['href'] would raise KeyError on them
    # and abort the whole crawl.
    pr_links = {a['href']
                for a in soup('a')
                if a.has_attr('href') and 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://degette.house.gov: {'/newsroom/press-releases'}
https://bush.house.gov: {'/media/press-releases'}
https://newman.house.gov: {'/media/press-releases'}
https://wassermanschultz.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://bera.house.gov: {'/media-center/press-releases'}


In [25]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Report whether any <p> element in the HTML ``text`` contains ``keyword``.

    The comparison is case-insensitive and is made against the extracted text
    of each paragraph, so occurrences outside <p> tags do not count.
    """
    soup = BeautifulSoup(text, 'html5lib')
    paragraph_texts = [tag.get_text() for tag in soup('p')]

    for paragraph_text in paragraph_texts:
        if keyword.lower() in paragraph_text.lower():
            return True
    return False


In [26]:
# Minimal document: "Facebook" sits in a heading, "Twitter" in a paragraph.
text = """<body><h1>Facebook</h1><p>Twitter</p>"""

mentions_twitter = paragraph_mentions(text, "twitter")    # is inside a <p>
mentions_facebook = paragraph_mentions(text, "facebook")  # not inside a <p>
assert mentions_twitter
assert not mentions_facebook

mentions_twitter, mentions_facebook

(True, False)

In [27]:
from urllib.parse import urljoin

for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        # The scraped hrefs may be root-relative ("/media/press-releases") or
        # absolute, and house_url may or may not end in "/". Gluing the two
        # together with "/" produces malformed URLs in those cases, so the
        # link is resolved against the site URL with urljoin instead.
        url = urljoin(house_url, pr_link)
        text = requests.get(url).text

        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break  # done with this house_url
            

# Using an API

In [30]:
import requests, json
    
github_user = "kidwithahalo"
endpoint = f"https://api.github.com/users/{github_user}/repos"

# Fail loudly on an HTTP error status. Otherwise `repos` would hold whatever
# the error body decodes to, and the later cells that index each repo by key
# would fail in a confusing place.
response = requests.get(endpoint)
response.raise_for_status()
repos = response.json()  # requests decodes the JSON body itself
repos

[{'id': 401086239,
  'node_id': 'MDEwOlJlcG9zaXRvcnk0MDEwODYyMzk=',
  'name': 'Computational_physics_phy473',
  'full_name': 'kidwithahalo/Computational_physics_phy473',
  'private': False,
  'owner': {'login': 'kidwithahalo',
   'id': 67169034,
   'node_id': 'MDQ6VXNlcjY3MTY5MDM0',
   'avatar_url': 'https://avatars.githubusercontent.com/u/67169034?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/kidwithahalo',
   'html_url': 'https://github.com/kidwithahalo',
   'followers_url': 'https://api.github.com/users/kidwithahalo/followers',
   'following_url': 'https://api.github.com/users/kidwithahalo/following{/other_user}',
   'gists_url': 'https://api.github.com/users/kidwithahalo/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/kidwithahalo/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/kidwithahalo/subscriptions',
   'organizations_url': 'https://api.github.com/users/kidwithahalo/orgs',
   'repos_url': 'https://api.gith

In [32]:
from collections import Counter
from dateutil.parser import parse

# Parse every repository's 'created_at' value into a date/time object.
dates = [parse(repo_info['created_at']) for repo_info in repos]

# Tally how many repositories were created in each month and on each weekday number.
month_count = Counter(created.month for created in dates)
weekly_counts = Counter(created.weekday() for created in dates)

month_count, weekly_counts

(Counter({8: 2, 3: 1, 9: 2}), Counter({6: 2, 3: 1, 0: 2}))

In [36]:
# Sort the repositories by descending 'pushed_at' value and keep the first five.
repos_by_push = sorted(repos, key = lambda r: r['pushed_at'], reverse=True)
last_5_repositories = repos_by_push[:5]

# Language reported for each of those repositories, in the same order.
last_5_languages = [recent['language'] for recent in last_5_repositories]

# Show the third entry of each list as a spot check.
last_5_repositories[2], last_5_languages[2]

({'id': 297250574,
  'node_id': 'MDEwOlJlcG9zaXRvcnkyOTcyNTA1NzQ=',
  'name': 'puzzles',
  'full_name': 'kidwithahalo/puzzles',
  'private': False,
  'owner': {'login': 'kidwithahalo',
   'id': 67169034,
   'node_id': 'MDQ6VXNlcjY3MTY5MDM0',
   'avatar_url': 'https://avatars.githubusercontent.com/u/67169034?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/kidwithahalo',
   'html_url': 'https://github.com/kidwithahalo',
   'followers_url': 'https://api.github.com/users/kidwithahalo/followers',
   'following_url': 'https://api.github.com/users/kidwithahalo/following{/other_user}',
   'gists_url': 'https://api.github.com/users/kidwithahalo/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/kidwithahalo/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/kidwithahalo/subscriptions',
   'organizations_url': 'https://api.github.com/users/kidwithahalo/orgs',
   'repos_url': 'https://api.github.com/users/kidwithahalo/repos',
   'even