<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Getting data

# You can pipe data using stdin and stdout

# egrep.py
import sys, re

# The pattern to search for arrives as the first command-line argument;
# sys.argv[0] holds the name of the program itself, so the regex is at index 1
regex = sys.argv[1]

# scan stdin one line at a time, echoing only the lines the regex matches
for line in sys.stdin:
  if not re.search(regex, line):
    continue                # no match anywhere in the line -- drop it
  sys.stdout.write(line)    # match -- pass the line through to stdout


In [0]:
# line_count.py
# enumerate numbers the stdin lines starting from 1, so once the loop ends
# `count` holds how many lines were read; it stays 0 when stdin is empty
count = 0
for count, line in enumerate(sys.stdin, start=1):
  pass

# print goes to sys.stdout
print(count)

In [0]:
# If I were to break these files out, you could then pipe like:
# type SomeFile.txt | python egrep.py "[0-9]" | python line_count.py

In [0]:
# another potential script
# most_common_words.py
import sys
from collections import Counter

# pass in number of words as first argument
try:
  # This has to be an assignment: a bare annotation (`num_words : ...`)
  # evaluates the expression but never binds the name num_words.
  num_words = int(sys.argv[1])
except (IndexError, ValueError):
  # IndexError: no argument was given; ValueError: it is not an integer
  print("usage: most_common_words.py num_words")
  sys.exit(1) # nonzero exit code indicates error

counter = Counter(word.lower()                      # lowercase words
                  for line in sys.stdin
                  for word in line.strip().split()  # split on whitespace
                  if word)                          # skip empty 'words'

# emit one "count<TAB>word" line per word, most frequent first
for word, count in counter.most_common(num_words):
  sys.stdout.write(f"{count}\t{word}\n")
  

In [0]:
# Then you could do
# cat the_bible.txt | python most_common_words.py 10

In [0]:
# Reading Files
p = """
Some random text to write out
# lets start some lines like this
# and this
"""
# create a small sample file to read back; the with block closes it for us
with open("text.txt", "w+") as text_file:
  text_file.write(p)

# 'r' means read-only, it's assumed if you leave it out
file_for_reading = open('text.txt', 'r')
file_for_reading2 = open('text.txt')

# 'w' is write -- will destroy the file if it already exists!
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append -- for adding to the end of the file
file_for_appending = open('appending_file.txt', 'a')

# don't forget to close your files when you are done -- that means every
# handle opened above, not only the writer, or the others stay open
for opened_file in (file_for_reading, file_for_reading2,
                    file_for_writing, file_for_appending):
  opened_file.close()

In [0]:
# Python will auto close files if you use a with block
# with open('text.txt') as f:
#  data = function_that_get_data_from(f)

# at this point f has already been closed, so don't try to use it
# process(data)

In [0]:
# If you need to read a whole file, you can iterate over the lines of the file
starts_with_hash = 0

with open('text.txt') as f:
  for line in f:                    # walk the file one line at a time
    if not re.match("^#", line):    # use a regex to test for a leading '#'
      continue                      # no leading '#': nothing to count
    starts_with_hash += 1           # leading '#': count this line

print(starts_with_hash)

In [0]:
# Let's get some domain names (although you can trip this particular approach up)
def get_domain(email_address: str) -> str:
  """Return the lowercased text that follows the last '@' in email_address.

  If the address contains no '@', the whole lowercased string is returned.
  """
  _, _, domain = email_address.lower().rpartition("@")
  return domain

# a couple of tests
assert 'gmail.com' == get_domain('joelgrus@gmail.com')
assert 'm.datasciencester.com' == get_domain('joel@m.datasciencester.com')

from collections import Counter

#with open('email_addresses.txt', 'r') as f:
#  domain_counts = Counter(get_domain(line.strip())
#                          for line in f
#                          if "@" in line)

In [0]:
# Delimited files are most common. Edge cases are tough when dealing with tabs
# or spaces or commas, so you shouldn't try to parse these yourself

# Stock prices
stock_prices = """6/20/2014\tAA\t90.91
6/20/2014\tMSFT\t41.68
6/20/2014\tFB\t64.50
"""
print(stock_prices)

# save the sample so there is a tab-delimited file on disk to parse
with open("stock_prices.txt", "w+") as text_file:
  text_file.write(stock_prices)

import csv

# csv.reader does the field splitting; each row arrives as a list of strings
with open('stock_prices.txt') as f:
  tab_reader = csv.reader(f, delimiter='\t')
  for row in tab_reader:
    print(row)
    date, symbol = row[0], row[1]
    closing_price = float(row[2])   # the price is text in the file
    #process(date, symbol, closing_price)


In [0]:
# You can also use a DictReader
# Write a small colon-delimited sample (with a header row) first, so this cell
# has a file to read when the notebook is run top to bottom on a fresh kernel.
colon_delimited_stock_prices = """date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.50
"""
with open('colon_delimted_stock_prices.txt', 'w') as f:
  f.write(colon_delimited_stock_prices)

# DictReader takes its keys from the header row, so each row comes back as a
# dict and fields are looked up by name rather than by position.
with open('colon_delimted_stock_prices.txt') as f:
  colon_reader = csv.DictReader(f, delimiter = ':')
  for dict_row in colon_reader:
    date = dict_row["date"]
    symbol = dict_row["symbol"]
    closing_price = float(dict_row["closing_price"])
    # `process` is a placeholder that this notebook never defines; print the
    # parsed fields instead so the cell runs
    print(date, symbol, closing_price)
    #process(date, symbol, closing_price)

# You can still use DictReader even if your data doesn't have headers, by passing it the keys as the 'fieldnames' parameter

In [0]:
# You can write out data using csv.writer
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

# one "symbol,price" row per dictionary entry, in the dict's iteration order
with open('comma_delimited_stock_prices.txt', 'w') as f:
  csv_writer = csv.writer(f, delimiter = ',')
  for stock in todays_prices:
    price = todays_prices[stock]
    csv_writer.writerow((stock, price))

In [0]:
# Scraping the web. We'll use a couple external packages (i.e., not from scratch)
!pip install beautifulsoup4 requests html5lib

In [0]:
from bs4 import BeautifulSoup
import requests

url = ("https://raw.githubusercontent.com/"
        "joelgrus/data/master/getting-data.html")
print(url)

# download the sample page and parse it with the html5lib parser
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

# soup.p and soup.find('p') are used interchangeably here for the first <p> tag
first_paragraph = soup.p
first_paragraph_text = soup.find('p').text
first_paragraph_words = first_paragraph_text.split()
print(first_paragraph_text, first_paragraph_words)

In [0]:
# You can read a tag's attributes by treating the tag like a dict:
# either index it with the attribute name or call .get() on it
first_paragraph_id = soup.find('p')['id']
first_paragraph_id2 = soup.find('p').get('id')
print(first_paragraph_id, first_paragraph_id2)

In [0]:
# You can get multiple tags at once
all_paragraphs = soup('p') # shorthand for soup.find_all('p')
paragraphs_with_ids = [paragraph
                       for paragraph in soup.find_all('p')
                       if paragraph.get('id')]

In [0]:
# Three ways to pick out the <p> tags that carry the 'important' class
important_paragraphs = soup.find_all('p', {'class' : 'important'})
important_paragraphs2 = soup.find_all('p', 'important')
important_paragraphs3 = [paragraph
                         for paragraph in soup.find_all('p')
                         if 'important' in paragraph.get('class', [])]
print(important_paragraphs, important_paragraphs2, important_paragraphs3)

In [0]:
# Collect every <span> found inside a <div>. Warning: a <span> that sits inside
# multiple <div>s is returned once per enclosing <div>, so be more clever
# (e.g. dedupe) if your page nests them
spans_inside_divs = [inner_span
                     for outer_div in soup.find_all('div')
                     for inner_span in outer_div.find_all('span')]

In [0]:
# Example: Keeping tabs on Congress
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, "html5lib")

# keep the target of every <a> tag that actually carries an href attribute
all_urls = [anchor['href']
            for anchor in soup.find_all('a')
            if 'href' in anchor.attrs]
print(len(all_urls))

In [0]:
# Too many! Let's regex!
import re

# Must start with http:// or https://
# Must end with .house.gov or .house.gov/
# [^/?#]* (rather than .*) keeps the match inside the host name, so a URL on
# some other site whose path or query merely ends in ".house.gov" is rejected
regex = r"^https?://[^/?#]*\.house\.gov/?$"

In [0]:
# Let's write some tests!
# every one of these must be accepted by the pattern ...
assert all(re.match(regex, candidate)
           for candidate in ("http://joel.house.gov",
                             "https://joel.house.gov",
                             "http://joel.house.gov/",
                             "https://joel.house.gov/"))
# ... and every one of these must be rejected
assert not any(re.match(regex, candidate)
               for candidate in ("joel.house.gov",
                                 "http://joel.house.com",
                                 "https://joel.house.gov/biography"))

# And now apply: keep only the harvested URLs that the pattern accepts
good_urls = [candidate for candidate in all_urls if re.match(regex, candidate)]

print(len(good_urls))

In [0]:
# Lots of duplicates, so squeeze the URLs through a set and back into a list
good_urls = [*set(good_urls)]

print(len(good_urls))

In [0]:
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html5lib')

# Use a set because the links might appear multiple times.
# The has_attr check skips <a> tags that have no href; indexing a['href'] on
# such a tag would raise KeyError.
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}

print(links)

In [0]:
# These don't look like relative links, contrary to what the book says, but let's keep going
from typing import Dict, Set

# maps each representative's site URL to the set of hrefs whose link text
# mentions "press releases"
press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
  html = requests.get(house_url).text
  soup = BeautifulSoup(html, 'html5lib')
  # The has_attr check skips <a> tags that have no href; indexing a['href'] on
  # such a tag would raise KeyError and abort the whole loop.
  pr_links = {a['href']
              for a in soup('a')
              if a.has_attr('href') and 'press releases' in a.text.lower()}
  print(f"{house_url}: {pr_links}")
  press_releases[house_url] = pr_links

In [0]:
# Let's see what press releases mention 'data'
def paragraph_mentions(text: str, keyword: str) -> bool:
  """
  Parse text as HTML and report whether any <p> element in it contains
  {keyword}; the comparison ignores case.
  """
  soup = BeautifulSoup(text, 'html5lib')

  for paragraph in soup.find_all('p'):
    if keyword.lower() in paragraph.get_text().lower():
      return True   # one matching paragraph is enough
  return False

# Quick test
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
assert paragraph_mentions(text, "twitter") # is inside a <p>
assert not paragraph_mentions(text, "facebook") # only appears in the <h1>

In [0]:
# And now to process the data
from urllib.parse import urljoin

for house_url, pr_links in press_releases.items():
  for pr_link in pr_links:
    # urljoin resolves a relative link against the site URL and leaves an
    # absolute link untouched; gluing the two with "/" would mangle absolute
    # links and double the slash when either side already has one
    url = urljoin(house_url, pr_link)
    text = requests.get(url).text

    if paragraph_mentions(text, 'data'):
      print(f"{house_url}")
      break # done with this house_url

In [0]:
# Using APIs

# A lot of times we'll be parsing json into Python objects
import json

serialized = """{ "title" : "Data Science Book",
                   "author" : "Joel Grus",
                   "publicationYear" : 2019,
                   "topics" : [ "data", "science", "data science"] }"""

# json.loads turns the JSON text into a Python dict (the JSON array becomes a list)
deserialized = json.loads(serialized)
assert deserialized.get("publicationYear") == 2019
assert any(topic == "data science" for topic in deserialized["topics"])

In [0]:
# Using an unauthenticated API.
import requests, json

github_user = "jamestheengineer"
endpoint = f"https://api.github.com/users/{github_user}/repos"

response = requests.get(endpoint)
# Fail loudly on an HTTP error status (e.g. when the unauthenticated request is
# rejected) instead of parsing the error payload into `repos` and letting the
# cells that iterate over it fail in a confusing way.
response.raise_for_status()
repos = json.loads(response.text)
print(repos)

In [0]:
!pip install python-dateutil

from collections import Counter
from dateutil.parser import parse

# parse each repo's creation timestamp, then tally the results by month
# and by weekday
dates = [parse(repository["created_at"]) for repository in repos]
month_counts = Counter(created.month for created in dates)
weekday_counts = Counter(created.weekday() for created in dates)
print(dates, month_counts, weekday_counts)

In [0]:
# order the repos from most to least recently pushed, then keep the first five
last_5_repos = sorted(repos, key=lambda repo: repo["pushed_at"], reverse=True)
last_5_repos = last_5_repos[:5]

# the "language" field of each of those five repos, in the same order
last_5_languages = [recent_repo["language"] for recent_repo in last_5_repos]

print(last_5_repos, last_5_languages)
