<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Getting data

# You can pipe data using stdin and stdout

# egrep.py
import sys, re

# sys.argv is the list of command-line arguments
# sys.argv[0] is the name of the program itself
# sys.argv[1] will be the regex specified at the command line
regex = sys.argv[1]

# Lazily select the stdin lines in which the pattern occurs anywhere;
# nothing is read until the generator is consumed below.
matching_lines = (line for line in sys.stdin if re.search(regex, line))

# Stream the selected lines to stdout unchanged (each keeps its own newline).
sys.stdout.writelines(matching_lines)


In [0]:
# line_count.py
# Tally one for every line that arrives on stdin
count = sum(1 for _ in sys.stdin)

# print writes the total to sys.stdout
print(count)

In [0]:
# If I were to break these files out, you could then pipe like:
# type SomeFile.txt | python egrep.py "[0-9]" | python line_count.py

In [0]:
# another potential script
# most_common_words.py
import sys
from collections import Counter

# pass in number of words as first argument
try:
  # '=' (not ':') so that num_words is actually bound; with ':' the line is
  # only an annotation and the loop below fails with a NameError.
  num_words = int(sys.argv[1])
except (IndexError, ValueError):
  # IndexError: no argument was given; ValueError: it is not an integer
  print("usage: most_common_words.py num_words")
  sys.exit(1) # nonzero exit code indicates error

counter = Counter(word.lower()                      # lowercase words
                  for line in sys.stdin
                  for word in line.strip().split()  # split on whitespace
                  if word)                          # skip empty 'words'

# one "count<TAB>word" line per entry, most frequent first
for word, count in counter.most_common(num_words):
  sys.stdout.write(f"{count}\t{word}\n")
  

In [0]:
# Then you could do
# cat the_bible.txt | python most_common_words.py 10

In [0]:
# Reading Files
p = """
Some random text to write out
# lets start some lines like this
# and this
"""
# Create the sample file used by the examples below; the with block closes
# it even if the write fails.
with open("text.txt", "w+") as text_file:
  text_file.write(p)

# 'r' means read-only, it's assumed if you leave it out
file_for_reading = open('text.txt', 'r')
file_for_reading2 = open('text.txt')

# 'w' is write -- will destroy the file if it already exists!
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append -- for adding to the end of the file
file_for_appending = open('appending_file.txt', 'a')

# don't forget to close your files when you are done -- every handle opened
# above, not just the one opened for writing
file_for_reading.close()
file_for_reading2.close()
file_for_writing.close()
file_for_appending.close()

In [0]:
# Python will auto close files if you use a with block
# with open('text.txt') as f:
#  data = function_that_get_data_from(f)

# at this point f has already been closed, so don't try to use it
# process(data)

In [0]:
# If you need to read a whole file, you can iterate over the lines of the file
# Running total of the lines in text.txt that begin with '#'
starts_with_hash = 0

with open('text.txt') as f:
  for line in f:                       # the file object yields one line at a time
    if re.match("^#", line) is None:   # use a regex anchored at the line start
      continue                         # no leading '#': nothing to count
    starts_with_hash += 1

print(starts_with_hash)

In [0]:
# Let's get some domain names (although you can trip this particular approach up)
def get_domain(email_address: str) -> str:
  """Return the lowercased text that follows the final '@'.

  An address that contains no '@' is returned whole (lowercased).
  """
  _local_part, _at_sign, domain = email_address.lower().rpartition("@")
  return domain

# quick sanity checks
assert get_domain('joelgrus@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

from collections import Counter

#with open('email_addresses.txt', 'r') as f:
#  domain_counts = Counter(get_domain(line.strip())
#                          for line in f
#                          if "@" in line)

In [0]:
# Delimited files are most common. Edge cases are tough when dealing with tabs
# or spaces or commas, so you shouldn't try to parse these yourself

# Stock prices: three tab-separated rows of date, symbol, closing price
stock_prices = """6/20/2014\tAA\t90.91
6/20/2014\tMSFT\t41.68
6/20/2014\tFB\t64.50
"""
print(stock_prices)

# Write the sample data to disk; the with block closes the file even if the
# write fails.
with open("stock_prices.txt", "w+") as text_file:
  text_file.write(stock_prices)

import csv

# The csv module asks for files to be opened with newline='' so that it can
# do its own line-ending handling.
with open('stock_prices.txt', newline='') as f:
  tab_reader = csv.reader(f, delimiter='\t')
  for row in tab_reader:
    print(row)                      # each row is a list of strings
    date = row[0]
    symbol = row[1]
    closing_price = float(row[2])   # prices arrive as text, convert to float
    #process(date, symbol, closing_price)


In [0]:
# You can also use a DictReader: the first line of the file supplies the keys
# and every later line becomes a dict keyed by them.

# Create the sample file first so this cell runs on its own, the same way the
# tab-delimited example writes its own input. (File name spelling kept as-is.)
colon_delimited_stock_prices = """date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
"""
with open('colon_delimted_stock_prices.txt', 'w') as f:
  f.write(colon_delimited_stock_prices)

# The csv module asks for files to be opened with newline=''
with open('colon_delimted_stock_prices.txt', newline='') as f:
  colon_reader = csv.DictReader(f, delimiter = ':')
  for dict_row in colon_reader:
    date = dict_row["date"]
    symbol = dict_row["symbol"]
    closing_price = float(dict_row["closing_price"])  # text -> float
    # process is a placeholder that is not defined in this notebook, so the
    # call stays commented out as in the tab-delimited example
    #process(date, symbol, closing_price)

# You can still use DictReader even if your data doesn't have headers by passing it the keys as the 'fieldnames' parameter

In [0]:
# You can write out data using csv.writer
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

# newline='' lets the csv module control row endings itself; without it,
# platforms that translate '\n' on write (Windows) emit '\r\r\n' after each row.
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
  csv_writer = csv.writer(f, delimiter = ',')
  for stock, price in todays_prices.items():
    csv_writer.writerow([stock, price])   # one "symbol,price" row per stock

In [0]:
# Scraping the web. We'll use a couple external packages (i.e., not from scratch)
!pip install beautifulsoup4 requests html5lib

In [0]:
from bs4 import BeautifulSoup
import requests

url = ("https://raw.githubusercontent.com/"
        "joelgrus/data/master/getting-data.html")
print(url)

# Without a timeout requests can wait indefinitely on an unresponsive server,
# and without raise_for_status an HTTP error page would be parsed as if it
# were the document we asked for.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

# soup.p is shorthand for soup.find('p'): the first <p> tag in the document
first_paragraph = soup.find('p')
first_paragraph_text = soup.p.text           # the tag's text content
first_paragraph_words = soup.p.text.split()  # ...split on whitespace
print(first_paragraph_text, first_paragraph_words)

In [0]:
# You can read a tag's attributes by treating the tag like a dict
first_p = soup.p
first_paragraph_id = first_p['id']        # subscript lookup of the attribute
first_paragraph_id2 = first_p.get('id')   # the same attribute through .get()
print(first_paragraph_id, first_paragraph_id2)

In [0]:
# You can get multiple tags at once
all_paragraphs = soup('p') # shorthand for soup.find_all('p')
paragraphs_with_ids = [para
                       for para in soup.find_all('p')
                       if para.get('id')]

In [0]:
# Three ways to find the <p> tags whose class is 'important'
important_paragraphs = soup.find_all('p', {'class' : 'important'})  # attribute dict
important_paragraphs2 = soup.find_all('p', 'important')             # class as 2nd argument
important_paragraphs3 = [ para for para in soup.find_all('p')       # filter by hand
                          if 'important' in para.get('class', [])]
print(important_paragraphs, important_paragraphs2, important_paragraphs3)

In [0]:
# Every <span> found inside a <div>. Warning: a <span> nested in several
# <div>s is returned once per enclosing <div>, so de-duplicate if that matters.
spans_inside_divs = [inner_span
                     for outer_div in soup.find_all('div')
                     for inner_span in outer_div.find_all('span')]

In [0]:
# Example: Keeping tabs on Congress