<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Getting data

# You can pipe data using stdin and stdout

# egrep.py
import sys, re

# sys.argv holds the command-line arguments:
#   sys.argv[0] is the script's own name
#   sys.argv[1] is the regex supplied by the caller
regex = sys.argv[1]

# copy to stdout only those stdin lines that the regex matches somewhere
for line in sys.stdin:
  if re.search(regex, line) is None:
    continue  # no match anywhere in this line -- drop it
  sys.stdout.write(line)


In [0]:
# line_count.py
# tally the lines arriving on stdin (0 when stdin is empty)
count = sum(1 for _ in sys.stdin)

# print writes to sys.stdout
print(count)

In [0]:
# If these two scripts were saved as separate files (egrep.py and line_count.py), you could pipe them together like this:
# type SomeFile.txt | python egrep.py "[0-9]" | python line_count.py

In [0]:
# another potential script
# most_common_words.py
import sys
from collections import Counter

# pass in number of words as first argument
try:
  # this must be an assignment (=), not an annotation (:), so that
  # num_words is actually bound for the most_common() call below
  num_words = int(sys.argv[1])
except (IndexError, ValueError):
  # IndexError: no argument was given; ValueError: it is not an integer
  print("usage: most_common_words.py num_words")
  sys.exit(1) # nonzero exit code indicates error

counter = Counter(word.lower()                      # lowercase words
                  for line in sys.stdin
                  for word in line.strip().split()  # split on whitespace
                  if word)                          # skip empty 'words'

# write one "count<TAB>word" row per word, most frequent first
for word, count in counter.most_common(num_words):
  sys.stdout.write(str(count))
  sys.stdout.write("\t")
  sys.stdout.write(word)
  sys.stdout.write("\n")
  

In [0]:
# Then you could do
# cat the_bible.txt | python most_common_words.py 10

In [0]:
# Reading Files
p = """
Some random text to write out
# lets start some lines like this
# and this
"""
# create the sample file; the with block closes it even if write() raises
with open("text.txt", "w+") as text_file:
  text_file.write(p)

# 'r' means read-only, it's assumed if you leave it out
file_for_reading = open('text.txt', 'r')
file_for_reading2 = open('text.txt')

# 'w' is write -- will destroy the file if it already exists!
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append -- for adding to the end of the file
file_for_appending = open('appending_file.txt', 'a')

# don't forget to close your files when you are done -- every handle opened
# above is released here, not just the one opened for writing
file_for_reading.close()
file_for_reading2.close()
file_for_writing.close()
file_for_appending.close()

In [0]:
# Python will auto close files if you use a with block
# with open('text.txt') as f:
#  data = function_that_get_data_from(f)

# at this point f has already been closed, so don't try to use it
# process(data)

In [0]:
# Iterating over an open file yields its lines one at a time, so a whole
# file can be scanned with a plain for loop
starts_with_hash = 0

with open('text.txt') as f:
  for line in f:                        # visit every line of the file
    if re.match("^#", line) is None:    # regex check: line does not start with '#'
      continue
    starts_with_hash += 1               # line starts with '#', so count it

print(starts_with_hash)