# Chapter 9 - Getting Data

### Import Packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, re

### stdin & stdout

In [3]:
# Pattern to filter on, read from the first command-line argument.
# NOTE(review): raises IndexError when no argument is supplied; presumably this
# cell is meant to be run as a script rather than inside a kernel — confirm.
regex = sys.argv[1]

In [4]:
# Echo to stdout only those stdin lines in which `regex` matches somewhere.
sys.stdout.writelines(line for line in sys.stdin if re.search(regex, line))

In [5]:
# Tally how many lines arrive on stdin, then report the total.
count = sum(1 for _ in sys.stdin)

print(count)

0


In [6]:
from collections import Counter

In [8]:
# Print the `num_words` most frequent words read from stdin, one per line,
# formatted as "<count>\t<word>".
try:
    num_words = int(sys.argv[1])
except (IndexError, ValueError):
    # IndexError: no argument given; ValueError: argument is not an integer.
    # A bare `except:` would also trap KeyboardInterrupt and SystemExit.
    print("usage: most_common_words.py num_words")
    sys.exit(1)

# Lowercase so that differently-cased spellings are counted together.
# str.split() with no arguments already ignores surrounding whitespace and
# never yields empty strings, so no extra strip/filter is needed.
counter = Counter(word.lower()
                  for line in sys.stdin
                  for word in line.split())

for word, count in counter.most_common(num_words):
    sys.stdout.write(f"{count}\t{word}\n")

### Reading Files 

### The Basics of Text Files

In [11]:
# 'r' means read-only; it is also the mode used when none is given.
file_for_reading = open('reading_file.txt', 'r')
file_for_reading2 = open('reading_file.txt')

# 'w' is write: an existing file of that name is truncated.
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append: writes are added to the end of the file.
file_for_appending = open('appending_file.txt', 'a')

# Close every handle opened above (not only the writer) so none is leaked.
file_for_reading.close()
file_for_reading2.close()
file_for_writing.close()
file_for_appending.close()

# Preferred pattern: a `with` block closes the file automatically on exit.
# (`filename`, `function_that_gets_data_from` and `process` are not defined
# in the visible code.)
with open(filename) as source:
    data = function_that_gets_data_from(source)

process(data)

# Count the lines of input.txt that begin with '#', iterating line by line
# rather than loading the whole file into memory.
starts_with_hash = 0

with open('input.txt') as f:
    for line in f:
        # Equivalent to re.match("^#", line), without needing a regex.
        if line.startswith("#"):
            starts_with_hash += 1

In [13]:
def get_domain(email_address: str) -> str:
    """Return the lowercased text after the last '@' in *email_address*.

    If the string contains no '@', the whole lowercased string is returned.
    """
    return email_address.lower().split("@")[-1]

# The domain keeps its top-level part, so the expected value is 'gmail.com'.
assert get_domain('joelgrus@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

In [14]:
# Tally the domain of every line in email_address.txt that contains an '@'.
with open('email_address.txt', 'r') as address_file:
    domain_counts = Counter()
    for raw_line in address_file:
        if "@" in raw_line:
            domain_counts[get_domain(raw_line.strip())] += 1

### Delimited Files

In [15]:
import csv

In [17]:
# Read the tab-delimited file row by row and hand each
# (date, symbol, closing price) triple to `process`.
with open('tab_delimited_stock_prices.txt') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        date, symbol = row[0], row[1]
        closing_price = float(row[2])  # third column is parsed as a float
        process(date, symbol, closing_price)

In [18]:
# NOTE(review): 'APPL' looks like a typo for the ticker 'AAPL' — left
# unchanged here; confirm before relying on the output.
todays_prices = {'APPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

# Write one "symbol,price" row per entry. newline='' lets the csv module
# control line endings itself, as its documentation asks for writer files.
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

## Scraping The Web

### HTML & The Parsing Thereof

In [19]:
from bs4 import BeautifulSoup
import requests

In [20]:
# Download the sample page and parse it into a BeautifulSoup tree using the
# html5lib parser.
url = ("https://raw.githubusercontent.com/"
       "joelgrus/data/master/getting-data.html")
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html5lib')

In [21]:
# find('p') returns the first <p> tag in the parsed document (None if absent).
first_paragraph = soup.find('p')

In [22]:
# Text content of the first <p> tag, and that same text split on whitespace.
first_paragraph_text = soup.p.text
first_paragraph_words = first_paragraph_text.split()

In [23]:
# Indexing a tag like a dict raises KeyError when the attribute is missing...
first_paragraph_id = soup.p['id']
# ...whereas .get() returns None in that case.
first_paragraph_id2 = soup.p.get('id')

In [24]:
# Every <p> tag in the document, with no class filter; calling the soup
# directly, as on the next line, is shorthand for find_all().
all_paragraphs = soup.find_all('p')
# Only the <p> tags that carry a non-empty id attribute.
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]

In [25]:
# Three ways to select the <p> tags whose class is "important":
# 1) filter with an attribute dictionary,
important_paragraphs = soup('p', {'class': 'important'})
# 2) pass the CSS class as the second positional argument,
important_paragraphs2 = soup('p','important')
# 3) check each tag's class list by hand (tags without a class yield []).
important_paragraphs3 = [p for p in soup('p')
                         if 'important' in p.get('class',[])]

In [26]:
# Collect every <span> found inside any <div>. A span nested in several divs
# is collected once per enclosing div, exactly as a nested comprehension would.
spans_inside_divs = []
for div in soup('div'):
    spans_inside_divs.extend(div('span'))

### Using APIs

### JSON & XML

In [29]:
import json

# JSON text describing a book: title, author, publication year and topics.
serialized = """{ "title" : "Data Science Book",
"author" : "Joel Grus",
"publicationYear" : 2019,
"topics" : ["data", "science", "data science"]}"""

In [30]:
# Parse the JSON text into Python objects and spot-check two of its fields.
deserialized = json.loads(serialized)
assert deserialized["publicationYear"] == 2019
assert "data science" in deserialized["topics"]

### Using an Unauthenticated API

In [31]:
import requests, json

In [32]:
# Fetch the public repositories of `github_user` from the GitHub REST API.
github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"

response = requests.get(endpoint)
# Fail loudly on an HTTP error (for example a rate-limit response) instead of
# parsing the error body and breaking later code that expects a list of repos.
response.raise_for_status()
repos = json.loads(response.text)

In [33]:
from collections import Counter
from dateutil.parser import parse

In [35]:
# Parse each repository's "created_at" value once, keeping the parsed dates
# and tallying how many fall in each month and on each weekday.
dates = []
month_counts = Counter()
weekday_counts = Counter()
for repo in repos:
    created = parse(repo["created_at"])
    dates.append(created)
    month_counts[created.month] += 1
    weekday_counts[created.weekday()] += 1

In [36]:
# Order repositories by their "pushed_at" value, largest first, and keep the
# first five.
most_recently_pushed = sorted(repos,
                              key=lambda repo: repo["pushed_at"],
                              reverse=True)
last_5_repositories = most_recently_pushed[:5]

# The "language" field of each of those five repositories, in the same order.
last_5_languages = [entry["language"] for entry in last_5_repositories]

### Finding APIs

#### Using the Twitter APIs
#### Getting Credentials

In [37]:
import os

In [None]:
# CONSUMER_KEY = os.environ.get("")
# CONSUMER_SECRET = os.environ.get("")

In [43]:
import webbrowser
from twython import Twython

In [None]:
# temp_client = Twython(CONSUMER_KEY, CONSUMER_SECRET)
# temp_creds = temp_client.get_authentication_tokens()
# url = temp_creds['auth_url']

In [None]:
# print((f"go visit {url} and get the PIN code and paste it below"))
# webbrowser.open(url)
# PIN_CODE = input("Please enter the PIN code: ")