<b>Getting Data</b>

In [1]:
# Script that reads lines of text from stdin and writes back out the ones
# that match a regular expression given as the first command-line argument.
import sys, re
regex = sys.argv[1]
for line in sys.stdin:
    # re.search matches anywhere in the line, not only at its start.
    if re.search(regex, line):
        # Lines read from stdin keep their trailing newline, so write() is
        # used (rather than print) to avoid adding a second one.
        sys.stdout.write(line)

In [2]:
import sys
# Tally the lines arriving on stdin by summing one per line rather than
# bumping a counter by hand.
count = sum(1 for _ in sys.stdin)
print(count)

0


<b>Working with some sample text</b>

In [3]:
# content from  - https://en.wikipedia.org/wiki/Text_(literary_theory)
# Echo the file one line at a time. Every line keeps its own trailing newline
# and print() appends another, so the output comes out double-spaced.
with open("sometext.txt") as text_file:
    for text_line in text_file:
        print(text_line)

In literary theory, a text is any object that can be "read", whether this object is a work of literature, a street sign, an arrangement of buildings on a city block, or styles of clothing. It is a coherent set of signs that transmits some kind of informative message.[1] This set of signs is considered in terms of the informative message's content, rather than in terms of its physical form or the medium in which it is represented.



Within the field of literary criticism, "text" also refers to the original information content of a particular piece of writing; that is, the "text" of a work is that primal symbolic arrangement of letters as originally composed, apart from later alterations, deterioration, commentary, translations, paratext, etc. Therefore, when literary criticism is concerned with the determination of a "text", it is concerned with the distinguishing of the original information content from whatever has been added to or subtracted from that content as it appears in a give

In [4]:
# Count the lines of sometext.txt that begin with a capital "A".
# (`re` is imported in the first cell of the notebook.)
with open("sometext.txt") as text_file:
    starts_with_A = sum(
        1 for text_line in text_file if re.match("^A", text_line)
    )

print(starts_with_A)

0


In [5]:
# Count the lines of sometext.txt that begin with a capital "I".
# (`re` is imported in the first cell of the notebook.)
with open("sometext.txt") as text_file:
    starts_with_I = sum(
        1 for text_line in text_file if re.match("^I", text_line)
    )

print(starts_with_I)

1


In [6]:
def get_domain(email:str) -> str:
    """Return the lower-cased text that follows the last '@' in `email`.

    When `email` contains no '@' at all, the whole lower-cased string is
    returned unchanged.
    """
    _, _, domain = email.lower().rpartition("@")
    return domain
email = "data_Science@gmail.com"
get_domain(email)

'gmail.com'

In [7]:
from collections import Counter
# Tally how often each e-mail domain appears in emails.txt.
with open('emails.txt','r') as email_file:
    # Skip lines that cannot be addresses (no "@"), strip surrounding
    # whitespace from the rest, then reduce each address to its domain.
    address_lines = filter(lambda text: "@" in text, email_file)
    domain_counts = Counter(map(get_domain, map(str.strip, address_lines)))
    print(domain_counts)

Counter({'mail.com': 1, 'gmail.com': 1, '123_mail.com': 1, 'science.com': 1})


<b>Delimited Files</b>

In [16]:
import csv

# The csv module docs say files handed to csv.reader should be opened with
# newline='' so the reader itself interprets line endings, including
# newlines embedded inside quoted fields.
with open('tab_delimited_stock_prices.txt', newline='') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        # Each row is a list of strings in the order: date, symbol, price.
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        print(date,symbol,closing_price)

6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34


In [18]:
# The csv module docs say files handed to a csv reader should be opened with
# newline='' so the reader itself interprets line endings, including
# newlines embedded inside quoted fields.
with open('colon_delimited_stock_prices.txt', newline='') as f:
    # DictReader takes the field names from the file's first row, so each
    # subsequent row is exposed as a mapping keyed by column header.
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        print(date, symbol, closing_price)
        print(dict_row)

6/20/2014 AAPL 90.91
OrderedDict([('date', '6/20/2014'), ('symbol', 'AAPL'), ('closing_price', '90.91')])
6/20/2014 MSFT 41.68
OrderedDict([('date', '6/20/2014'), ('symbol', 'MSFT'), ('closing_price', '41.68')])
6/20/2014 FB 64.5
OrderedDict([('date', '6/20/2014'), ('symbol', 'FB'), ('closing_price', '64.5')])


In [19]:
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5 }
# Open with newline='' as the csv module docs require: csv.writer emits its
# own '\r\n' row terminator, and without newline='' text-mode translation on
# Windows turns that into '\r\r\n', leaving a blank row after every record.
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    # One "symbol,price" row per stock.
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

<b>Scraping the Web</b>

In [23]:
from bs4 import BeautifulSoup
import requests

In [26]:
url = ("https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html")
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error status instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

In [28]:
# Grab the first <p> tag found in the parsed document and show its markup.
first_paragraph = soup.find('p')
print(first_paragraph)

<p id="p1">This is the first paragraph.</p>


In [29]:
# Read the first paragraph's text once, then split that same string on
# whitespace to get its words.
first_paragraph_text = soup.p.text
first_paragraph_words = first_paragraph_text.split()
# The empty string in the middle yields the blank separator line.
print(first_paragraph_text, "", first_paragraph_words, sep="\n")

This is the first paragraph.

['This', 'is', 'the', 'first', 'paragraph.']


In [30]:
# Two ways to read a tag attribute: indexing the tag like a dict, or calling
# its .get() method. Both give the same value for this paragraph.
first_paragraph_id = soup.p['id']
first_paragraph_id_2 = soup.p.get('id')
# The empty string in the middle yields the blank separator line.
print(first_paragraph_id, "", first_paragraph_id_2, sep="\n")

p1

p1


In [31]:
# Collect every <p> tag in the parsed document and show them all.
all_paragraphs = soup.find_all('p')
print(all_paragraphs)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]


In [33]:
# Keep only the <p> tags that carry a non-empty id attribute.
paragraphs_with_id = [
    tag
    for tag in soup.find_all('p')
    if tag.get('id')
]
print(paragraphs_with_id)

[<p id="p1">This is the first paragraph.</p>]


In [35]:
# Three ways to select the <p> tags whose class is "important":
# an attribute dict, ...
important_paragraphs = soup('p', {'class' : 'important'})
# ... a bare class name as the second argument, ...
important_paragraphs2 = soup('p', 'important')
# ... or filtering by hand on each tag's class list.
important_paragraphs3 = [
    tag
    for tag in soup('p')
    if 'important' in tag.get('class', [])
]
# Print each result followed by a blank line.
print(important_paragraphs, important_paragraphs2, important_paragraphs3,
      sep="\n\n", end="\n\n")

[<p class="important">This is the second paragraph.</p>]

[<p class="important">This is the second paragraph.</p>]

[<p class="important">This is the second paragraph.</p>]



In [36]:
# Gather every <span> nested anywhere inside a <div>: the outer loop walks the
# divs, the inner one walks the spans found within each div.
spans_inside_divs = [
    inner_span
    for outer_div in soup.find_all('div')
    for inner_span in outer_div.find_all('span')
]

<b>Example: Web Scraping</b>

In [38]:
from bs4 import BeautifulSoup
import requests
url = "https://www.house.gov/representatives"
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error status instead of silently scraping an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, "html.parser")
# Collect the target of every anchor that actually has an href attribute.
all_urls = [
    a['href']
    for a in soup('a')
    if a.has_attr('href')
]
print(len(all_urls))

966


In [48]:
import re
import pandas as pd
# Keep only links that start with http:// or https:// and end in .house.gov,
# optionally followed by a single trailing slash.
regex = r"https?://.*\.house\.gov/?$"
good_urls = list(filter(re.compile(regex).match, all_urls))
print(len(good_urls))

872


In [49]:
# Drop duplicate URLs. The set removes repeats; sorting the result gives a
# stable order, whereas plain list(set(...)) can come out in a different
# order on every kernel restart because string hashing is randomized.
good_urls = sorted(set(good_urls))
print(len(good_urls))

436
