In [None]:
from urllib.request import urlopen

def getSource(url):
    """Fetch ``url`` and return the response body as text.

    The response is used as a context manager, so the connection is closed
    as soon as the body has been read, even if reading or decoding fails.

    The body is decoded with the charset declared in the response's
    Content-Type header; when none is declared, UTF-8 is used (the codec
    that was previously hard-coded).

    Parameters:
        url: address passed straight to ``urlopen``.

    Returns:
        The decoded body as a ``str``.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or invalid URL, and
        ``UnicodeDecodeError`` if the body does not match the charset.
    """
    with urlopen(url) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset)

def news(url, wordlist):
    """Report how often each word of ``wordlist`` occurs in the page at ``url``.

    The page source is fetched with ``getSource`` and searched with
    ``str.count``, so matching is case-sensitive and counts substrings.
    One line per word is printed; nothing is returned.
    """
    # Decoded page source.
    page_text = getSource(url)

    # One report line per requested word, in the order given.
    report_line = '{} appears {} times'
    for term in wordlist:
        print(report_line.format(term, page_text.count(term)))

# Demo: count three topic words in the source of the page at this URL (needs network access).
news('https://bbc.co.uk', ['economy', 'climate', 'education'])

In [None]:
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    """HTML parser that prints the ``href`` value of every ``<a>`` start tag."""

    def handle_starttag(self, tag, attrs):
        """Print each ``href`` attribute value when ``tag`` is an anchor.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; tags other than ``a`` are ignored.
        """
        if tag != 'a':
            return

        for name, value in attrs:
            if name == 'href':
                print(value)

# Read the sample page; the `with` block closes the file even if reading fails.
with open('links.html') as infile:
    content = infile.read()

# Print every link target found in the page.
parser = LinkParser()
parser.feed(content)

In [None]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Print an indented outline of the start and end tags in a document.

    Each start tag is printed at the current indentation and then deepens it
    by four spaces; each end tag first reduces the indentation and is then
    printed at the level of its start tag.

    Void elements (``<br>``, ``<img>``, ``<meta>``, ...) never have an end
    tag, so they are printed without changing the indentation; otherwise
    every one of them would shift the rest of the outline to the right.
    """

    # Elements that cannot have content and therefore have no end tag.
    VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })

    # Current indentation in spaces.
    indent = 0

    def handle_starttag(self, tag, attrs):
        """Print ``tag`` at the current depth and indent its children."""
        print('{}{} start'.format(self.indent*' ', tag))
        if tag not in self.VOID_ELEMENTS:
            self.indent += 4

    def handle_endtag(self, tag):
        """Return to the depth of the matching start tag and print ``tag``."""
        if tag not in self.VOID_ELEMENTS:
            # Clamp at zero so a stray end tag cannot drive the depth negative.
            self.indent = max(self.indent - 4, 0)
        print('{}{} end'.format(self.indent*' ', tag))
        
# Read the sample page; the `with` block closes the file even if reading fails.
with open('w3c.html') as infile:
    content = infile.read()

# Print the tag outline of the page.
myParser = MyHTMLParser()
myParser.feed(content)

In [None]:
from urllib.parse import urljoin
from html.parser import HTMLParser

class Collector(HTMLParser):
    """Gather the https links and the text chunks of an HTML document.

    ``url`` is the address the document came from; it is used as the base
    for resolving the ``href`` values found in ``<a>`` tags.
    """

    def __init__(self, url):
        super().__init__()
        self.url = url
        self.links = []
        self.text = []

    def handle_starttag(self, tag, attrs):
        """Record the absolute form of every ``href`` on an ``<a>`` tag.

        Only targets whose absolute URL starts with ``https`` are kept.
        """
        if tag != 'a':
            return

        for name, value in attrs:
            if name != 'href':
                continue
            target = urljoin(self.url, value)
            if target.startswith('https'):
                self.links.append(target)

    def handle_data(self, data):
        """Store every text chunk reported by the parser."""
        self.text.append(data)

    def getData(self):
        """Return the list of collected text chunks."""
        return self.text

    def getLinks(self):
        """Return the list of collected absolute https links."""
        return self.links

# Imported here so this cell does not depend on another cell having run first.
from urllib.request import urlopen

url = 'https://www.w3.org/Consortium/mission.html'

# The `with` block closes the connection once the body has been read.
with urlopen(url) as resource:
    content = resource.read().decode()

# Collect links and text, then print the links followed by the text chunks.
collector = Collector(url)
collector.feed(content)
for link in collector.getLinks():
    print(link)
for txt in collector.getData():
    print(txt)

In [None]:
from re import findall

def frequency(content):
    """Count how often each word occurs in ``content``.

    A word is a maximal run of ASCII letters; counting is case-sensitive.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        order of first appearance.
    """
    counts = {}

    for token in findall('[a-zA-Z]+', content):
        counts[token] = counts.get(token, 0) + 1

    return counts

# Sample sentence for a quick check of frequency(). The stray backslash
# (an invalid escape sequence) and the "trush" typo have been removed.
cc = 'The pure and simple truth is rarely pure and never simple'
frequency(cc)

In [None]:
from re import findall

# Load the text once; the `with` block closes the file even if reading fails.
with open('frankenstein.txt', 'r') as infile:
    content = infile.read()

# Disabled regex experiments against `content`; uncomment a line to run it.
# print(findall('Frankenstein', content))
# print(findall('[\d]+', content))
# print(findall('[\w]+ible', content))
# print(findall('[A-Z][\w]*y', content))
# print(findall('horror of [a-z][a-z]?', content))
# print(findall('[\w]+[\s]death', content))
# print(findall('[\w\s]*laboratory[\w]*[\s]*\.', content))

In [None]:
from re import findall

# Read the sample page; the `with` block closes the file even if reading fails.
with open('links.html', 'r') as infile:
    content = infile.read()

# Raw strings keep backslashes such as \S and \d intact for the regex engine
# instead of relying on Python passing unknown escape sequences through.
print(findall(r'href="[\S]*"', content))

# Dollar amount such as $5.00 or $1,234.56. The `$` must be escaped (a bare
# `$` is the end-of-string anchor) and the cents need an explicit two-digit
# count (`[\d\d]` is a character class that matches a single digit).
USD = r'\$\d{1,3}(?:,\d{3})*\.\d{2}'
# Two-digit / two-digit / four-digit date; `[\d\d\d\d]` matched one digit only.
date = r'[0-3]\d/[0-1]\d/\d{4}'
email = r'[\w]*@[\w]*\.[\w]*'
# NOTE: this rebinds the module-level name `url` used by other cells.
url = r'http://[\w]*\.?[\w]*\.[\w]+'

In [None]:
from html.parser import HTMLParser

class UlOl(HTMLParser):
    """Re-print the list markup of a document, one ``<li>`` per line.

    ``<ul>``/``<ol>`` start and end tags are printed as they are met. Each
    ``<li>`` is assembled in ``self.li`` (indentation, opening tag, text,
    closing tag) and printed when its end tag arrives. Text is only added
    while exactly one ``<li>`` is open.

    Fixes over the previous version: the line buffer is now really called
    ``li`` (it was declared as ``ul``, so the first ``<li>`` raised
    ``AttributeError``), closing tags are printed with ``/`` rather than a
    backslash, ``<ol>`` is handled like ``<ul>``, and all state lives on the
    instance instead of being shared through class attributes.
    """

    # Tags treated as list containers.
    LIST_TAGS = ('ul', 'ol')

    def __init__(self):
        super().__init__()
        # Pieces of the <li> line currently being assembled.
        self.li = []
        # Indentation, in spaces, applied to <li> lines.
        self.indent = 0
        # Number of <li> elements currently open.
        self.val = 0

    def handle_starttag(self, tag, attrs):
        """Print a list container tag, or start assembling an ``<li>`` line."""
        if tag in self.LIST_TAGS:
            print('<{}>'.format(tag))
        elif tag == 'li':
            self.indent += 4
            self.li.append('{}'.format(' '*self.indent))
            self.li.append('<{}>'.format(tag))
            self.val += 1

    def handle_endtag(self, tag):
        """Print a list container end tag, or flush the finished ``<li>`` line."""
        if tag in self.LIST_TAGS:
            print('</{}>'.format(tag))
        elif tag == 'li':
            self.indent -= 4
            self.li.append('</{}>'.format(tag))
            print(''.join(self.li))
            self.li.clear()
            self.val -= 1

    def handle_data(self, data):
        """Add text to the current line while exactly one ``<li>`` is open."""
        if self.val == 1:
            self.li.append(data)

# Read the sample page; the `with` block closes the file even if reading fails.
with open('w3c.html', 'r') as infile:
    content = infile.read()

# Print the list markup found in the page.
par = UlOl()
par.feed(content)

In [None]:
from html.parser import HTMLParser

class ListCollector(HTMLParser):
    """Collect the item texts of every ``<ul>``/``<ol>`` in a document.

    Text is recorded while exactly one ``<li>`` is open. When a list end
    tag is met, the items gathered so far are stored as one list and a new,
    empty container is started.

    Fixes over the previous version: attributes are accessed through
    ``self`` (the bare names raised ``NameError``), each finished list is
    stored as its own object (the old code appended the container and then
    cleared that same object), and the lists live on the instance rather
    than being shared by every parser through class attributes.
    """

    def __init__(self):
        super().__init__()
        # Finished lists, one inner list of texts per <ul>/<ol>.
        self.listOfList = []
        # Texts of the list currently being read.
        self.listContainer = []
        # Number of <li> elements currently open.
        self.val = 0

    def handle_starttag(self, tag, attrs):
        """Track entry into an ``<li>`` element."""
        if tag == 'li':
            self.val += 1

    def handle_endtag(self, tag):
        """Store the finished list on ``</ul>``/``</ol>``; track ``</li>``."""
        if tag == 'ul' or tag == 'ol':
            self.listOfList.append(self.listContainer)
            # Start a new container instead of clearing the stored one.
            self.listContainer = []

        elif tag == 'li':
            self.val -= 1

    def handle_data(self, data):
        """Record text while exactly one ``<li>`` is open."""
        if self.val == 1:
            self.listContainer.append(data)

    def getLists(self):
        """Return the collected lists, in document order."""
        return self.listOfList
    
# Read the sample page; the `with` block closes the file even if reading fails.
with open('w3c.html', 'r') as infile:
    content = infile.read()

# Use the parser defined in this cell (the previous code instantiated UlOl
# by mistake) and show the collected lists as the cell's result.
par = ListCollector()
par.feed(content)
par.getLists()

In [None]:
from re import findall

def scary(filename, outfilename='dictionary.txt'):
    """Write the sorted, de-duplicated, lower-cased words of a text file.

    The file is read, lower-cased and split into runs of word characters
    (``\\w``). Each distinct word is printed and written on its own line,
    in sorted order.

    Parameters:
        filename: path of the text file to read.
        outfilename: path of the word list to write; defaults to
            ``'dictionary.txt'``, the previously hard-coded name.

    Both files are opened in ``with`` blocks so they are closed even when
    an error occurs, and duplicates are removed with a set rather than by
    repeated list membership tests.
    """
    with open(filename, 'r') as infile:
        content = infile.read()

    # Unique words, sorted.
    wordList = sorted(set(findall(r'[\w]+', content.lower())))

    with open(outfilename, 'w') as outfile:
        for word in wordList:
            print(word)
            outfile.write(word+'\n')
        
    
    

# Build the word list for frankenstein.txt; scary() prints it and writes it to dictionary.txt.
scary('frankenstein.txt')

In [None]:
from urllib.request import urlopen
from html.parser import HTMLParser

class ContentParser(HTMLParser):
    """Print the text found inside paragraphs, headings and links.

    ``val`` counts how many of the tracked tags are currently open; text is
    printed only while exactly one of them is open, and only when it
    contains something other than whitespace.

    The previous filter, ``data != '' != '\\n'``, was a chained comparison
    equivalent to ``data != '' and '' != '\\n'``, so newline-only chunks
    were still printed.
    """

    # Tags whose text counts as page content.
    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a')

    # Number of tracked tags currently open.
    val = 0

    def handle_starttag(self, tag, attrs):
        """Note that a tracked tag has been opened."""
        if tag in self.CONTENT_TAGS:
            self.val += 1

    def handle_endtag(self, tag):
        """Note that a tracked tag has been closed."""
        if tag in self.CONTENT_TAGS:
            self.val -= 1

    def handle_data(self, data):
        """Print non-blank text while exactly one tracked tag is open."""
        if self.val == 1 and data.strip():
            print(data)

def getSource(url):
    """Fetch ``url`` and return the response body as text.

    The response is used as a context manager, so the connection is closed
    as soon as the body has been read, even if reading or decoding fails.

    The body is decoded with the charset declared in the response's
    Content-Type header; when none is declared, UTF-8 is used (the codec
    that was previously hard-coded).

    Parameters:
        url: address passed straight to ``urlopen``.

    Returns:
        The decoded body as a ``str``.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or invalid URL, and
        ``UnicodeDecodeError`` if the body does not match the charset.
    """
    with urlopen(url) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset)

def getContent(url):
    """Download the page at ``url`` and run it through a ``ContentParser``.

    The parser prints the text it selects; nothing is returned.
    """
    page_source = getSource(url)
    ContentParser().feed(page_source)

# Demo: print the text ContentParser selects from the page at this URL (needs network access).
getContent('https://www.nytimes.com')

In [None]:
from urllib.request import urlopen
from re import findall

def emails(url):
    """Return the e-mail addresses found in the page at ``url``.

    The response is closed as soon as the body has been read. The body is
    decoded with the charset declared in the Content-Type header, falling
    back to UTF-8 when none is declared.

    An address is a local part of word characters, dots, ``+`` or ``-``,
    then ``@``, then a domain of at least two dot-separated labels. Unlike
    the previous pattern this keeps dotted local parts and domains with any
    number of labels, and it excludes a sentence-ending dot after the
    address and strings with nothing after the ``@``.

    Parameters:
        url: address passed straight to ``urlopen``.

    Returns:
        A list of matched addresses in order of appearance; duplicates are
        kept.
    """
    with urlopen(url) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        content = response.read().decode(charset)
    return findall(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', content)

# Demo: show the addresses emails() finds on the page at this URL (needs network access).
urll = 'http://www.cdm.depaul.edu'
emails(urll)

In [1]:
from urllib.request import urlretrieve

# Download the resource at this URL into the local file google.html (needs network access).
urlretrieve('https://google.com', 'google.html')

('google.html', <http.client.HTTPMessage at 0x7fae743ece50>)