## Get a Web Page with Persistent or Non-Persistent HTTP
### by fetching the base HTML content and all the images it references

### Let's start with OOP

In [1]:
from urllib.request import urlparse
import socket

class Request:
    """A minimal HTTP/1.1 GET request for a plain ``http://`` URL.

    Attributes set by the constructor:
        server:  ``(hostname, port)`` tuple; port defaults to 80.
        path:    request target, i.e. the URL path plus ``?query`` if any.
        headers: dict of request headers; always contains ``Host``.
    """

    def __init__(self, url, headers=None):
        """Create Request object from url.

        Args:
            url: absolute ``http://`` URL.
            headers: optional dict of extra headers merged over the defaults.

        Raises:
            NotImplementedError: if the URL scheme is not ``http``.
            ValueError: if the URL has no host part.
        """
        r = urlparse(url)
        if r.scheme != 'http':
            raise NotImplementedError(r.scheme)
        if not r.hostname:
            raise ValueError('URL has no host: {!r}'.format(url))
        port = r.port if r.port else 80
        self.server = r.hostname, port
        # The request target must never be empty: "http://host" means "/".
        path = r.path or '/'
        self.path = path + '?' + r.query if r.query else path
        # The Host header carries the port whenever it is not the default.
        host = r.hostname if port == 80 else '{}:{}'.format(r.hostname, port)
        self.headers = {'Host': host}
        if headers:
            self.add_headers(headers)

    def add_headers(self, headers):
        """Add new headers of dict type; existing keys are overwritten."""
        self.headers.update(headers)

    def build(self):
        """Build the request message.

        Returns:
            The request line and headers, CRLF-separated and terminated by an
            empty line, encoded as UTF-8 bytes.
        """
        lines = ['GET {} HTTP/1.1'.format(self.path)]
        lines.extend('{}: {}'.format(key, value)
                     for key, value in self.headers.items())
        message = '\r\n'.join(lines) + '\r\n\r\n'
        return message.encode('utf-8')


class Connection:
    """An HTTP connection: a socket plus a binary file-like reader over it."""

    def __init__(self, server, sock, infile):
        """Session open by http_open function.

        Args:
            server: ``(hostname, port)`` tuple this connection talks to.
            sock: the socket used for sending.
            infile: binary file-like object used for reading responses.
        """
        self.server, self.sock, self.infile = server, sock, infile

    def close(self):
        """Close the reader and the socket.

        Both ``close`` calls are no-ops on already-closed objects, so this
        method may be called more than once.
        """
        self.infile.close()
        self.sock.close()

    def send(self, request):
        """Build ``request`` and send the whole message over the socket."""
        message = request.build()
        print('Sending request:', message, sep='\n')
        return self.sock.sendall(message)

    def get_headers(self):
        """Parse HTTP response message and get status and headers from it.

        Returns:
            ``(status, headers)`` where ``status`` is the status code as a
            string and ``headers`` is a dict keyed by the header names exactly
            as the server sent them.

        Raises:
            ConnectionError: if no usable status line could be read (for
                example because the peer closed the connection).
        """

        def parse_headers(file):
            """Extract headers as a dict, stopping at the blank line."""
            headers = {}
            for line in file:
                header = line.decode('iso-8859-1').strip()
                if not header:  # end of headers (CRLF or bare LF)
                    break
                key, sep, value = header.partition(':')
                if not sep:  # malformed line without a colon: ignore it
                    continue
                headers[key.strip()] = value.strip()
            return headers

        status_line = self.infile.readline().decode('iso-8859-1')
        fields = status_line.split()
        if len(fields) < 2:
            raise ConnectionError(
                'no valid HTTP status line received: {!r}'.format(status_line))
        status = fields[1]
        headers = parse_headers(self.infile)
        print('Status:', status)
        print('Headers:', headers)
        return status, headers

    def read(self, headers):
        """Read contents according to header definitions.

        Args:
            headers: response headers as returned by ``get_headers``.

        Returns:
            The message body as bytes.
        """

        def find_header(name):
            # Header field names are case-insensitive, so compare lowercased.
            wanted = name.lower()
            for key, value in headers.items():
                if key.lower() == wanted:
                    return value
            return None

        def read_chunked(infile):
            # Chunked transfer encoding for streaming data
            # See https://en.wikipedia.org/wiki/Chunked_transfer_encoding
            chunks = []
            while True:
                size_line = infile.readline()
                if not size_line:
                    raise ConnectionError(
                        'connection closed inside a chunked body')
                # Drop optional chunk extensions: "1a;name=value".
                hex_str = size_line.split(b';', 1)[0].strip()
                chunk_len = int(hex_str, 16)
                print("chunk", chunk_len)
                if chunk_len == 0:
                    break
                chunks.append(infile.read(chunk_len))
                infile.readline()  # skip CRLF after the chunk data
            # Consume optional trailer fields up to the terminating blank
            # line so the stream is positioned at the next response.
            while infile.readline().strip():
                pass
            return b''.join(chunks)

        transfer_encoding = find_header('Transfer-Encoding') or ''
        codings = [c.strip() for c in transfer_encoding.lower().split(',')]
        content_len = find_header('Content-Length')
        if 'chunked' in codings:
            # Transfer-Encoding takes precedence over Content-Length.
            contents = read_chunked(self.infile)
        elif content_len:
            # If Content-Length header exists, read the content-length bytes
            contents = self.infile.read(int(content_len))
        else:
            # otherwise, read until the server closes the connection
            contents = self.infile.read()
        print('Contents:', contents[:40])
        return contents

def http_open(server):
    """Open a TCP connection to ``server`` and wrap it in a Connection.

    Args:
        server: ``(hostname, port)`` tuple passed to ``socket.connect``.

    Returns:
        A Connection holding the connected socket and a binary reader on it.

    Raises:
        OSError: if connecting fails; the socket is closed before re-raising
            so a failed attempt does not leak a file descriptor.
    """
    print('open new connection to server', server)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect(server)
        infile = sock.makefile('rb')  # convert incoming socket to file-like object
    except OSError:
        sock.close()
        raise
    return Connection(server, sock, infile)

In [2]:
from urllib.parse import urljoin
import re, time

def get_base_html(url, persistent=True):
    """Get a base html and return with base connection.

    Args:
        url: ``http://`` URL of the base page.
        persistent: when False, a ``Connection: close`` header is sent and the
            connection is closed before returning.

    Returns:
        ``(base_html, base_conn)``: the body bytes and the connection used to
        fetch it (already closed when ``persistent`` is False).
    """
    hdrs = None if persistent else {'Connection': 'close'}
    request = Request(url, headers=hdrs)
    base_conn = http_open(request.server)
    try:
        base_conn.send(request)
        _status, headers = base_conn.get_headers()
        base_html = base_conn.read(headers)
    except Exception:
        # Do not leak the socket when the exchange fails half-way.
        base_conn.close()
        raise
    if not persistent:
        base_conn.close()
    return base_html, base_conn

def get_obj(url, base_conn, persistent=True):
    """Get object. Try to use existing connection if persistent HTTP.

    Args:
        url: ``http://`` URL of the object to fetch.
        base_conn: connection that was used for the base page.
        persistent: when True and ``url`` is on the same server as
            ``base_conn``, that connection is reused; otherwise a new
            connection is opened for this one request.

    Returns:
        The object body as bytes.
    """
    hdrs = None if persistent else {'Connection': 'close'}
    request = Request(url, headers=hdrs)
    reuse = persistent and request.server == base_conn.server
    if reuse:
        # reuse the connection
        conn = base_conn
    else:
        # open new connection
        conn = http_open(request.server)
    try:
        conn.send(request)
        _status, headers = conn.get_headers()
        return conn.read(headers)
    finally:
        # A connection opened only for this request is not returned to the
        # caller, so it must be closed here (also in persistent mode when the
        # object lives on another server). The shared base connection is
        # left open for the caller.
        if not reuse:
            conn.close()

persistent = True
base_url = "http://mclab.hufs.ac.kr/test/index.html"
# base_url = "http://mclab.hufs.ac.kr/wiki/Lectures/CN/2018"  # Chunked transfering page

start_time = time.time()
base_html, base_conn = get_base_html(base_url, persistent)
try:
    # Extract the src URLs of images and scripts from the base html contents.
    # Undecodable bytes are replaced so a non-UTF-8 page does not abort the run.
    pattern = re.compile(r'<(?:img|script).*?src=\"(.*?)\".*?>')
    obj_srcs = pattern.findall(base_html.decode(errors='replace'))
    print('---')
    print('Object src:', obj_srcs)

    # Convert relative URLs to absolute ones and split them into internal
    # objects (same server as the base connection) and external objects.
    internal = []
    external = []
    for src in obj_srcs:
        absolute_url = urljoin(base_url, src)
        try:
            server = Request(absolute_url).server
        except NotImplementedError:
            # Request only supports plain http; skip e.g. https objects
            # instead of aborting the whole download.
            print('Skipping unsupported URL:', absolute_url)
            continue
        if server == base_conn.server:
            internal.append(absolute_url)
        else:
            external.append(absolute_url)
    print('Internal objects:', internal)
    print('External objects:', external)
    print('---')

    objs = {}
    for url in internal + external:
        objs[url] = get_obj(url, base_conn, persistent)
    end_time = time.time()
finally:
    # Release the base connection; closing an already-closed connection
    # (non-persistent mode) is harmless.
    base_conn.close()

print('\nElapsed: {} seconds'.format(end_time - start_time))

open new connection to server ('mclab.hufs.ac.kr', 80)
Sending request:
b'GET /test/index.html HTTP/1.1\r\nHost: mclab.hufs.ac.kr\r\n\r\n'
Status: 200
Headers: {'Date': 'Wed, 10 Oct 2018 15:06:25 GMT', 'Server': 'Apache/2.2.22 (Ubuntu)', 'Last-Modified': 'Tue, 19 Sep 2017 06:13:15 GMT', 'ETag': '"1e982f-569-55984c1337a5f"', 'Accept-Ranges': 'bytes', 'Content-Length': '1385', 'Vary': 'Accept-Encoding', 'Content-Type': 'text/html'}
Contents: b'<html>\n<head>\n<title>Test Page</title>\n<'
---
Object src: ['http://ice.hufs.ac.kr/hufs-image01.jpg', 's3test2.gif', 's3test3.jpg', 's3test4.jpg', 's3test5.jpg']
Internal objects: ['http://mclab.hufs.ac.kr/test/s3test2.gif', 'http://mclab.hufs.ac.kr/test/s3test3.jpg', 'http://mclab.hufs.ac.kr/test/s3test4.jpg', 'http://mclab.hufs.ac.kr/test/s3test5.jpg']
External objects: ['http://ice.hufs.ac.kr/hufs-image01.jpg']
---
Sending request:
b'GET /test/s3test2.gif HTTP/1.1\r\nHost: mclab.hufs.ac.kr\r\n\r\n'
Status: 200
Headers: {'Date': 'Wed, 10 Oct 20

In [3]:
from urllib.request import urlopen
import re

# Fetch the base page with urllib and list the src attribute of every <img>.
base_url = 'http://mclab.hufs.ac.kr/test/index.html'
# The context manager closes the HTTP response even if read() fails.
with urlopen(base_url) as f:
    base_html = f.read()
pattern = re.compile(r'<img.*?src=\"(.*?)\".*?>')
urls = pattern.findall(base_html.decode())
print(urls)

['http://ice.hufs.ac.kr/hufs-image01.jpg', 's3test2.gif', 's3test3.jpg', 's3test4.jpg', 's3test5.jpg']


In [4]:
from urllib.parse import urljoin

# Resolve a relative and an absolute src against the base page URL:
# the relative one is joined onto the base, the absolute one is kept as is.
for src in ('s3test2.gif', 'http://ice.hufs.ac.kr/hufs-image01.jpg'):
    print(urljoin(base_url, src))

http://mclab.hufs.ac.kr/test/s3test2.gif
http://ice.hufs.ac.kr/hufs-image01.jpg
