In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import time
import socket
from crawler import *
from lib import *
from base64 import b32encode
import sqlite3

# Improving our `listener` from last time

Where we left off last time

In [None]:
from lib import *

def listener(address):
    """Connect to the peer at `address` and print the command of each message.

    `address` is passed straight to `handshake`. The loop has no exit
    condition: it runs until it is interrupted or a helper raises.
    """
    # Handshake first, then wrap the connection in a binary file-like reader
    peer = handshake(address)
    reader = peer.makefile("rb")

    # Echo the command name of every incoming gossip message
    while True:
        incoming = read_msg(reader)
        print(incoming['command'])

In [None]:
# [host, port] pair handed to `handshake`. The first element is a placeholder:
# replace it with the IP of a reachable node before running this cell.
ADDRESS = ['grab one from https://bitnodes.earn.com/nodes/', 8333]

# `listener` loops forever printing message commands; interrupt the kernel to stop
listener(ADDRESS)

Press the &#9632; button to kill the cell above.

This just connects to a node and reads off commands forever. It's nothing special, but this is one of the most important things a Bitcoin node does!

While we won't implement a full Bitcoin node -- that's a massive project -- I'd like to write a few more programs that accomplish some of the core tasks of a bitcoin full or light node.

I think the obvious first one is initial block download. But that's a lot to chew. I suggest we write a crawler first. Here's how it might work:

* Initialize an `addresses` list
* Remove one address from `addresses` and call `handshake(address)` to connect to a peer
* Send a [`getaddr` message](https://en.bitcoin.it/wiki/Protocol_documentation#getaddr) to our peer requesting a list of their peers. They should respond with an `addr` message.
* Enter a `while` loop which reads one bitcoin message from our peer every iteration, just like in `listener`. But instead of printing them out, let's:
    * Wait until we receive an `addr` message (`msg['command'] == b'addr'`)
    * When we do, call `read_addr_payload` on its payload to deserialize this list of peer addresses
    * Add each of these addresses to our `addresses` list, and start all over

In this way we could theoretically visit every node in the network. Let's try to implement this.
 

# Naive Crawler

First, we need to study the Bitcoin wiki to learn how to send a [`getaddr` message](https://en.bitcoin.it/wiki/Protocol_documentation#getaddr).

Luckily for us, `getaddr` is one of those messages which doesn't require a payload. Therefore, we can produce one like this:

In [None]:
# Build a `getaddr` message with an empty payload; as the cell's last
# expression, the serialized result is what the notebook displays.
serialize_msg(command=b"getaddr", payload=b"")

And we can send it like so:

In [None]:
# Connect to the peer; the object returned by `handshake` is used as a socket below
sock = handshake(ADDRESS)

# Serialize a payload-less "getaddr" request and write it to the connection
msg = serialize_msg(command=b"getaddr", payload=b"")
sock.sendall(msg)
print('"getaddr" sent!')

Very easy! Let's copy the body of `listener`, rename it to `crawler`, and add this code right before the loop.

In [None]:
def crawler(address):
    """Connect to `address`, send `getaddr`, then print every command received.

    First draft of the crawler: the `addr` reply is not handled specially
    yet, so this behaves like `listener` with one extra request sent up front.
    """
    # Handshake, then wrap the connection in a binary reader
    peer = handshake(address)
    reader = peer.makefile("rb")

    # Ask the peer for the list of nodes it knows about
    peer.sendall(serialize_msg(command=b"getaddr", payload=b""))

    # TODO: Wait for `addr` response
    while True:
        incoming = read_msg(reader)
        print(incoming['command'])

In [None]:
# Run against the node configured in ADDRESS; interrupt the kernel to stop
crawler(ADDRESS)

Next we modify `crawler` to specially handle the `addr` message we just requested. For now, let's just print out the `addr` payload.

In [None]:
from lib import handshake

def crawler(address):
    """Connect to `address`, request its peer list, and print the raw reply.

    Sends `getaddr`, then reads messages until an `addr` message arrives,
    prints its undecoded payload and returns. While waiting, `ping` messages
    are answered and every other message is reported and ignored.

    The connection and its reader are closed on every exit path.
    NOTE(review): assumes `handshake` returns a standard `socket.socket`
    (usable as a context manager) -- confirm against `lib`.
    """
    # Establish connection
    print(f'Connecting to {address[0]}')
    sock = handshake(address)

    # Close both the reader and the socket when we return or an error occurs
    with sock, sock.makefile("rb") as stream:
        # Request list of their peers
        sock.sendall(serialize_msg(command=b"getaddr", payload=b""))

        # Wait for `addr` response
        while True:
            msg = read_msg(stream)
            command = msg['command']
            if command == b'addr':
                print(f'Received "addr" payload: {msg["payload"]}')
                return
            elif command == b'ping':
                # A pong must echo the ping's payload (its nonce) back to the peer
                sock.sendall(serialize_msg(command=b'pong', payload=msg['payload']))
            else:
                print(f'Ignoring {command} message')


In [None]:
# Returns once the peer's `addr` message has been received and printed
crawler(ADDRESS)

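One of the strange things you might notice is that they don't always respond with an `addr` message right away -- other messages may arrive first, which is why the loop above ignores everything it isn't waiting for.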

Next, let's interpret the payload of the `addr` message when it finally arrives.

Visit the [protocol docs](https://en.bitcoin.it/wiki/Protocol_documentation#addr) to see what this will require. 

We see that `addr` messages are just a list of `net_addr`s prefixed with a `varint` so we know how many of them there are. Given that, let's write a `read_addr_payload` function that can take a byte stream containing the payload of an `addr` message and return a Python list of dictionaries containing `net_addr` attributes.

In [None]:
# FIXME: should we just return the list? Do we need the dictionary?

def read_addr_payload(stream):
    """Deserialize the payload of an `addr` message.

    Reads a count with `read_varint`, then calls `read_address` on `stream`
    that many times.

    Returns a dict with a single key, "addresses", holding the list of
    values produced by `read_address`, in the order they were read.
    """
    n_addresses = read_varint(stream)
    entries = [read_address(stream) for _ in range(n_addresses)]
    return {"addresses": entries}

In [None]:
# Sample `addr` payload; the leading \x01 is presumably the varint count,
# i.e. a single address entry follows -- verify against `read_varint`.
read_addr_payload(BytesIO(b'\x01hC\x90\\\r\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff_\x1c\xda= \x8d'))

Let's call this function from within `crawler`:

In [None]:
def crawler(address):
    """Connect to `address`, request its peer list, and print it decoded.

    Sends `getaddr`, then reads messages until an `addr` message arrives.
    Its payload is decoded with `read_addr_payload`, the resulting address
    list is printed, and the function returns. While waiting, `ping`
    messages are answered and every other message is reported and ignored.

    The connection and its reader are closed on every exit path.
    NOTE(review): assumes `handshake` returns a standard `socket.socket`
    (usable as a context manager) -- confirm against `lib`.
    """
    # Establish connection
    print(f'Connecting to {address[0]}')
    sock = handshake(address)

    # Close both the reader and the socket when we return or an error occurs
    with sock, sock.makefile("rb") as stream:
        # Request list of their peers
        sock.sendall(serialize_msg(command=b"getaddr", payload=b""))

        # Wait for `addr` response
        while True:
            msg = read_msg(stream)
            command = msg['command']
            if command == b'addr':
                addr_msg = read_addr_payload(BytesIO(msg['payload']))
                print(f'Received "addr" containing {addr_msg["addresses"]}')
                return
            elif command == b'ping':
                # A pong must echo the ping's payload (its nonce) back to the peer
                sock.sendall(serialize_msg(command=b'pong', payload=msg['payload']))
            else:
                print(f'Ignoring {command} message')


In [None]:
# Returns once the peer's `addr` message has been decoded and printed
crawler(ADDRESS)

Now that we can read our peer's address list, let's extend the crawler to save them to an `addresses` list.

In [None]:
def crawler(addresses):
    """Repeatedly connect to a node and append the peers it reports to `addresses`.

    One address is popped from `addresses` before the loop starts. Each pass
    then connects to it, sends `getaddr`, waits for the `addr` reply, prints
    the decoded entries and appends them to `addresses` (mutated in place).

    NOTE: the address is popped only once, outside the loop, so every pass
    reconnects to that same address. The loop has no exit condition.
    """
    # Take the starting address off the list (done once, before looping)
    address = addresses.pop()

    while True:
        # Open a fresh connection and wrap it in a binary reader
        print(f'Connecting to {address[0]}')
        peer = handshake(address)
        reader = peer.makefile("rb")

        # Ask the peer for the nodes it knows about
        peer.sendall(serialize_msg(command=b"getaddr", payload=b""))

        # Keep reading until the `addr` reply shows up
        waiting_for_addr = True
        while waiting_for_addr:
            incoming = read_msg(reader)
            command = incoming['command']
            if command == b'ping':
                peer.sendall(serialize_msg(b'pong'))
            elif command == b'addr':
                addr_msg = read_addr_payload(BytesIO(incoming['payload']))
                print(f'Received "addr" containing {addr_msg["addresses"]}')
                addresses.extend(addr_msg['addresses'])
                waiting_for_addr = False
            else:
                print(f'Ignoring {command} message')

In [None]:
# Seed the crawl with our one known node; `crawler` mutates this list in place
crawler([ADDRESS])

If you run this code enough times, you'll get output like this:

```
Connecting to 173.244.167.110
Ignoring b'alert' message
Received "addr" containing [{'time': 1552959039, 'services': b'\r\x04\x00\x00\x00\x00\x00\x00', 'ip': '::ffff:173.244.167.110', 'port': 8333}]
Connecting to 173.244.167.110
Received "addr" containing [{'time': 1552959039, 'services': b'\r\x04\x00\x00\x00\x00\x00\x00', 'ip': '::ffff:173.244.167.110', 'port': 8333}]
Connecting to 173.244.167.110
Received "addr" containing [{'time': 1552959039, 'services': b'\r\x04\x00\x00\x00\x00\x00\x00', 'ip': '::ffff:173.244.167.110', 'port': 8333}]
Connecting to 173.244.167.110
Received "addr" containing [{'time': 1552959039, 'services': b'\r\x04\x00\x00\x00\x00\x00\x00', 'ip': '::ffff:173.244.167.110', 'port': 8333}]
...
...
...
```

It's just connecting to the same address over and over again. Why is this?

Because the `addr` message our peer is sending us contains only 1 address: the address of the peer we're currently connected to! So that will go back into the queue, and we'll just keep connecting to the same peer over-and-over.

We need a check for this:

In [None]:
def crawler(addresses):
    """Crawl the network starting from the nodes listed in `addresses`.

    While `addresses` is non-empty, pops one address, connects to it, sends
    `getaddr`, and waits for an `addr` reply that lists more than one
    address (a single-entry reply is skipped and the wait continues). The
    reported peers are appended to `addresses` as `(ip, port)` tuples, so
    the list is mutated in place and later passes visit the harvested nodes.

    Returns when `addresses` runs out. Errors from `handshake`, `read_msg`
    or the socket are not caught here and propagate to the caller. Each
    connection and its reader are closed before moving to the next address.
    NOTE(review): assumes `handshake` returns a standard `socket.socket`
    (usable as a context manager) -- confirm against `lib`.
    """
    while addresses:
        # Get next address -- inside the loop, so each pass visits a new node
        address = addresses.pop()

        # Establish connection
        print(f'Connecting to {address[0]}')
        sock = handshake(address)

        # Close the reader and the socket before moving on to the next node
        with sock, sock.makefile("rb") as stream:
            # Request list of their peers
            sock.sendall(serialize_msg(command=b"getaddr", payload=b""))

            # Wait for `addr` response
            while True:
                msg = read_msg(stream)
                command = msg['command']
                if command == b'addr':
                    addr_msg = read_addr_payload(BytesIO(msg['payload']))
                    peers = addr_msg["addresses"]
                    # Debug output; guard the first entry so an empty list can't raise
                    print(len(peers), peers[0] if peers else None, address)
                    # Only save if it contains new addresses
                    if len(peers) > 1:
                        print(f'Received "addr" containing {peers}')
                        addresses.extend([(a['ip'], a['port']) for a in peers])
                        break
                elif command == b'ping':
                    # A pong must echo the ping's payload (its nonce) back to the peer
                    sock.sendall(serialize_msg(command=b'pong', payload=msg['payload']))
                else:
                    print(f'Ignoring {command} message')

In [None]:
# Seed the crawl with our one known node; `crawler` mutates this list in place
crawler([ADDRESS])

In [None]:

def crawler(addresses):
    """Crawl the network starting from the nodes listed in `addresses`.

    While `addresses` is non-empty, pops one address, connects to it, sends
    `getaddr`, and logs every message received. `ping` messages are answered
    with a `pong` echoing the same payload. When an `addr` message listing
    more than one address arrives, its entries are appended to `addresses`
    as `(ip, port)` tuples and the crawler moves on to the next node.

    Any `Exception` raised while talking to a node is printed and the crawl
    continues with the next address. Returns when `addresses` is exhausted.
    `addresses` is mutated in place and is not de-duplicated, so a node may
    be visited more than once.
    NOTE(review): assumes `handshake` returns a standard `socket.socket`
    (usable as a context manager) -- confirm against `lib`.
    """
    # Stop cleanly when nothing is left instead of popping from an empty list
    while addresses:
        # Get next address from addresses and connect
        address = addresses.pop()

        try:
            # Establish connection
            print(f'Connecting to {address}')
            sock = handshake(address)  # FIXME: save the version payload

            # Close the reader and the socket however we leave this node
            with sock, sock.makefile('rb') as stream:
                # Request peer's peers
                sock.sendall(serialize_msg(b'getaddr'))

                # Print every gossip message we receive
                while True:
                    msg = read_msg(stream)
                    command = msg['command']
                    payload_len = len(msg['payload'])
                    print(f'Received a "{command}" containing {payload_len} bytes')

                    if command == b'ping':
                        # Respond to "ping", echoing its payload back
                        res = serialize_msg(command=b'pong', payload=msg['payload'])
                        sock.sendall(res)
                        print("Send 'pong'")
                    elif command == b'addr':
                        # Specially handle peer lists
                        payload = read_addr_payload(BytesIO(msg['payload']))
                        if len(payload['addresses']) > 1:
                            addresses.extend([
                                (a['ip'], a['port']) for a in payload['addresses']
                            ])
                            break
        except Exception as e:
            # Best-effort crawl: report the failure and try the next address
            print(f'Got error: {str(e)}')


In [None]:
# Seed the crawl with our one known node; `crawler` mutates this list in place
crawler([ADDRESS])