# Imports

In [120]:
import os
import datetime
import hashlib
import json
import configparser
from IPython.display import JSON

# Function explanations

1. reverse(input):

This function takes a string input as its argument.
It checks if the length of the input is even (L % 2 == 0). If it's not even, the function returns None, indicating that the input is not valid for reversing.
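If the length of the input is even, it splits the string into pairs of two characters (one byte each) and reverses the order of the pairs; the two characters inside each pair keep their order (e.g. "abcdef" becomes "efcdab").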
The reversed string is stored in the variable Res, and it is returned as the result.

2. merkle_root(lst):

This function calculates the Merkle root of a list of elements lst. The Merkle root is used in blockchain data structures to represent a collection of transactions.
The function uses the SHA-256 cryptographic hash function to compute the Merkle root.
It defines a nested function sha256d(x) that computes the double SHA-256 hash of the input x.
The nested function hash_pair(x, y) reverses the bytes of x and of y, concatenates them, computes the double SHA-256 hash of the result, and reverses the bytes of that hash.
If the list lst contains only one element, the function returns that element as the Merkle root.
If the list has an odd number of elements, it duplicates the last element to make the list even.
It then recursively computes the Merkle root of the pairs of elements in the list, until there is only one element left, which becomes the final Merkle root.

3. read_bytes(file, n, byte_order='L'):

This function reads n bytes from the given file object (file) and returns the data as an uppercase hexadecimal string.
It allows specifying the byte order (endianness) using the byte_order argument, with the default value 'L', which means little-endian.
If the byte order is 'L', the function reverses the bytes read from the file before converting them to a hexadecimal string.
The resulting hexadecimal string is returned.

4. read_varint(file):

This function reads a variable-length integer (varint) from the given file object (file) and returns the value as a hexadecimal string.
Varints are used in Bitcoin's protocol to encode integers of different sizes efficiently.
The function reads the first byte from the file and interprets it as an integer.
If the integer is less than 253, it represents the value directly, and the function returns it as a hexadecimal string.
If the integer is 253, 254, or 255, it indicates a longer varint, and the function reads an additional 2, 4, or 8 bytes, respectively, to construct the integer value.
The resulting hexadecimal string representing the varint value is returned.

In [3]:
def reverse(input):
    """
    Reverse the order of the two-character groups of a string.

    The string is split into consecutive pairs of characters (one byte each
    when the string is hexadecimal) and the order of the pairs is reversed.
    The two characters inside each pair keep their order.

    Args:
        input (str): The string to reverse. Its length must be even.

    Returns:
        str or None: The string with its pairs in reverse order, or None if
        the length of `input` is odd.

    Example:
        >>> reverse("abcdef")
        'efcdab'
    """

    if len(input) % 2 != 0:
        return None
    # Slice into pairs once and join at the end instead of repeatedly
    # prepending to a growing string.
    pairs = [input[pos:pos + 2] for pos in range(0, len(input), 2)]
    return ''.join(reversed(pairs))

def merkle_root(lst): # https://gist.github.com/anonymous/7eb080a67398f648c1709e41890f8c44
    """
    Compute the Merkle root of a list of hashes using double SHA-256.

    Each pair of neighbouring elements is combined by reversing the bytes of
    both elements, concatenating them, hashing the result twice with SHA-256
    and reversing the bytes of the digest. A level with an odd number of
    elements has its last element paired with itself. The process repeats
    until a single element remains.

    Args:
        lst (list): Non-empty list of bytes objects. The list is not modified.

    Returns:
        bytes: The Merkle root. For a single-element list this is that
        element itself.

    Raises:
        ValueError: If `lst` is empty.

    Example:
        >>> merkle_root([b'\\x01' * 32]) == b'\\x01' * 32
        True
    """

    def sha256d(x):
        return hashlib.sha256(hashlib.sha256(x).digest()).digest()

    def hash_pair(x, y):
        return sha256d(x[::-1] + y[::-1])[::-1]

    if not lst:
        # The recursive original never terminated on an empty list.
        raise ValueError('merkle_root() requires at least one element')

    # Work on a copy so that padding odd levels never alters the caller's list.
    level = list(lst)
    while len(level) > 1:
        if len(level) % 2 == 1:
            level.append(level[-1])
        level = [hash_pair(level[pos], level[pos + 1])
                 for pos in range(0, len(level), 2)]
    return level[0]

def read_bytes(file, n, byte_order='L'):
    """
    Read up to `n` bytes from `file` and return them as uppercase hex.

    Args:
        file (file): Binary file object to read from.
        n (int): Number of bytes to request from `file.read`.
        byte_order (str, optional): 'L' reverses the bytes before they are
            hex-encoded; any other value keeps them in file order.
            Defaults to 'L'.

    Returns:
        str: Uppercase hexadecimal representation of the bytes read.

    Example:
        >>> import io
        >>> read_bytes(io.BytesIO(b'\\xde\\xad\\xbe\\xef'), 4)
        'EFBEADDE'
        >>> read_bytes(io.BytesIO(b'\\xde\\xad\\xbe\\xef'), 4, 'B')
        'DEADBEEF'
    """

    raw = file.read(n)
    ordered = raw[::-1] if byte_order == 'L' else raw
    return ordered.hex().upper()

def read_varint(file):
    """
    Read a variable-length integer (varint) from `file` as uppercase hex.

    The first byte is either the value itself (when below 253) or a marker
    announcing how many more bytes follow: 253 -> 2, 254 -> 4, 255 -> 8.
    The following bytes are emitted in reverse order, so the returned hex
    string can be passed straight to `int(value, 16)`.

    Args:
        file (file): Binary file object to read from.

    Returns:
        str: Uppercase hexadecimal string representing the varint value
        (the marker byte is not included for multi-byte varints).

    Example:
        >>> import io
        >>> read_varint(io.BytesIO(b'\\xef'))
        'EF'
        >>> read_varint(io.BytesIO(b'\\xfd\\x01\\x02'))
        '0201'
    """

    first = file.read(1)
    marker = int(first.hex(), 16)
    if marker < 253:
        return first.hex().upper()

    extra_bytes = {253: 2, 254: 4, 255: 8}[marker]
    chunks = [file.read(1).hex().upper() for _ in range(extra_bytes)]
    # Bytes are stored least-significant first; flip them for display.
    return ''.join(reversed(chunks))



    

# Config

In [121]:
# Load the input and output directories from the local configuration file.
config = configparser.ConfigParser()
config.read("./config.ini")
parser_settings = config['BlockchainParser']
dirA = parser_settings['blocksDir']  # directory that is scanned for blk*.dat files
dirB = parser_settings['resultDir']  # directory the JSON results are written to

In [None]:
# Collect the block files (blk*.dat) of the blocks directory in name order.
fList = sorted(
    name for name in os.listdir(dirA)
    if name.startswith('blk') and name.endswith('.dat')
)

# Parse blockchain

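Store the parsed data in nested Python dictionaries (blocks -> transactions -> inputs/outputs), check each block's Merkle root, then save the result of each file to a JSON file.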

In [None]:
# Number of payload bytes that follow each multi-byte varint marker.
_VARINT_EXTRA_BYTES = {253: 2, 254: 4, 255: 8}


def _double_sha256_hex(raw):
    """Return the byte-reversed double SHA-256 digest of `raw` as uppercase hex."""
    digest = hashlib.sha256(hashlib.sha256(raw).digest()).digest()
    return digest[::-1].hex().upper()


def _read_varint_with_raw(f, prefix=None):
    """
    Read a varint and also return the hex of its bytes exactly as stored.

    Args:
        f (file): Binary file object to read from.
        prefix (bytes, optional): First byte of the varint when the caller
            has already consumed it from `f`.

    Returns:
        tuple: (value, raw_hex) where `value` is the decoded integer and
        `raw_hex` is the uppercase hex of the marker byte plus payload, in
        file order, as needed to rebuild the serialized transaction.

    Raises:
        EOFError: If the file ends in the middle of the varint.
    """
    if prefix is None:
        prefix = f.read(1)
    if len(prefix) != 1:
        raise EOFError('Unexpected end of file while reading a varint')
    extra = _VARINT_EXTRA_BYTES.get(prefix[0], 0)
    payload = f.read(extra)
    if len(payload) != extra:
        raise EOFError('Unexpected end of file while reading a varint')
    # Values below 253 are stored in the marker byte itself.
    value = int.from_bytes(payload if extra else prefix, 'little')
    return value, (prefix + payload).hex().upper()


def _parse_transaction(f):
    """
    Parse one transaction starting at the current position of `f`.

    The transaction hash is computed over the serialization without the
    witness marker, flag and witness data, which is why those parts are
    never added to `raw_parts`.

    Returns:
        dict: The parsed transaction (version, inputs, outputs, witness,
        locktime and transaction hash).
    """
    tx_version_number = read_bytes(f, 4)
    raw_parts = [reverse(tx_version_number)]

    # A zero byte where the input count is expected is treated as the
    # witness marker; the byte after it (the flag) is skipped.
    first = f.read(1)
    has_witness = first == b'\x00'
    if has_witness:
        f.seek(1, 1)
        first = f.read(1)
    inCount, count_raw = _read_varint_with_raw(f, first)
    raw_parts.append(count_raw)

    inputs = []
    for _ in range(inCount):
        tx_from_hash = read_bytes(f, 32)
        n_output = read_bytes(f, 4)
        script_length, length_raw = _read_varint_with_raw(f)
        input_script = read_bytes(f, script_length, 'B')
        sequence_number = read_bytes(f, 4, 'B')
        raw_parts += [reverse(tx_from_hash), reverse(n_output), length_raw,
                      input_script, sequence_number]
        inputs.append({
            'tx_from_hash': tx_from_hash,
            'n_output': n_output,
            'input_script': input_script,
            'sequence_number': sequence_number
        })

    outputCount, count_raw = _read_varint_with_raw(f)
    raw_parts.append(count_raw)

    outputs = []
    for _ in range(outputCount):
        value = read_bytes(f, 8)
        script_length, length_raw = _read_varint_with_raw(f)
        output_script = read_bytes(f, script_length, 'B')
        raw_parts += [reverse(value), length_raw, output_script]
        outputs.append({
            'value': value,
            'output_script': output_script,
        })

    witness = []
    if has_witness:
        # One witness stack per input.
        for _ in range(inCount):
            # Fix: the original stored this in `WitnessLength` but then read
            # the undefined name `WitnessCount`, raising NameError.
            witness_count = int(read_varint(f), 16)
            items = []
            for _ in range(witness_count):
                item_length = int(read_varint(f), 16)
                # NOTE(review): items are read with the default byte order
                # ('L'), i.e. byte-reversed, unlike the scripts above which
                # use 'B'. Kept as in the original - confirm this is intended.
                items.append({
                    'witness_item_length': item_length,
                    'item': read_bytes(f, item_length)
                })
            witness.append({'witness_count': witness_count, 'items': items})

    locktime = read_bytes(f, 4)
    raw_parts.append(reverse(locktime))
    transaction_hash = _double_sha256_hex(bytes.fromhex(''.join(raw_parts)))

    return {
        'version_number': tx_version_number,
        'inputs_count': inCount,
        'inputs': inputs,
        'outputs_count': outputCount,
        'outputs': outputs,
        'witness': witness,
        'locktime': locktime,
        'transaction_hash': transaction_hash,
    }


# Parse every blk*.dat file into a list of block dictionaries, write that list
# to a JSON file of the same base name in the result directory, and keep all
# parsed files in `blocks` (one list of blocks per file).
blocks = []
for nameSrc in fList:
    nameRes = nameSrc.replace('.dat', '.json')
    # os.path.join works whether or not the configured directory ends with a
    # path separator.
    t = os.path.join(dirA, nameSrc)
    print('Start ' + t + ' in ' + str(datetime.datetime.now()))
    fSize = os.path.getsize(t)

    block = []

    # The context manager closes the file even if parsing raises.
    with open(t, 'rb') as f:
        while f.tell() != fSize:
            magic_number = read_bytes(f, 4)
            block_size = read_bytes(f, 4)

            # The block hash covers the 80-byte header; hash it first, then
            # rewind to read the individual header fields.
            header_start = f.tell()
            block_hash = _double_sha256_hex(f.read(80))
            f.seek(header_start, 0)

            version_number = read_bytes(f, 4)
            previous_block_hash = read_bytes(f, 32)
            MerkleRoot = read_bytes(f, 32)
            timestamp = read_bytes(f, 4)
            difficulty = read_bytes(f, 4)
            random_number = read_bytes(f, 4)
            txCount = int(read_varint(f), 16)

            transactions = [_parse_transaction(f) for _ in range(txCount)]

            # Recompute the Merkle root from the transaction hashes and
            # compare it with the one stored in the header.
            tx_hashes = [bytes.fromhex(tx['transaction_hash'])
                         for tx in transactions]
            computed_root = merkle_root(tx_hashes).hex().upper()
            if computed_root != MerkleRoot:
                print ('Merkle roots does not match! >',MerkleRoot,computed_root)

            block.append({
                'magic_number': magic_number,
                'block_size': block_size,
                'block_hash': block_hash,
                'version_number': version_number,
                'previous_block_hash': previous_block_hash,
                'merkle_root': MerkleRoot,
                'timestamp': timestamp,
                'difficulty': difficulty,
                'random_number': random_number,
                'transaction_count': txCount,
                'transactions': transactions,
            })

    with open(os.path.join(dirB, nameRes), 'w') as json_file:
        json.dump(block, json_file, indent=4)

    blocks.append(block)

In [118]:
# Show the 11280th block (1-based) of the second parsed file as a JSON widget.
second_file_blocks = blocks[1]
JSON(second_file_blocks[11280 - 1])

<IPython.core.display.JSON object>