In [1]:
import requests
import json
import networkx as nx
import matplotlib.pyplot as plt
#%matplotlib inline
import os
import datetime
import time
import numpy as np
import h5py
import matplotlib.image as mpimg
import glob
import pandas as pd
import math
from operator import itemgetter
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

  from ._conv import register_converters as _register_converters


In [2]:
#Convert the given dates to milliseconds so that blocks can be queried from blockchain.info by date
def dates_to_milisec(start,end):
    """Return one timestamp in milliseconds for every day from start to end, inclusive.

    Parameters
    ----------
    start, end : int or str
        Dates in ``YYYYMMDD`` form (anything whose ``str()`` matches ``%Y%m%d``).

    Returns
    -------
    list of float
        Midnight of each day as milliseconds since the epoch. ``start == end``
        yields a single entry; ``end`` earlier than ``start`` yields an empty list.

    Raises
    ------
    ValueError
        If either date does not match ``%Y%m%d``.
    """
    # Parse both dates unconditionally: the previous special case for
    # start == end skipped the parsing and then used the undefined date_start.
    date_start = datetime.datetime.strptime(str(start), '%Y%m%d')
    date_end = datetime.datetime.strptime(str(end), '%Y%m%d')
    days = (date_end - date_start).days

    # time.mktime interprets the naive datetime in the machine's local time zone.
    return [
        time.mktime((date_start + datetime.timedelta(days=offset)).timetuple()) * 1000.0
        for offset in range(days + 1)
    ]



#for each day there is a different array containing the blocks for that day
def query_block_hashes(date_list_in_ms):
    """Fetch the block hashes listed by blockchain.info for each given day.

    Parameters
    ----------
    date_list_in_ms : sequence of numbers
        One timestamp in milliseconds per day; each is truncated to an int
        before being placed in the request URL.

    Returns
    -------
    list of list
        One inner list per input timestamp, holding the ``'hash'`` of every
        entry in that response's ``'blocks'`` array, in response order.
    """
    # First download every daily listing, in input order ...
    daily_listings = [
        requests.get("https://blockchain.info/blocks/" + str(int(day_ms)) + "?format=json").json()
        for day_ms in date_list_in_ms
    ]

    # ... then reduce each listing to its block hashes.
    return [
        [entry['hash'] for entry in listing['blocks']]
        for listing in daily_listings
    ]



#BTC block parser
def parse_block_data(block_hash):
    """Download one raw block from blockchain.info and extract its transactions.

    Parameters
    ----------
    block_hash : str
        Hash of the block; appended to the ``rawblock`` endpoint URL.

    Returns
    -------
    tuple
        ``(given_block_data, block_height, creation_time, block_hash)`` where

        * ``given_block_data`` maps each transaction hash to
          ``[(input_addresses, input_values), (output_addresses, output_values)]``,
        * ``block_height`` is the block's ``height`` as an int,
        * ``creation_time`` is the block's ``time`` field rendered as a
          ``'YYYY-MM-DD HH:MM:SS'`` string in UTC,
        * ``block_hash`` is the argument passed in.

    Inputs without a ``prev_out`` and inputs/outputs whose record has no
    ``addr`` are skipped, so every address list stays aligned with its value list.
    """
    r = requests.get("https://blockchain.info/rawblock/" + block_hash)
    data = r.json()

    block_height = int(data["height"])

    # Same naive-UTC string that utcfromtimestamp() produced, without relying
    # on that deprecated call. The value is UTC, not local time.
    creation_time = str(
        datetime.datetime.fromtimestamp(float(data["time"]), datetime.timezone.utc).replace(tzinfo=None)
    )
    #other metadata to store...

    given_block_data = {}
    for transaction in data["tx"]:

        inputs = []
        inputs_values = []
        for tx_input in transaction["inputs"]:
            prev_out = tx_input.get("prev_out")
            # Apply the same "has an address" rule as for outputs below;
            # previously a prev_out without "addr" raised KeyError.
            if not prev_out or "addr" not in prev_out:
                continue
            inputs.append(prev_out["addr"])
            inputs_values.append(prev_out["value"])

        outputs = []
        outputs_values = []
        for tx_output in transaction["out"]:
            if "addr" not in tx_output:
                continue
            outputs.append(tx_output["addr"])
            outputs_values.append(tx_output["value"])

        given_block_data[transaction["hash"]] = [(inputs, inputs_values), (outputs, outputs_values)]

    return given_block_data, block_height, creation_time, block_hash



#Create a networkx graph from one block's transactions. Because BTC inputs and outputs cannot be directly assigned to each other,
#an auxiliary node is drawn for every transaction. This auxiliary node represents the given transaction's hash. Inputs run into, and
#outputs arise from, this auxiliary node.
def create_tr_graph(given_block_data):
    """Build a directed multigraph of one block's transactions.

    Parameters
    ----------
    given_block_data : dict
        Maps a transaction hash to
        ``[(input_addresses, input_values), (output_addresses, output_values)]``.

    Returns
    -------
    networkx.MultiDiGraph
        Each transaction hash acts as a hub node: every input address gets an
        edge into the hub and the hub gets an edge to every output address.
        The edge ``weight`` is the corresponding value divided by ``10**8``.
    """
    graph = nx.MultiDiGraph()

    for tr_hash, sides in given_block_data.items():
        funding, paying = sides[0], sides[1]

        # address -> transaction hub
        for position, address in enumerate(funding[0]):
            graph.add_edges_from([(address, tr_hash)], weight=funding[1][position] / 10**8)

        # transaction hub -> address
        for position, address in enumerate(paying[0]):
            graph.add_edges_from([(tr_hash, address)], weight=paying[1][position] / 10**8)

    return graph

    
    

#generate list of dates in str format    
def generate_date_list(start,end):
    """Return every date from start to end, inclusive, as ``'YYYY-MM-DD'`` strings.

    Parameters
    ----------
    start, end : int or str
        Dates in ``YYYYMMDD`` form (anything whose ``str()`` matches ``%Y%m%d``).

    Returns
    -------
    list of str
        One entry per day. ``start == end`` yields a single date; ``end``
        earlier than ``start`` yields an empty list.

    Raises
    ------
    ValueError
        If either date does not match ``%Y%m%d``.
    """
    # Always parse and build the list: the former start == end shortcut
    # returned a date_list that had never been assigned.
    date_start = datetime.datetime.strptime(str(start), '%Y%m%d')
    date_end = datetime.datetime.strptime(str(end), '%Y%m%d')
    days = (date_end - date_start).days

    return [
        (date_start + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(days + 1)
    ]




#create hdf5 file from blocks transaction matrices (adjacency matrices) with corresponding meta data and a transaction graph picture for each block
#block heights identify the given block's transaction matrix and the corresponding graph picture. Transactions graph can not be recovered as networkx
#multidigraph from the adjacency matrices!!! For analysing each block's directed transaction graphs the create_tr_graph function should be used!
def append_to_hdf5_file(matrix,block_height,block_creation_time, block_hash):
    """Store one block's transaction matrix in ``correlating_datasets_test.hdf5``.

    The matrix is written as a gzip-compressed dataset named ``str(block_height)``
    inside the ``transaction_matrices`` group (created on first use), with
    ``creation_time``, ``block_height`` and ``block_hash`` attached as attributes.
    If a dataset with that name already exists, nothing is written.

    Parameters
    ----------
    matrix : array-like
        Data for the dataset; stored with dtype ``np.uint8``.
        NOTE(review): confirm all entries fit into 0..255.
    block_height : int
        Used both as the dataset name and as an attribute.
    block_creation_time : str
        Stored as the ``creation_time`` attribute.
    block_hash : str
        Stored as the ``block_hash`` attribute.
    """
    dataset_name = str(block_height)

    # The context manager closes the file even when a write below raises;
    # the explicit close() at the end used to be skipped in that case.
    with h5py.File('correlating_datasets_test.hdf5', mode="a") as hdf5_file:

        grp = hdf5_file.require_group("transaction_matrices")

        if dataset_name in grp:
            return

        dataset = grp.create_dataset(dataset_name,data=matrix,
                                     dtype=np.uint8,compression='gzip',shuffle=True)

        dataset.attrs['creation_time'] = block_creation_time
        dataset.attrs['block_height'] = block_height
        dataset.attrs['block_hash'] = block_hash
        #other metadata for each block's metadata...
        #image = mpimg.imread(os.getcwd() + "\\" + str(block_height) + ".png")
        #dataset = grp.create_dataset(str(block_height) + "_picture", data=image)


    
    
#Find the volatility value corresponding to a given block creation time: the sample nearest to the creation time within a
#fixed time interval. For example, if the volatility is calculated on minutely sampled price data with a window of length
#five and the time interval is 10 minutes, the returned volatility is the sample closest to the block creation time within
#that 10-minute interval.

def find_corresponding_volatility_and_price(df, creation_time):
    """Look up the volatility and price sample that follows a block's creation time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have columns ``'BVOL5M_INDEX'`` and ``'Weighted_Price'``.
        Assumes a sorted, unique datetime index (required by the label slice
        and the nearest-neighbour lookup below) - TODO confirm against caller.
    creation_time : str
        Block creation time formatted as ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    tuple
        ``(BVOL5M_INDEX, Weighted_Price)`` of the row whose index is nearest to
        ``creation_time`` within the window
        ``[creation_time, creation_time + 10 minutes]``.

    Raises
    ------
    KeyError
        If ``df`` has no row inside that window.
    ValueError
        If ``creation_time`` does not match the expected format.
    """
    creation_time = datetime.datetime.strptime(creation_time, '%Y-%m-%d %H:%M:%S')
    window = df.loc[creation_time : creation_time + datetime.timedelta(minutes=10)]

    if window.empty:
        # Same exception type the nearest-neighbour lookup reported for "no match".
        raise KeyError(creation_time)

    # Index.get_loc(..., method='nearest') no longer exists in pandas 2.x;
    # get_indexer is the supported equivalent and returns a position per target.
    position = window.index.get_indexer([creation_time], method='nearest')[0]
    row = window.iloc[position]

    return row['BVOL5M_INDEX'], row['Weighted_Price']






def append_to_hdf5_file_with_price_and_volatility(matrix,block_height,block_creation_time,block_hash,df_price_vol):
    """Store one block's transaction matrix together with its price and volatility.

    The matrix is written to ``correlating_datasets_test.hdf5`` as a
    gzip-compressed dataset named ``str(block_height)`` inside the
    ``transaction_matrices`` group (created on first use). The attributes
    ``creation_time``, ``block_height``, ``BVOL5M_INDEX``, ``Weighted_Price``
    and ``block_hash`` are attached. An existing dataset is left untouched.

    Parameters
    ----------
    matrix : array-like
        Data for the dataset; stored with dtype ``np.uint8``.
        NOTE(review): confirm all entries fit into 0..255.
    block_height : int
        Used both as the dataset name and as an attribute.
    block_creation_time : str
        Stored as an attribute and passed to
        ``find_corresponding_volatility_and_price``.
    block_hash : str
        Stored as the ``block_hash`` attribute.
    df_price_vol : pandas.DataFrame
        Price/volatility table handed to ``find_corresponding_volatility_and_price``.

    Returns
    -------
    The volatility stored for the block, or ``0`` when the dataset already
    existed and nothing was written.

    Any exception raised by the price/volatility lookup propagates; in that
    case no dataset is created.
    """
    volatility = 0
    dataset_name = str(block_height)

    # The context manager closes the file even when a step below raises.
    with h5py.File('correlating_datasets_test.hdf5', mode="a") as hdf5_file:

        grp = hdf5_file.require_group("transaction_matrices")

        if dataset_name not in grp:

            # Look the values up BEFORE creating the dataset. A failed lookup used
            # to leave an attribute-less dataset behind, and because existing
            # names are skipped it was never repaired on later runs.
            volatility, price = find_corresponding_volatility_and_price(df_price_vol, block_creation_time)

            dataset = grp.create_dataset(dataset_name,data=matrix,
                                         dtype=np.uint8,compression='gzip',shuffle=True)

            dataset.attrs['creation_time'] = block_creation_time
            dataset.attrs['block_height'] = block_height
            dataset.attrs['BVOL5M_INDEX'] = volatility
            dataset.attrs['Weighted_Price'] = price
            dataset.attrs['block_hash'] = block_hash

            #other metadata for each block's metadata...
            #image = mpimg.imread(os.getcwd() + "\\" + str(block_height) + ".png")
            #dataset = grp.create_dataset(str(block_height) + "_picture", data=image)

    return volatility
    

    


#create subsets with equal sizes. If sizes mismatch, there will be an error when calculating correlation

def create_subsets(dataframe, length_of_subsets):
    """Split the ``'BVOL5M_INDEX'`` column into consecutive chunks of equal length.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'BVOL5M_INDEX'`` column.
    length_of_subsets : int
        Number of rows per chunk.

    Returns
    -------
    list of pandas.Series
        ``len(dataframe) // length_of_subsets`` chunks in row order. Trailing
        rows that do not fill a whole chunk are dropped so that every chunk
        has the same size.
    """
    n = length_of_subsets
    # Count the rows to keep instead of slicing with [:-remainder]: a remainder
    # of 0 turned that slice into [:0] and threw the whole frame away.
    usable_rows = len(dataframe) - len(dataframe) % n
    column = dataframe.iloc[:usable_rows]['BVOL5M_INDEX']

    return [column.iloc[i:i+n] for i in range(0, usable_rows, n)]


def create_time_subsets(dataframe, length_of_subsets):
    """Split the ``'Unnamed: 0'`` column into consecutive chunks of equal length.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``'Unnamed: 0'`` column (presumably the timestamps that
        came in as an unnamed CSV column - verify against the loader).
    length_of_subsets : int
        Number of rows per chunk.

    Returns
    -------
    list of pandas.Series
        ``len(dataframe) // length_of_subsets`` chunks in row order. Trailing
        rows that do not fill a whole chunk are dropped.
    """
    n = length_of_subsets
    # Count the rows to keep instead of slicing with [:-remainder]: a remainder
    # of 0 turned that slice into [:0] and threw the whole frame away.
    usable_rows = len(dataframe) - len(dataframe) % n
    column = dataframe.iloc[:usable_rows]['Unnamed: 0']

    return [column.iloc[i:i+n] for i in range(0, usable_rows, n)]


'''
Function to create a DataFrame containing column_name - value pairs where value is the number of correlating subsets in 
each column.
'''

def find_longest_correlating_subsets(df_correlating_subsets):
    """Count, per column, how many rows precede the first null value.

    Parameters
    ----------
    df_correlating_subsets : pandas.DataFrame
        Table whose columns are filled from the top and padded with nulls.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the same column names (in the same order); each
        cell holds the number of leading non-null rows of that column. A column
        without any null reports its full length.
    """
    leading_counts = {}

    for column in df_correlating_subsets.columns:
        null_flags = df_correlating_subsets[column].isnull().tolist()

        # A column with no null at all used to be left out of the result,
        # although it is the longest one; report its full length instead.
        if True in null_flags:
            length = null_flags.index(True)
        else:
            length = len(null_flags)

        leading_counts[column] = [length]

    # Build the frame once rather than concatenating inside the loop.
    return pd.DataFrame(leading_counts)


'''
Function to find the columns where the number of correlating subsets exceeds a given threshold. For example, if the threshold is
30, the column must contain more than 30 correlating volatility subsets.
'''

def find_columns_that_contain_enough_correlations(df_number_of_correlations, threshold):
    """Return the names of the columns whose value is strictly above ``threshold``.

    Parameters
    ----------
    df_number_of_correlations : pandas.DataFrame
        Expected to hold one value per column; the comparison result of each
        column is used directly as a truth value.
    threshold : number
        Exclusive lower bound.

    Returns
    -------
    list
        Matching column names in column order.
    """
    return [
        column
        for column in df_number_of_correlations.columns
        if df_number_of_correlations[column].values > threshold
    ]

In [11]:
# One timestamp per day from 2018-03-29 to 2018-04-30 (inclusive), then the block hashes of each of those days.
dates_in_ms = dates_to_milisec(start=20180329, end=20180430)
block_hashes = query_block_hashes(date_list_in_ms=dates_in_ms)

In [12]:
def _extract_block_features(data):
    """Build the feature row (a dict) for one raw-block JSON object from blockchain.info.

    Raises KeyError when an expected field is missing from ``data``.
    """
    satoshi_per_btc = pow(10,8)
    transactions = data['tx']

    sum_tr_size = sum(tx['size'] for tx in transactions)
    avg_tr_size = sum_tr_size / len(transactions) if transactions else 0

    # NOTE(review): only the first output of the first transaction is counted as
    # the reward - confirm that this is intended when that transaction has
    # several outputs.
    all_reward = transactions[0]['out'][0]['value'] / satoshi_per_btc
    # 12.5 is the hard-coded amount subtracted from the reward to get the fee -
    # presumably the block subsidy valid for the queried date range; confirm.
    mining_fee = all_reward - 12.5
    #TODO: the USD values will be needed as well...

    #Difficulty target in decimal
    #target = coefficient*2^(8*(exponent-3))
    # 'bits' is the compact encoding: the top byte is the exponent and the low
    # three bytes are the coefficient. Both used to be read from the top byte.
    bits = data['bits']
    exponent = bits >> 24
    coefficient = bits & 0xffffff
    diff_target = coefficient * pow(2, 8*(exponent - 3))

    #Total BTC outputs in a block
    total_btc_output = sum(out['value'] for tx in transactions for out in tx['out'])

    # Same naive-UTC string that utcfromtimestamp() produced, without the deprecated call.
    creation_time = str(
        datetime.datetime.fromtimestamp(float(data["time"]), datetime.timezone.utc).replace(tzinfo=None)
    )

    # Key order defines the CSV column order. 'minin_fee' is kept as spelled
    # because it is the column name of the file this cell writes.
    return {'creation_time':creation_time,
            'block_height':data['height'],
            'tx_number':data['n_tx'],
            'block_size':data['size'],
            'nonce':data['nonce'],
            'block_hash':data['hash'],
            'avg_tr_size':avg_tr_size,
            'sum_tr_size':sum_tr_size,
            'minin_fee':mining_fee,
            'all_reward':all_reward,
            'diff_target':diff_target,
            'total_btc_output_per_block':total_btc_output / satoshi_per_btc,
            }


# One feature row per successfully downloaded block; the DataFrame is rebuilt
# from this list once per day instead of being grown with concat per block.
block_records = []
df = pd.DataFrame()


for k in range(len(block_hashes)):

    print('day: ',k)

    for n in range(len(block_hashes[k])):

        try:

            r = requests.get("https://blockchain.info/rawblock/" + block_hashes[k][n])
            data = r.json()
            block_records.append(_extract_block_features(data))

        except ValueError:

            print('Decoding JSON has failed at day :',k,' block hash : ', n)

        except KeyError:

            print('KeyError at day:', k, 'block hash : ', n)

    df = pd.DataFrame(block_records)

    # Checkpoint once per fifth day; this used to rewrite the whole CSV after
    # every single block of such a day.
    if k % 5 == 0:

        df.to_csv('Block_Features_23.csv', index=False)


df.to_csv('Block_Features_23.csv', index=False)

day:  0
day:  1
day:  2
day:  3
day:  4
day:  5
day:  6
day:  7
day:  8
day:  9
day:  10
day:  11
day:  12
day:  13
day:  14
day:  15
day:  16
day:  17
day:  18
day:  19
day:  20
day:  21
day:  22
day:  23
day:  24
day:  25
day:  26
day:  27
day:  28
day:  29
day:  30
day:  31
day:  32


In [13]:
# Write the block-feature table to CSV without the index column (same file and arguments as the write that ends the loop cell above).
df.to_csv('Block_Features_23.csv', index=False)