In [1]:
import requests
import json, time, datetime, math
import numpy as np
import pandas as pd
import re

REQUESTS_PER_MIN = 10

#helper functions
def print_time(msg, unix):
    print(msg, time.ctime(int(unix)))
    
def get_readable_time(unix):
    return [time.ctime(int(u)) for u in unix]

def print_progress(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '+'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    ETA = (total - iteration) * 60/REQUESTS_PER_MIN
    estimated = datetime.datetime.now() + datetime.timedelta(seconds=ETA)
    ETA = estimated - datetime.datetime.now()
    print('\r%s |%s| %s%% %s - ETA: %s - %s' % (prefix, bar, percent, suffix, ETA, estimated), end = '\r')
    # Print New Line on Complete
    if iteration == total: 
        print()
        
def generate_max_intervals_for_currencies(CURRENCIES):
    CURRENCIES_MAX_INTERVAL = {}
    for c in CURRENCIES:
        r = retrieve_data('https://graphs2.coinmarketcap.com/currencies/%s' % (c))
        data = r.json()
        price_usd = np.array(data['price_usd'])
        print(price_usd[0,0])
        print_time('Start interval of ' + c + ' is:',price_usd[0,0]/1000)
        CURRENCIES_MAX_INTERVAL[c] = int(price_usd[0,0]/1000)
    return CURRENCIES_MAX_INTERVAL
        
def retrieve_data(url):
    r = requests.get(url)    
    if(r.status_code >= 400):
        print('Sleeping for one minute...')
        time.sleep(60)
        return retrieve_data(url)
    else:
        return r

def generate_urls(TODAY, CURRENCIES, MAX_DAYS_TO_TRACK, CURRENCIES_MAX_INTERVAL):
    url_dict = {}
    for x in range(MAX_DAYS_TO_TRACK):
        DAY_BEFORE = (int(TODAY) - ONE_DAY)
        for c in CURRENCIES:
            if c not in url_dict.keys():
                url_dict[c] = []  
            if DAY_BEFORE < CURRENCIES_MAX_INTERVAL[c]:
                if x - 1 < len(url_dict[c]):
                    print('Max interval reached for',c,'skipping to next...')
                    print_time('CURRENT_DAY\t', TODAY)
                continue
            url_dict[c] = url_dict[c] + ['https://graphs2.coinmarketcap.com/currencies/%s/%d/%d/' % (c, DAY_BEFORE*1000, TODAY*1000)]
        for c in GLOBAL_DATA:
            if c not in url_dict.keys():
                url_dict[c] = [] 
            if DAY_BEFORE < CURRENCIES_MAX_INTERVAL[c]:
                if x - 1 < len(url_dict[c]):
                    print('Max interval reached for',c,'skipping to next...')
                    print_time('CURRENT_DAY\t', TODAY)
                continue
            url_dict[c] = url_dict[c] + ['https://graphs2.coinmarketcap.com/global/%s/%d/%d/' % (c, DAY_BEFORE*1000, TODAY*1000)]
        TODAY = TODAY - ONE_DAY
    return url_dict

def is_interval_contained(url_min, url_max, data_min, data_max, offset):
    #print('url_min:', url_min, 'url_max:', url_max)
    #print('data_min:', data_min, 'data_max:', data_max)
    #print('offset', offset)
    return data_min - offset < url_min < data_max + offset and data_min - offset < url_max < data_max + offset
    
### Constants
CURRENCIES = ['bitcoin', 'litecoin', 'ethereum', 'ripple']
#CURRENCIES = ['ethereum']
GLOBAL_DATA = ['marketcap-total']
ONE_DAY = 60 * 60 * 24
DATA_PATH = 'data/coinmarketcap/'

### Beginning interval for currencies
CURRENCIES_MAX_INTERVAL = {
    'bitcoin': 1367174841, 
    'litecoin': 1367174842, 
    'ethereum': 1438958970, 
    'ripple': 1375642265,
    'marketcap-total': 1367174841
}

### if we need to recalculate max intervals for the currencies
#CURRENCIES_MAX_INTERVAL = generate_max_intervals_for_currencies(CURRENCIES)

TODAY = time.mktime(datetime.date.today().timetuple())
print_time('Today:', TODAY)

#MAX_DAYS_TO_TRACK = max([len(url_dict[x]) for x in CURRENCIES])
MAX_DAYS_TO_TRACK = int((int(TODAY) - min([CURRENCIES_MAX_INTERVAL[c] for c in CURRENCIES]))/ONE_DAY)
print('Maximum numbers of days for getting the data:', MAX_DAYS_TO_TRACK)

print(min([CURRENCIES_MAX_INTERVAL[c] for c in CURRENCIES]))
print(get_readable_time(CURRENCIES_MAX_INTERVAL.values()))
#print(CURRENCIES_MAX_INTERVAL)

### Generating urls
url_dict = generate_urls(TODAY, CURRENCIES, MAX_DAYS_TO_TRACK, CURRENCIES_MAX_INTERVAL)

Today: Thu Mar 22 00:00:00 2018
Maximum numbers of days for getting the data: 1788
1367174841
['Sun Apr 28 20:47:21 2013', 'Sun Apr 28 20:47:22 2013', 'Fri Aug  7 16:49:30 2015', 'Sun Aug  4 20:51:05 2013', 'Sun Apr 28 20:47:21 2013']
Max interval reached for ethereum skipping to next...
CURRENT_DAY	 Sat Aug  8 01:00:00 2015
Max interval reached for ripple skipping to next...
CURRENT_DAY	 Mon Aug  5 01:00:00 2013


In [2]:
data_df = {} 
data_interval = {} 
for c in CURRENCIES + GLOBAL_DATA: 
    try: 
        file_name = DATA_PATH + c + '.pkl' 
        data_df[c] = pd.read_pickle(file_name) 
        interval = data_df[c]['time'] 
        data_interval[c] = { 
            'min': min(interval), 
            'max': max(interval) } 
    except: 
        print(c, 'ne postoji')
        
print(data_interval)

for c in GLOBAL_DATA:
    for i in reversed(range(len(url_dict[c]))):
        interval_url = list(map(int, re.findall(r'\d{13}', url_dict[c][i])))
        
        offset = 1000*60*10
        if c in data_interval.keys() and c in data_interval.keys() and is_interval_contained(min(interval_url), max(interval_url), data_interval[c]['min'], data_interval[c]['max'], offset):
            # If already downloaded data for this url, skip to next url
            #print('skipping this url', url_dict[c][i])
            continue
            
        r = retrieve_data(url_dict[c][i])
        data = r.json()
        df = pd.DataFrame({
            'marketcap-total': np.array(data['market_cap_by_available_supply'])[:,1],
            'volume_usd': np.array(data['volume_usd'])[:,1],
            'time': np.array(data['volume_usd'])[:,0],
            'time_readable': get_readable_time(np.array(data['volume_usd'])[:,0]/1000)
        })
        
        if c not in data_df:
            data_df[c] = df
        else:
            data_df[c] = pd.concat([data_df[c], df])
        
        data_df[c].to_pickle(DATA_PATH + c + '.pkl')
        #print(data_df[c])
        print_progress(len(url_dict[c])-i+1, len(url_dict[c]), prefix = c, suffix = 'Complete', length = 50)
        time.sleep(60/REQUESTS_PER_MIN)
        

{'bitcoin': {'min': 1367190002000.0, 'max': 1520809166000.0}, 'litecoin': {'min': 1367190002000.0, 'max': 1520809141000.0}, 'ethereum': {'min': 1438988672000.0, 'max': 1520809152000.0}, 'ripple': {'min': 1375657265000.0, 'max': 1520809141000.0}, 'marketcap-total': {'min': 1367190120000.0, 'max': 1520809020000.0}}
marketcap-total |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.0% Complete - ETA: 0:00:00 - 2018-03-22 23:20:02.608071
marketcap-total |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.1% Complete - ETA: -1 day, 23:59:54 - 2018-03-22 23:20:03.123266

In [3]:
for c in CURRENCIES:
    for i in reversed(range(len(url_dict[c]))):
        interval_url = list(map(int, re.findall(r'\d{13}', url_dict[c][i])))
        offset = 1000*60*10
        if c in data_interval.keys() and c in data_interval.keys() and is_interval_contained(min(interval_url), max(interval_url), data_interval[c]['min'], data_interval[c]['max'], offset):
            # If already downloaded data for this url, skip to next url
            #print('skipping this url', url_dict[c][i])
            continue

        r = retrieve_data(url_dict[c][i])
        data = r.json()
        price_usd = np.array(data['price_usd'])
        market_cap = np.array(data['market_cap_by_available_supply'])
        price_btc = np.array(data['price_btc'])
        volume_usd = np.array(data['volume_usd'])
            
        df = pd.DataFrame({
            'price_usd': price_usd[:,1],
            'market_cap': market_cap[:,1],
            'price_btc': price_btc[:,1],
            'volume_usd': volume_usd[:,1],
            'time': price_usd[:,0],
            'time_readable': get_readable_time(price_usd[:,0]/1000)
        })
        
        if c not in data_df:
            data_df[c] = df
        else:
            data_df[c] = pd.concat([data_df[c], df])
        
        data_df[c].to_pickle(DATA_PATH + c + '.pkl')
        
        #print_time('#End', price_usd[-1,0]/1000)
        print_progress(len(url_dict[c])-i+1, len(url_dict[c]), prefix = c, suffix = 'Complete', length = 50)
        #print(c)
        #print(data_df[c])
        #print(url_dict[c][i])
        #print_time('#Start', price_usd[0,0]/1000)
        #time.sleep(60/REQUESTS_PER_MIN)

bitcoin |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.0% Complete - ETA: 0:00:00 - 2018-03-22 23:20:29.331274
litecoin |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.0% Complete - ETA: 0:00:00 - 2018-03-22 23:20:36.6470384.016100
ethereum |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.0% Complete - ETA: 0:00:00 - 2018-03-22 23:20:41.36758731.359854
ripple |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.0% Complete - ETA: 0:00:00 - 2018-03-22 23:20:47.9805150:35.772484
ripple |++++++++++++++++++++++++++++++++++++++++++++++++++| 100.1% Complete - ETA: -1 day, 23:59:54 - 2018-03-22 23:20:42.569500

In [14]:
x = 0
for t in data_df['marketcap-total']['time']:
    x = x + 1
    if x % 1000 == 0:
        print_time('', t/1000)

 Thu May  2 12:17:00 2013
 Sun May  5 23:37:00 2013
 Thu May  9 10:57:00 2013
 Sun May 12 22:17:00 2013
 Thu May 16 09:37:00 2013
 Sun May 19 20:57:00 2013
 Thu May 23 08:17:00 2013
 Sun May 26 19:37:00 2013
 Thu May 30 06:57:00 2013
 Sun Jun  2 18:17:00 2013
 Thu Jun  6 05:37:00 2013
 Sun Jun  9 16:57:00 2013
 Thu Jun 13 04:17:00 2013
 Sun Jun 16 15:37:00 2013
 Thu Jun 20 02:57:00 2013
 Sun Jun 23 14:17:00 2013
 Thu Jun 27 01:37:00 2013
 Sun Jun 30 12:57:00 2013
 Thu Jul  4 00:17:00 2013
 Sun Jul  7 11:37:00 2013
 Wed Jul 10 22:57:00 2013
 Sun Jul 14 10:17:00 2013
 Wed Jul 17 21:37:00 2013
 Sun Jul 21 08:57:00 2013
 Wed Jul 24 20:17:00 2013
 Sun Jul 28 07:37:00 2013
 Wed Jul 31 18:57:00 2013
 Sun Aug  4 06:17:00 2013
 Wed Aug  7 17:37:00 2013
 Sun Aug 11 04:57:00 2013
 Wed Aug 14 16:17:00 2013
 Sun Aug 18 03:37:00 2013
 Wed Aug 21 14:57:00 2013
 Sun Aug 25 02:17:00 2013
 Wed Aug 28 13:37:00 2013
 Sun Sep  1 00:57:00 2013
 Wed Sep  4 12:17:00 2013
 Sat Sep  7 23:37:00 2013
 Wed Sep 11 