In [None]:
import datetime
import importlib
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import mysql
import pandas as pd
import requests

import blizzard_api
import blizzard_credentials
import mplusdb
import utils

In [None]:
# Reload mplusdb so that edits to the module are picked up without
# restarting the kernel.
importlib.reload(mplusdb)
# Database handle built from '.db_config' (presumably a file with the
# connection settings — confirm against mplusdb).
mdb = mplusdb.MplusDatabase('.db_config')
# Lookup tables fetched by name. Later cells treat `realms` and `dungeons`
# as pandas DataFrames (column selection, drop_duplicates, iterrows).
realms = mdb.get_utility_table('realm')
dungeons = mdb.get_utility_table('dungeon')
specs = mdb.get_utility_table('spec')

In [None]:
# One row per distinct (cluster_id, region) pair found in the realm table.
realm_clusters = realms[['cluster_id', 'region']].drop_duplicates()

In [None]:
# Credentials object built from '.api_tokens'; its access token is handed
# to the URL factory when the leaderboard URLs are generated.
auth = blizzard_credentials.Credentials('.api_tokens')
access_token = auth.access_token

In [None]:
# Generate the leaderboard URL for every valid combination of
# region / realm cluster / dungeon / period.
#
# Building them in nested Python loops is a bit of an anti-pattern, but it
# is the fastest way to get the full list.
#
# Outputs:
#   all_urls                        flat list of every URL, in generation order
#   urls_for_period_region_dungeon  {(region, period, dungeon_id): [url, ...]}

# Dungeons with a smaller id are skipped.
MIN_DUNGEON_ID = 244

# S4 starts with period 734 (an earlier version read the start period per
# dungeon from the third column of the dungeon table).
period_start = 734
period_end = 763  # inclusive

# numeric region code from the realm table -> region string for the API
region_encoder = {1:'us', 2:'kr', 3:'eu', 4:'tw'}

all_urls = []
urls_for_period_region_dungeon = {}
for _, row in dungeons.iterrows():
    # Explicit positional access: plain row[0] on a label-indexed Series
    # relies on a positional fallback that pandas has deprecated.
    dungeon_id = row.iloc[0]
    if dungeon_id < MIN_DUNGEON_ID:
        continue
    for _, realm in realm_clusters.iterrows():
        cluster_id = realm['cluster_id']
        region = region_encoder[realm['region']]
        url_factory = blizzard_api.UrlFactory(
            access_token = access_token, region=region)
        for period in range(period_start, period_end+1):
            url = url_factory.get_mythic_plus_leaderboard_url(
                dungeon_id = dungeon_id, realm_id = cluster_id,
                period = period)
            all_urls.append(url)
            # assign url to a sub-list by region-period-dungeon
            key = (region, period, dungeon_id)
            urls_for_period_region_dungeon.setdefault(key, []).append(url)

In [None]:
# Number of distinct (region, period, dungeon_id) segments.
len(urls_for_period_region_dungeon)

In [None]:
# check math by hand
# NOTE(review): the constants below (10, 764-662, 33 * 2) are not derived
# from the URL-generation cell, which iterates periods 734..763 — confirm
# that this hand estimate still describes the same set of calls.
calls_per_realm = (10 * (764-662)) + (33 * 2)
print('calls per realm', calls_per_realm)
print('total calls', calls_per_realm * len(realm_clusters))

In [None]:
# Back-of-the-envelope cost of fetching every URL. The per-call figures are
# the ones quoted in the notes below: ~0.25 s per call, ~1.2 MB of raw JSON
# per call, ~0.02 MB per call once the data is extracted into a list.
# The 1/1024 factor converts MB to GB.
print('total time (hrs):', len(all_urls) * 0.25 / 3600)
print('total space raw json (Gbs):', 1/1024 * 1.2 * len(all_urls))
print('total space python list (Gbs):', 1/1024 * 0.02 * len(all_urls))

#### How are we going to do this?

So we have 273,304 URL calls to make. Each call takes ~0.25 seconds on average (if we use 10 threads) and generates 1.2 MB of data.

```
Total data size = 273,304 * 1.2Mb = 320 Gb
```

The capacity of my DB is just 20 GB. Uh-oh. I don't even have the storage for this.

#### Is it really 1.2Mb per call? I don't think so:

The raw JSON is 1.2 MB per call. Once we extract the data, the resulting list is only 20 KB. So the total is:

```
Total data size = 273,304 * 0.02Mb = 5 Gb
```

I have plenty of space for this. Yay. Let's proceed.

#### This is how we are going to proceed:
* Break up the API calls into segments based on region and time period. Each segment is a time period within a region, and there are 404 total segments.
* Query each segment, one at a time.
* Aggregate data for each segment, and push to DB
* Keep track of which segment is done using some form of logging

In [None]:
import datetime


class MyLogger():
    """Append-only record of the segments that have already been processed.

    Every finished segment is stored as one line: a timestamp, a tab, and
    the segment id. Segment ids have the form
    ``<region>_<period>_<dungeon_id>``, for example ``us_734_244``.

    Args:
        path: optional location of the log file. When omitted, the default
            ``logs/mdb_segments.log`` is used.
    """
    __fp = 'logs/mdb_segments.log'

    def __init__(self, path=None):
        if path is not None:
            # Instance attribute shadows the class-level default path.
            self.__fp = path

    def log(self, segment_id):
        """Append ``segment_id`` to the log, stamped with the local time.

        The directory of the log file is created if it does not exist yet.
        """
        ts = datetime.datetime.now().strftime('%c')
        log_dir = os.path.dirname(self.__fp)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self.__fp, 'a') as file:
            file.write('%s\t%s\n' % (ts, segment_id))

    def get_logged_keys(self):
        """Return the logged segments as ``(region, period, dungeon_id)`` tuples.

        ``region`` stays a string; the other two fields are converted to
        ``int``. Entries come back in file order, duplicates included.
        A missing log file means nothing has been logged yet and yields an
        empty list; blank lines are ignored.

        Raises:
            ValueError / IndexError: if a non-blank line does not end in a
                ``<region>_<int>_<int>`` token.
        """
        keys = []
        try:
            file = open(self.__fp, 'r')
        except FileNotFoundError:
            # First run: no segment has been logged so far.
            return keys
        with file:
            for line in file:
                tokens = line.split()
                if not tokens:
                    continue
                # The '%c' timestamp contains spaces, so the segment id is
                # always the last whitespace-separated token.
                key = tokens[-1].split('_')
                keys.append((key[0], int(key[1]), int(key[2])))
        return keys

In [None]:
# Read back the segments recorded in the log and show how many there are.
logger = MyLogger()
logged_keys = logger.get_logged_keys()
print(len(logged_keys))

In [None]:
# Scratch arithmetic (1440 / 87). NOTE(review): presumably 1440 is minutes
# per day and 87 a measured duration — confirm or remove this cell.
1/(87.0/1440)

In [None]:
def divide_chunks(list_, n):
    """Yield consecutive slices of ``list_`` holding at most ``n`` items each.

    The final slice is shorter when ``len(list_)`` is not a multiple of ``n``.
    """
    total = len(list_)
    for start in range(0, total, n):
        stop = start + n
        yield list_[start:stop]

# A chunk size larger than the list gives a single chunk with everything.
list(divide_chunks(list(range(10)), 11))

In [None]:
# Pick up any edits made to the project modules since they were imported.
importlib.reload(blizzard_api)
importlib.reload(mplusdb)

def divide_chunks(list_, n):
    """Yield consecutive slices of ``list_`` holding at most ``n`` items each.

    The final slice is shorter when ``len(list_)`` is not a multiple of ``n``.
    """
    total = len(list_)
    for start in range(0, total, n):
        stop = start + n
        yield list_[start:stop]
        
        
def api_call(url, timeout=10):
    """GET ``url``, retrying once if the request itself fails.

    Args:
        url: address to request.
        timeout: seconds passed to ``requests.get`` for each attempt.

    Returns:
        The ``requests.Response``; its HTTP status is not checked here.

    Raises:
        requests.exceptions.RequestException: if the retry fails as well.
    """
    # Short pause before every request made by a worker thread.
    time.sleep(0.1)
    try:
        return requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Retry once. The timeout is kept on the retry so a stalled
        # connection cannot block the worker indefinitely, and only request
        # errors are caught so Ctrl-C still interrupts the run.
        return requests.get(url, timeout=timeout)

def api_call_session(urls, timeout=10):
    """GET every URL in ``urls`` over one shared ``requests.Session``.

    Each request is retried once if it fails.

    Args:
        urls: iterable of addresses to request, fetched sequentially.
        timeout: seconds passed to ``session.get`` for each attempt.

    Returns:
        List of ``requests.Response`` objects in the order of ``urls``;
        HTTP status codes are not checked here.

    Raises:
        requests.exceptions.RequestException: if a retry fails as well.
    """
    responses = []
    with requests.Session() as session:
        for url in urls:
            try:
                response = session.get(url, timeout=timeout)
            except requests.exceptions.RequestException:
                # Only request errors trigger the retry; a bare except
                # would also trap KeyboardInterrupt.
                response = session.get(url, timeout=timeout)
            responses.append(response)
    return responses

def multi_threaded_call_chunked(urls):
    """Fetch ``urls`` in batches of 10, each batch on its own pool thread.

    Every batch goes through ``api_call_session``. The combined list of
    responses follows batch completion order, not the order of ``urls``.
    """
    with ThreadPoolExecutor(max_workers = 10) as pool:
        futures = [
            pool.submit(api_call_session, batch)
            for batch in divide_chunks(urls, 10)
        ]
    # Leaving the ``with`` block waits for every submitted batch to finish.
    collected = []
    for finished in as_completed(futures):
        collected.extend(finished.result())
    return collected
    

def multi_threaded_call(urls):
    """Fetch each URL with ``api_call`` on a pool of 10 threads.

    Returns the responses as a list in completion order, which need not
    match the order of ``urls``.
    """
    with ThreadPoolExecutor(max_workers = 10) as pool:
        futures = [pool.submit(api_call, address) for address in urls]
    # Leaving the ``with`` block waits for every submitted call to finish.
    return [finished.result() for finished in as_completed(futures)]


def agg_leaderboards(responses):
    """Parse every leaderboard response and pool the extracted records.

    Args:
        responses: iterable of objects exposing ``.json()``.

    Returns:
        Tuple ``(runs, comps, rosters)`` of lists, each the concatenation
        over all responses of what the parsed leaderboard returns from
        ``get_runs_as_tuple_list``, ``get_run_comps_as_vector_list`` and
        ``get_rosters_as_tuple_list`` respectively.
    """
    parser = blizzard_api.ResponseParser()
    runs, comps, rosters = [], [], []

    for response in responses:
        board = parser.parse_keyrun_leaderboard_json(response.json())
        runs += board.get_runs_as_tuple_list()
        rosters += board.get_rosters_as_tuple_list()
        comps += board.get_run_comps_as_vector_list()

    return runs, comps, rosters

# Main driver: for every (region, period, dungeon_id) segment that is not
# in the log yet, fetch its leaderboards, parse them, de-duplicate the rows,
# insert them into the database and then record the segment as done.
mdb = mplusdb.MplusDatabase('.db_config')

segment = []  # never read in this cell; binding kept so the notebook namespace is unchanged
t0 = time.time()
i = 0  # segments seen so far in this run (skipped + processed)

logger = MyLogger()
logged_keys = logger.get_logged_keys() # segments already done
done_keys = set(logged_keys)  # set gives O(1) membership tests in the loop

for key, urls in urls_for_period_region_dungeon.items():
    if key in done_keys:
        i += 1
        print(key, i)
        continue
    print(key)
    print(len(urls))
    print(datetime.datetime.now())
    t00 = time.time()
    responses = multi_threaded_call_chunked(urls)
    t1 = time.time()
    print('api calls', t1 - t00)
    runs, comps, rosters = agg_leaderboards(responses)
    t2 = time.time()
    print('parsing jsons', t2 - t1)
    # Drop exact duplicate rows before inserting.
    runs = list(set(runs))
    rosters = list(set(rosters))
    comps = list(set(comps))
    t3 = time.time()
    print('set(data)', t3 - t2)

    mdb.insert(table = 'run', data = runs)
    t4 = time.time()
    print('inserting runs', t4 - t3)

    mdb.insert(table = 'roster', data = rosters)
    t5 = time.time()
    print('inserting rosters', t5 - t4)

    mdb.insert(table = 'run_composition', data = comps)
    t6 = time.time()
    # Measured from the end of the roster insert; the old code reused
    # t5/t4 here and so reported roster + comp time together.
    print('inserting comps', t6 - t5)

    print(len(runs))
    print(len(comps))
    print(len(rosters))
    i += 1
    # Log only after all three inserts have returned.
    logger.log('%s_%s_%s' % key)
    print('-------------------')
    if i % 10 == 0:
        # Brief pause after every tenth segment.
        time.sleep(2)
    if i == 500:
        # Only checked right after a processed segment, and only an exact
        # match stops the run.
        break
print('total', time.time() - t0)

In [None]:
# NOTE(review): in the cells visible here `test_rq` is only assigned by the
# single-request test cell further down, so on a fresh kernel run top to
# bottom this cell raises NameError — move it below that cell.
test_rq.json()

In [None]:
# Scratch arithmetic. NOTE(review): presumably 1.5 (per segment?) times
# 1440 converted with /60 — the meaning is not shown here; confirm or remove.
1.5*1440/60

In [None]:
# Single-request smoke test: time one leaderboard call and wrap its JSON in
# a KeyRunLeaderboard for interactive inspection.
importlib.reload(utils)
importlib.reload(blizzard_api)

t0 = time.time()
# Same 10 s timeout as the worker functions, so a stalled connection cannot
# hang the kernel.
test_rq = requests.get(all_urls[0], timeout = 10)
print(time.time() - t0)
test_klb = blizzard_api.KeyRunLeaderboard(test_rq.json())

In [None]:
# The result of this first call is discarded (it is not the last expression
# of the cell, so nothing is displayed).
test_klb.keyruns[0].get_composition_vector()
# Time the extraction of the composition vectors for the test leaderboard.
t0 = time.time()
# NOTE: rebinds the global `comps` that the segment loop also assigns.
comps = test_klb.get_run_comps_as_vector_list()
print(time.time()-t0)
print(comps)

In [None]:
# Scratch check: converting a list to a tuple gives (1, 2).
tuple([1,2])

In [None]:
# Number of distinct entries in `comps` (its elements must be hashable).
print(len(set(comps)))

In [None]:
#import pickle
#pickle.dump(leaderboard.get_runs_as_tuple_list(), open('test_objs.pkl', 'wb'))
#print(leaderboard.keyruns)