In [81]:
import tensorflow as tf
from google.cloud import bigquery, bigquery_storage_v1beta1, storage
from google import auth
import os
from sklearn.model_selection import train_test_split
import json 
import numpy as np
import tensorflow_datasets as tfds
from tqdm import tqdm
from functools import partial
import pdb

In [9]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [10]:
# Register the credentials file path for this kernel process (relative to the notebook's working directory).
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '../Credentials/bigquery.json'})

In [11]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download one object from a Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket to read from.
        source_blob_name: Name of the blob inside that bucket.
        destination_file_name: Local path the blob's contents are written to.

    Prints 'Success' once the download call returns.
    """
    target_blob = storage.Client().get_bucket(bucket_name).blob(source_blob_name)
    target_blob.download_to_filename(destination_file_name)
    print('Success')

In [12]:
# BigQuery API client; presumably authenticates through the
# GOOGLE_APPLICATION_CREDENTIALS variable set earlier in this notebook — confirm.
# NOTE(review): `client` is not referenced by any later cell visible here.
client = bigquery.Client()

In [13]:
%%bigquery df
-- Rows of dota.dota_mini that have at least one picks_bans entry, plus the entry count.
SELECT
    *,
    ARRAY_LENGTH(picks_bans) AS count
FROM dota.dota_mini
WHERE ARRAY_LENGTH(picks_bans) > 0
LIMIT 10000

In [111]:
%%bigquery hero
-- Every row and column of dota.hero_data.
SELECT *
FROM dota.hero_data

In [114]:
# Largest hero_id present in the `hero` query result.
# NOTE(review): presumably the basis for the 130-unit output layer in build_model
# (ids 0..129) — confirm.
max(hero['hero_id'])

129

In [14]:
# %%bigquery un_df
#     Select dm.match_id
#     from dota.dota_mini dm
#     CROSS JOIN UNNEST(picks_bans) as picks
#     LIMIT 10000

In [15]:
def encode_picks(current, sequence):
    """Shift ``sequence`` left by ``current`` positions, back-filling with -1.

    The items from index ``current`` onward are cast to ``int`` and moved to
    the front; ``current`` copies of -1 are appended after them.

    Args:
        current: Index of the first item to keep.
        sequence: Sliceable sequence of values accepted by ``int()``.

    Returns:
        A new list of ints.
    """
    shifted = list(map(int, sequence[current:]))
    shifted.extend([-1] * current)
    return shifted

In [36]:
def padding(sequence, maxlen,padding='pre',value=0.0):
    """Truncate or pad a list so that it holds exactly ``maxlen`` items.

    Args:
        sequence: List to resize. It is concatenated with a list of fill
            values, so it must itself be a list.
        maxlen: Target length.
        padding: ``'pre'`` puts the fill values before the data, ``'post'``
            puts them after it.
        value: Fill value used for the added positions.

    Returns:
        A new list. Sequences of length ``maxlen`` or more are cut down to
        their first ``maxlen`` items; shorter ones are filled with ``value``.

    Raises:
        ValueError: If the sequence needs padding and ``padding`` is neither
            ``'pre'`` nor ``'post'`` (previously this silently returned None).
    """
    if len(sequence) >= maxlen:
        # Long inputs always keep their leading items, whatever `padding` says.
        return sequence[:maxlen]

    pad_array = [value] * (maxlen - len(sequence))
    if padding == 'pre':
        return pad_array + sequence
    if padding == 'post':
        return sequence + pad_array
    raise ValueError("padding must be 'pre' or 'post', got {!r}".format(padding))

In [17]:
def process_training(sequence,maxlen=21, is_pad = True):
    """Turn one draft (a list of pick/ban entries) into next-hero training arrays.

    Each entry must provide the keys ``'hero_id'``, ``'is_pick'`` and
    ``'team'``. The three columns are right-padded with -1 (or truncated) to
    ``maxlen``. Row ``i`` of the feature matrix is the hero id at step ``i``
    followed by the pick/ban flags and the team flags, both shifted left by
    ``i`` positions (see ``encode_picks``), giving ``1 + 2 * maxlen`` columns.

    Args:
        sequence: Iterable of dict-like entries for one draft.
        maxlen: Number of steps every draft is padded/truncated to.
        is_pad: Accepted for interface compatibility; padding is always
            applied and this flag currently has no effect.

    Returns:
        ``(x, y)`` where ``x`` holds rows ``0 .. maxlen-2`` and ``y`` holds the
        hero id of the following step for each of those rows.
    """
    hero     = padding([s['hero_id'] for s in sequence], maxlen=maxlen, padding='post', value=-1.)
    pick_ban = padding([int(s['is_pick']) for s in sequence], maxlen=maxlen, padding='post', value=-1.)
    team     = padding([int(s['team']) for s in sequence], maxlen=maxlen, padding='post', value=-1.)

    placeholder = np.array([
        np.array([hero[i]] + encode_picks(i, pick_ban) + encode_picks(i, team))
        for i in range(maxlen)
    ])
    x = placeholder[:-1]
    # Targets are the hero ids one step ahead of each input row. The previous
    # slice `[:1, 0]` returned only the first hero id instead.
    y = placeholder[1:, 0]

    return x,y


def process_hero(sequence):
    """Build hero-id input/target lists for next-hero prediction.

    Args:
        sequence: Indexable collection of entries exposing a ``'hero_id'`` key.

    Returns:
        ``(x, y)``: ``x`` is the hero id of every entry except the last and
        ``y`` is the hero id of every entry except the first, so ``y[k]`` is
        the hero that follows ``x[k]``.
    """
    inputs, targets = [], []
    for step in range(1, len(sequence)):
        inputs.append(sequence[step - 1]['hero_id'])
        targets.append(sequence[step]['hero_id'])
    return inputs, targets
    

In [18]:
def to_dataset(data):
    """Run ``process_training`` over every draft in ``data``.

    Args:
        data: Iterable of drafts; each item is passed to ``process_training``
            unchanged. Progress is reported through ``tqdm``.

    Returns:
        ``(X, Y)``: two lists holding, in input order, the feature arrays and
        the target arrays returned for each draft.
    """
    pairs = [process_training(draft) for draft in tqdm(data)]
    X = [features for features, _ in pairs]
    Y = [targets for _, targets in pairs]
    return X, Y

In [118]:
class Generator(object):
    """Iterable that yields batches of ``(x, y)`` arrays built from drafts.

    Each draft is a sequence of entries exposing the keys ``'hero_id'``,
    ``'is_pick'`` and ``'team'``. Iterating the object yields
    ``len(self) // batch_size`` batches; drafts left over after the last full
    batch are dropped.

    Args:
        list_of_sequence: Collection of drafts. It is copied into a list, so
            shuffling never reorders the caller's object.
        pad: Whether drafts are padded/truncated to a fixed length when a
            batch is built. With ``pad=False`` the drafts of one batch must
            share the same length to stack into a regular array.
        shuffle: Shuffle the drafts once at construction time.
        batch_size: Number of drafts per yielded batch (must be >= 1).
    """

    def __init__(self, list_of_sequence: list, pad: bool = True, shuffle: bool = True, batch_size=10):
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1, got {!r}".format(batch_size))
        sequences = list(list_of_sequence)
        if shuffle:
            # np.random.shuffle works in place and returns None, so its result
            # must not be assigned.
            np.random.shuffle(sequences)
        self.list_of_sequence = sequences
        self.pad = pad
        self.batch_size = batch_size

    def padding(self, sequence, maxlen, padding='pre', value=0.0):
        """Truncate or pad a list to exactly ``maxlen`` items.

        Lists of length ``maxlen`` or more keep their first ``maxlen`` items.
        Shorter lists get ``value`` added before (``'pre'``) or after
        (``'post'``) the data.

        Raises:
            ValueError: If padding is needed and ``padding`` is neither
                ``'pre'`` nor ``'post'``.
        """
        if len(sequence) >= maxlen:
            return sequence[:maxlen]
        pad_array = [value] * (maxlen - len(sequence))
        if padding == 'pre':
            return pad_array + sequence
        if padding == 'post':
            return sequence + pad_array
        raise ValueError("padding must be 'pre' or 'post', got {!r}".format(padding))

    def encode_pick(self, current, sequence):
        """Return ``sequence[current:]`` cast to ints, followed by ``current`` copies of -1."""
        encoder = [int(x) for x in sequence[current:]] + [-1] * (current)
        return encoder

    def __len__(self):
        """Number of drafts held (not the number of batches)."""
        return len(self.list_of_sequence)

    def process_training(self, sequence, maxlen=21, is_pad=True):
        """Convert one draft into next-hero training arrays.

        Row ``i`` of the feature matrix is the hero id at step ``i`` followed
        by the pick/ban flags and the team flags, both shifted left by ``i``
        positions via ``encode_pick``.

        Args:
            sequence: One draft (iterable of dict-like entries).
            maxlen: Fixed length used when ``is_pad`` is true.
            is_pad: Pad/truncate the columns to ``maxlen`` with -1. When
                false the draft keeps its own length.

        Returns:
            ``(x, y)``: ``x`` is every row but the last; ``y`` is the hero id
            of the step following each row of ``x``.
        """
        hero = [s['hero_id'] for s in sequence]
        pick_ban = [s['is_pick'] for s in sequence]
        team = [s['team'] for s in sequence]
        if is_pad:
            hero = self.padding(hero, maxlen=maxlen, padding='post', value=-1.)
            pick_ban = self.padding(pick_ban, maxlen=maxlen, padding='post', value=-1.)
            team = self.padding(team, maxlen=maxlen, padding='post', value=-1.)

        # Iterate over the actual column length: equal to maxlen when padded,
        # and the draft's own length otherwise (range(maxlen) would overrun
        # short unpadded drafts).
        placeholder = np.array([
            np.array([hero[i]] + self.encode_pick(i, pick_ban) + self.encode_pick(i, team))
            for i in range(len(hero))
        ])
        x = placeholder[:-1]
        y = placeholder[1:, 0]
        return x, y

    def __iter__(self):
        """Yield ``(np.array(x_batch), np.array(y_batch))`` for each full batch."""
        total_batches = len(self) // self.batch_size
        for i in range(total_batches):
            batch = self.list_of_sequence[i * self.batch_size: (i + 1) * self.batch_size]
            seq = [self.process_training(arr, is_pad=self.pad) for arr in batch]
            x, y = zip(*seq)
            yield np.array(x), np.array(y)
        
            
        

        
        
        

In [141]:
# First five picks_bans entries as a NumPy array, for quick experiments.
test = df['picks_bans'].iloc[:5].values

In [149]:
def p_t(sequence):
    """Unpadded variant of the training-array builder for a single draft.

    Row ``i`` holds the hero id at step ``i`` followed by the pick/ban flags
    and the team flags, each shifted left by ``i`` positions through
    ``encode_picks``; a draft of ``n`` entries therefore gives ``2 * n + 1``
    columns.

    Args:
        sequence: One draft; entries expose ``'hero_id'``, ``'is_pick'`` and
            ``'team'``.

    Returns:
        ``(x, y)``: ``x`` is every row but the last, ``y`` is the hero id of
        the step that follows each row of ``x``.
    """
    hero_ids, pick_flags, team_flags = [], [], []
    for entry in sequence:
        hero_ids.append(entry['hero_id'])
        pick_flags.append(int(entry['is_pick']))
        team_flags.append(int(entry['team']))

    rows = []
    for step in range(len(sequence)):
        row = [hero_ids[step]] + encode_picks(step, pick_flags) + encode_picks(step, team_flags)
        rows.append(np.array(row))
    placeholder = np.array(rows)

    return placeholder[:-1], placeholder[1:, 0]
    
    

In [155]:
# Run the unpadded builder p_t over the first ten drafts and print each feature
# matrix's shape; the printed shapes differ per draft because nothing is padded.
test = df['picks_bans'][:10].values
mylist = []  # collects the per-draft feature matrices
for i in test:
    x,y = p_t(i)
    mylist.append(x)
    print(x.shape)

(15, 33)
(14, 31)
(16, 35)
(17, 37)
(17, 37)
(17, 37)
(20, 43)
(18, 39)
(20, 43)
(20, 43)


In [122]:
# Scratch list for trying out slicing; not used by any other cell visible here.
arr = ['this','is','cat']
# arr[:1]

In [132]:
# np.array(x).shape

In [124]:
def generator():
    """Yield one ``(x, y)`` training pair per draft in ``df['picks_bans']``.

    Reads the notebook-level ``df`` at iteration time and passes each draft
    through ``process_training``.
    """
    for seq in df['picks_bans']:
        # The stray bare `next` statement that followed the yield was a no-op
        # (it only evaluated the builtin) and has been removed.
        yield process_training(seq)
        
def generator_1(seq):
    """Yield one ``(x, y)`` training pair per draft in ``seq``.

    Args:
        seq: Iterable of drafts; each one is passed to ``process_training``.
    """
    for s in seq:
        # The stray bare `next` statement that followed the yield was a no-op
        # and has been removed.
        yield process_training(s)


In [133]:
# g = generator_1(df['picks_bans'])

In [127]:
# Build hero-id-only inputs/targets for every draft with process_hero:
# tr0[k] is draft k without its last hero, target1[k] is draft k without its first.
tr0 = []
target1 = []
for seq in tqdm(df['picks_bans']):
    x,y = process_hero(seq)
    tr0.append(x)
    target1.append(y)

100%|██████████| 10000/10000 [00:00<00:00, 138661.55it/s]


In [None]:
# Pad the hero-id input lists at the end ('post') to a common length of 21.
# NOTE(review): only tr0 is padded here; target1 is left ragged — confirm intent.
tr01 = tf.keras.preprocessing.sequence.pad_sequences(tr0,maxlen=21,padding='post')

In [128]:
# Wrap a generator over df['picks_bans'] as a tf.data.Dataset of (int32, int32) pairs.
# NOTE(review): `MyGenerator` is not defined in any cell visible in this notebook
# (the class defined earlier is named `Generator`), so this cell raises NameError
# on a fresh kernel — confirm which callable is intended.
tfd1 = tf.data.Dataset.from_generator(generator=partial(MyGenerator,df['picks_bans']),output_types=( tf.int32, tf.int32))

In [None]:
# Smoke test: pull a single element from the dataset and print it.
for i in tfd1.take(1):
    print(i)

In [134]:
def build_model(timesteps=20, n_features=43, n_classes=130, units=100):
    """Build and compile the per-step hero classifier.

    The network is an LSTM that returns its full output sequence, followed by
    a Dense layer applied at every step that produces a probability
    distribution over ``n_classes`` hero ids.

    Args:
        timesteps: Number of steps in each input sequence.
        n_features: Number of features per step.
        n_classes: Size of the output distribution.
        units: Number of LSTM units.

    Returns:
        A compiled ``tf.keras.Sequential`` model. The defaults reproduce the
        original fixed architecture (input ``(20, 43)``, 100 units, 130 classes).
    """
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(units=units, input_shape=(timesteps, n_features), return_sequences=True),
        # sparse_categorical_crossentropy defaults to from_logits=False, i.e. it
        # expects probabilities, so the output layer needs a softmax. Without
        # it the loss was computed on raw, unnormalized activations.
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ])

    # NOTE(review): targets padded with -1 fall outside [0, n_classes) — confirm
    # how padded steps are meant to be handled before training.
    model.compile(loss=[tf.keras.losses.sparse_categorical_crossentropy],metrics=[tf.keras.metrics.sparse_categorical_accuracy])
    return model

In [135]:
# Instantiate the compiled model returned by build_model.
model = build_model()
# model(a)

In [137]:
# model.fit_generator(tfd1,epochs=5)