In [1]:
# Move to the youtube8m directory
import os

# Walk up from the current directory until a directory named 'youtube8m' is
# found. At the filesystem root dirname() returns its argument unchanged, so
# stop there with a clear error instead of looping forever when the notebook
# was started outside the project tree.
_project_dir = os.getcwd()
while os.path.basename(_project_dir) != 'youtube8m':
    _parent_dir = os.path.dirname(_project_dir)
    if _parent_dir == _project_dir:
        raise RuntimeError(
            "Could not find a 'youtube8m' directory at or above " + os.getcwd()
        )
    _project_dir = _parent_dir
os.chdir(_project_dir)
os.getcwd()

'/home/jupyter/ASLOpenProject/youtube8m'

In [2]:
import tensorflow as tf
# Turn on eager execution for the rest of the notebook.
tf.enable_eager_execution()

In [None]:
# Scratch check of tf.gather: select the elements at indices 0 and 1.
tf.gather([5, 6, 7, 8, 9], [0, 1])

In [None]:
from video.data import id_category_id_table
print(id_category_id_table[:10])
# Look up the table entries at positions 1, 2, 3 and 0, then de-duplicate them.
# tf.unique returns a (values, indices) pair; take element 0 instead of
# unpacking into `_`, which would shadow IPython's "last output" variable.
t = tf.unique(tf.gather(id_category_id_table, [1, 2, 3, 0]))[0]
t

In [None]:
# Convert the static size of t's first axis to a plain Python int.
int(t.shape[0])

In [None]:
# Scratch check: truncate t to at most m rows, then pad at the front up to m rows.
t = tf.constant([[1], [2], [3], [4]])
m = 3
# Convert the static dimension to int before subtracting: in TF 1.x
# t.shape[0] is a Dimension object, and the difference is negative here
# (3 - 4), which a Dimension cannot represent.
d = m - int(t.shape[0])
p = max(d, 0)
print(p)
tf.pad(t[:m, :], [[p, 0], [0, 0]])

In [None]:
import tensorflow as tf
# Depth of the one-hot label encoding (used by multi_hot and parse_row).
CLASS_NUM = 3862
# Row width used when decoding the 'rgb' feature.
RGB_DIM = 1024
# Row width used when decoding the 'audio' feature.
AUDIO_DIM = 128
# Number of rows adjust_length truncates/pads every feature sequence to.
MAX_LEN = 300

def multi_hot(indices):
    '''
    Build a multi-hot encoding of depth CLASS_NUM from class indices.

    Every index is one-hot encoded and the encodings are summed over the
    second-to-last axis, so an index occurring k times contributes k.
    '''
    one_hot_rows = tf.one_hot(indices, depth=CLASS_NUM)
    return tf.reduce_sum(one_hot_rows, axis=-2)

def dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
    '''
    Map 8-bit quantized features back to float32 values.

    see: https://github.com/linrongc/youtube-8m/blob/master/utils.py#L28

    :param feat_vector: quantized values; cast to float32 before scaling
    :param max_quantized_value: upper end of the dequantized range
    :param min_quantized_value: lower end of the dequantized range
    :return: feat_vector * (range / 255) + (range / 512 + min_quantized_value)
    '''
    value_span = max_quantized_value - min_quantized_value
    step = value_span / 255.0
    offset = (value_span / 512.0) + min_quantized_value
    as_float = tf.cast(feat_vector, tf.float32)
    return as_float * step + offset

def adjust_length(feature):
    '''
    Force the first axis of a rank-2 feature to exactly MAX_LEN rows.

    Rows beyond MAX_LEN are dropped; a shorter input gets zero rows inserted
    in front of its existing rows.
    '''
    truncated = feature[:MAX_LEN, :]
    row_count = tf.shape(truncated)[0]

    def _pad_front():
        # Pad only the first axis, and only before the existing rows.
        missing = MAX_LEN - row_count
        return tf.pad(truncated, [[missing, 0], [0, 0]])

    def _keep():
        return truncated

    return tf.cond(row_count < MAX_LEN, _pad_front, _keep)

def decode(feature, dim):
    '''
    Convert a byte-string feature into a float32 array of shape [MAX_LEN, dim].

    The raw bytes are read as uint8, reshaped to [len, dim], dequantized, and
    then truncated/padded to MAX_LEN rows.

    :param feature: byte-string tensor holding the quantized feature
    :param dim: number of values per row
    :return: float32 tensor with static shape [MAX_LEN, dim]
    '''
    f = tf.reshape(
        tf.io.decode_raw(feature, tf.uint8),
        [-1, dim],  # [len, dim]
    )
    # Dequantize BEFORE padding: dequantize() maps a quantized 0 to a non-zero
    # float, so padding first would turn the padding rows into non-zero values
    # instead of leaving them at exactly 0.
    f = dequantize(f)
    f = adjust_length(f)
    # adjust_length always yields MAX_LEN rows; record that in the static shape.
    f.set_shape([MAX_LEN, dim])
    return f

def parse_row(row):
    '''
    Parse one serialized SequenceExample into a (features, label) pair.

    :param row: serialized example, as yielded by a TFRecordDataset
    :return: (features, label) where features is a dict with keys 'id',
        'rgb' and 'audio', and label is the multi-hot encoding of the
        example's 'labels' with static shape [CLASS_NUM]
    '''
    # Use the tf.io.* endpoints throughout (the sequence features already did)
    # rather than mixing them with the deprecated top-level aliases.
    context_features = {
        "id": tf.io.FixedLenFeature([], tf.string),
        "labels": tf.io.VarLenFeature(tf.int64),
    }
    sequence_features = {
        "rgb": tf.io.FixedLenSequenceFeature([], dtype=tf.string),
        "audio": tf.io.FixedLenSequenceFeature([], dtype=tf.string),
    }
    context_data, sequence_data = tf.io.parse_single_sequence_example(
        row, context_features, sequence_features
    )
    # 'labels' is parsed as a sparse tensor; densify it before one-hot encoding.
    label = multi_hot(tf.sparse.to_dense(context_data['labels']))
    label.set_shape([CLASS_NUM])
    rgb = decode(sequence_data['rgb'], RGB_DIM)
    audio = decode(sequence_data['audio'], AUDIO_DIM)
    features = {
        'id': context_data['id'],
        'rgb': rgb,
        'audio': audio,
    }
    return features, label


def read_dataset(files_pattern, mode, batch_size=128):
    '''
    Build a batched tf.data pipeline over the TFRecord files matching a pattern.

    :param files_pattern: glob pattern resolved with tf.io.gfile.glob
    :param mode: a tf.estimator.ModeKeys value; TRAIN shuffles (buffer of
        batch_size * 10) and repeats indefinitely, any other mode makes a
        single pass
    :param batch_size: number of parsed rows per batch
    :return: dataset of batched parse_row outputs, prefetched with AUTOTUNE
    '''
    autotune = tf.data.experimental.AUTOTUNE
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    records = tf.data.TFRecordDataset(tf.io.gfile.glob(files_pattern))
    parsed = records.map(parse_row, num_parallel_calls=autotune)
    if is_training:
        parsed = parsed.shuffle(batch_size * 10).repeat()
    else:
        parsed = parsed.repeat(1)
    return parsed.batch(batch_size).prefetch(buffer_size=autotune)


In [None]:
# Smoke test: build the TRAIN pipeline over a single shard with a small batch
# size and pull the first batch from it.
ds = read_dataset(
    'gs://asl-mixi-project-bucket/data/youtube-8m-frame/train/train0000.tfrecord',
    tf.estimator.ModeKeys.TRAIN,
    batch_size=3,
)
next(iter(ds))

# Dataset

In [3]:
# NOTE(review): this rebinds adjust_length and dequantize, replacing the
# versions defined earlier in this notebook. The calls below pass max_len and
# unpack a (tensor, length) pair, which the notebook-local adjust_length does
# not support.
from frame.data import adjust_length, dequantize

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Probe: rank-2 input with fewer rows (3) than max_len (5).
adjust_length(tf.constant([[1], [2], [3]]), max_len=5)

In [None]:
# Probe: the same values wrapped in an extra leading axis (rank-3 input).
adjust_length(tf.constant([[[1], [2], [3]]]), max_len=5)

In [12]:
# Dequantize a 3-row uint8 feature first, then adjust it to max_len rows.
f = tf.constant([[100], [102], [103]], dtype=tf.uint8)
dim = 1
max_len = 5
f = dequantize(f)
# This adjust_length returns the adjusted tensor together with a length value.
f, length = adjust_length(f, max_len)
# Pin the static shape to [max_len, dim].
f.set_shape([max_len, dim])
print(f, length)

tf.Tensor(
[[ 0.        ]
 [ 0.        ]
 [-0.4235599 ]
 [-0.39218736]
 [-0.37650108]], shape=(5, 1), dtype=float32) tf.Tensor(3, shape=(), dtype=int32)


In [None]:
import sys
# Try tf.print inside an explicit graph and session. The identity op is created
# under a control dependency on the print op, so running it also runs the print.
with tf.Graph().as_default():
    c = tf.constant([1] * 100)
    # summarize=-1 asks tf.print for every element instead of a summary.
    p = tf.print(c, summarize=-1)
    with tf.control_dependencies([p]):
        c = tf.identity(c)
    with tf.Session() as sess:
        sess.run(c)

In [None]:
# NOTE(review): as written this call supplies neither a dropout rate nor an
# input tensor — presumably a signature probe; confirm or remove.
tf.keras.layers.Dropout()()

# Model

In [None]:
from frame.attention.common_layer import FeedForwardNetwork, ResidualNormalizationWrapper, LayerNormalization
from frame.attention.embedding import AddPositionalEncoding
from frame.attention.attention import SelfAttention


class AttentionModel(tf.keras.models.Model):
    '''
    Self-attention encoder over per-frame visual and audio features.

    The two feature streams are concatenated along the last axis, projected to
    hidden_dim, given positional encodings, passed through hopping_num
    (self-attention, feed-forward) blocks, normalized, and the vector at
    position 0 is projected to output_dim.
    '''

    def __init__(self, params, *args, **kwargs) -> None:
        '''
        :param params: dict of hyper-parameters. 'output_dim' is required;
            'hopping_num' (default 6), 'head_num' (default 8),
            'hidden_dim' (default 512) and 'dropout' (default 0.1) are optional.
        '''
        super().__init__(*args, **kwargs)
        hopping_num = params.get('hopping_num', 6)
        head_num = params.get('head_num', 8)
        # Round hidden_dim down to a multiple of head_num.
        hidden_dim = (params.get('hidden_dim', 512) // head_num) * head_num
        dropout_rate = params.get('dropout', 0.1)

        self.input_dense = tf.keras.layers.Dense(hidden_dim, activation='relu')
        self.add_position_embedding = AddPositionalEncoding()
        self.input_dropout_layer = tf.keras.layers.Dropout(dropout_rate)

        # One [attention, feed-forward] pair per hop, each wrapped in a
        # ResidualNormalizationWrapper.
        self.attention_block_list = []
        for _ in range(hopping_num):
            attention_layer = SelfAttention(hidden_dim, head_num, dropout_rate, name='self_attention')
            ffn_layer = FeedForwardNetwork(hidden_dim, dropout_rate, name='ffn')
            self.attention_block_list.append([
                ResidualNormalizationWrapper(attention_layer, dropout_rate, name='self_attention_wrapper'),
                ResidualNormalizationWrapper(ffn_layer, dropout_rate, name='ffn_wrapper'),
            ])
        self.output_normalization = LayerNormalization()
        self.output_layer = tf.keras.layers.Dense(params['output_dim'])

    def call(
            self,
            visual_feature,
            audio_feature,
            training=None,
    ):
        '''
        Run the model.

        :param visual_feature: shape = [batch_size, length, dim]
        :param audio_feature: shape = [batch_size, length, dim]
        :param training: True while training
        :return: shape = [batch_size, output_dim]
        '''
        features = tf.concat([visual_feature, audio_feature], axis=-1)
        # Build the padding mask from the raw features: after the dense
        # projection (bias + ReLU) an all-zero padding frame is no longer
        # guaranteed to stay all-zero, so the mask must not be derived from
        # the projected tensor.
        self_attention_mask = self._create_enc_attention_mask(features)
        projected = self.input_dense(features)
        embedded_input = self.add_position_embedding(projected)
        query = self.input_dropout_layer(embedded_input, training=training)

        for i, layers in enumerate(self.attention_block_list):
            attention_layer, ffn_layer = layers
            with tf.name_scope('hopping_{}'.format(i)):
                query = attention_layer(query, attention_mask=self_attention_mask, training=training)
                query = ffn_layer(query, training=training)
        # [batch_size, length, hidden_dim]
        attention_output = self.output_normalization(query)
        # NOTE(review): only position 0 is used as the sequence summary. The
        # adjust_length output shown elsewhere in this notebook has its zero
        # rows at the front, so position 0 may be a padding frame — confirm
        # this is intended.
        return self.output_layer(attention_output[:, 0, :])

    def _create_enc_attention_mask(self, encoder_input: tf.Tensor):
        '''
        Build a boolean mask that is True where a frame's features sum to 0.

        :param encoder_input: shape = [batch_size, length, dim]
        :return: bool tensor of shape [batch_size, 1, 1, length]
        '''
        with tf.name_scope('enc_attention_mask'):
            encoder_input = tf.reduce_sum(encoder_input, axis=-1)  # [batch_size, length]
            batch_size, length = tf.unstack(tf.shape(encoder_input))
            pad_array = tf.equal(encoder_input, 0.0)  # [batch_size, m_length]
            # Shape broadcasting turns this into
            # [batch_size, head_num, (m|q)_length, m_length].
            return tf.reshape(pad_array, [batch_size, 1, 1, length])

In [None]:
# Smoke test with default hyper-parameters: batch of 4, length 3, 2 features
# per stream.
AttentionModel({'output_dim': 12})(tf.ones([4, 3, 2]), tf.ones([4, 3, 2]))

In [None]:
import csv
from collections import Counter

# Count how many vocabulary rows fall into each category (column index 5).
# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are parsed correctly.
with open('./video/vocabulary.csv', 'r', newline='') as c:
    reader = csv.reader(c)
    print(next(reader))  # header row
    categories = Counter(r[5] for r in reader)
sorted(categories.keys())

In [None]:
import io

# Parse only the first record of a small in-memory CSV document.
text = 'aa,bb,cc\ndd,ee,cc\n'
with io.StringIO(text) as f:
    first_row = next(csv.reader(f))
    print(first_row)

In [None]:
import csv

# Load the vocabulary rows (header skipped). Show the row count and a short
# sample rather than printing every row, which floods the notebook output.
with open('./video/vocabulary.csv', 'r', newline='') as c:
    reader = csv.reader(c)
    next(reader)  # skip the header row
    rows = list(reader)
print(len(rows))
rows[:5]


# Weighted loss

In [None]:
batch_size = 4
class_num = 8
category_num = 3
# Per-output weights with shape [1, class_num + category_num]: 1.0 for the
# first class_num outputs, 2.0 for the remaining category_num outputs.
weight = tf.constant([[1.0] * class_num + [2.0] * category_num])
t = tf.ones([batch_size, class_num + category_num])
# The size-1 leading axis of weight broadcasts over the batch axis of t.
t * weight

In [None]:
# NOTE(review): `loss_weights` is not defined in any cell shown here —
# presumably left over from an earlier kernel session; confirm its origin.
loss_weights[:class_num] 

In [None]:
# NOTE(review): eager execution is enabled at the top of this notebook, and the
# TF 1.x tf.metrics.* functions are graph-mode streaming metrics — confirm this
# call is supported under eager execution.
tf.metrics.accuracy(labels=[1.0, 2.0, 3.0], predictions=[1.0, 2.0, 2.0])