In [1]:
import os
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
from glob import glob
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_sched
from typing import Any, Dict, List, Optional, Tuple, Union

from sklearn.model_selection import KFold

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Number of consecutive frames that make up one segment
# (convert_data reads frames [start, start + SEGMENT_LEN)).
SEGMENT_LEN = 5
# Batch size used by every DataLoader in this notebook.
BATCH_SIZE = 24
# Number of K-fold splits used by get_train_val_split.
NUM_FOLDS  = 10
# Input width of ClassifierModel; the data description in this notebook lists
# 1024 rgb + 128 audio values per frame.
FEATURES_DIM    = 1152
# Output width of ClassifierModel; the vocabulary loaded in this notebook has 1000 entries.
NUM_CLASSES     = 1000

# Adam hyper-parameters.
LEARNING_RATE   = 0.00011729283760398037
WEIGHT_DECAY    = 0.0011412688966608406

# ReduceLROnPlateau settings (factor, patience, min_lr, threshold).
LR_FACTOR       = 0.1
LR_PATIENCE     = 3
LR_MINIMUM      = 3e-7
LR_THRESHOLD    = 1e-3

# Upper bound on training epochs in train_model.
NUM_EPOCHS      = 25
# train_epoch prints a progress line every LOG_FREQ batches.
LOG_FREQ        = 1000

# Data analysis

File descriptions
frame-level data

Total size of 1.53TB (Large file warning!)
Each video has

a. id: unique id for the video. In the train set it is a YouTube video id; in the test/validation sets the ids are anonymized.

b. labels: list of labels of that video.

c. rgb: each frame has a float array of length 1024.

d. audio: each frame has a float array of length 128.

A subset of the validation-set videos is provided with segment-level labels. In addition to id, labels, and the frame-level features described above, these videos come with:

a. segment_start_times: list of segment start times.

b. segment_end_times: list of segment end times.

c. segment_labels: list of segment labels.

d. segment_scores: list of binary values indicating whether the corresponding segment label is positive or negative.

vocabulary.csv - the full data dictionary for label names and their descriptions

IMPORTANT: In order to minimize submission file sizes, for segment predictions, you should only include the video id and the segment start time, but not the segment end time. (These are not needed, since all segments are 5 seconds in duration.)

In [3]:
# Folder that holds the downloaded .tfrecord shards (machine-specific absolute path).
data_path=r"C:\Users\liaox\yt8m\dataset_ori"
# Bare file names (not full paths); os.listdir returns them in arbitrary order.
data_listdir =os.listdir(data_path)
print(data_listdir)

['test0397.tfrecord', 'test1553.tfrecord', 'test2108.tfrecord', 'test3134.tfrecord', 'train0024.tfrecord', 'train0052.tfrecord', 'train0259.tfrecord', 'train0267.tfrecord', 'train0455.tfrecord', 'train0523.tfrecord', 'train0638.tfrecord', 'train0676.tfrecord', 'train0701.tfrecord', 'train0780.tfrecord', 'train0797.tfrecord', 'train0842.tfrecord', 'train0876.tfrecord', 'train0955.tfrecord', 'train0965.tfrecord', 'train1116.tfrecord', 'train1169.tfrecord', 'train1210.tfrecord', 'train1219.tfrecord', 'train1395.tfrecord', 'train1644.tfrecord', 'train1704.tfrecord', 'train1769.tfrecord', 'train1831.tfrecord', 'train1932.tfrecord', 'train1937.tfrecord', 'train2094.tfrecord', 'train2164.tfrecord', 'train2394.tfrecord', 'train2501.tfrecord', 'train2517.tfrecord', 'train2525.tfrecord', 'train3043.tfrecord', 'train3115.tfrecord', 'train3182.tfrecord', 'train3343.tfrecord', 'train3432.tfrecord', 'train3489.tfrecord', 'train3731.tfrecord']


In [4]:
# Pick the shard to inspect deterministically. os.listdir gives no ordering
# guarantee, and the folder mixes test and train shards, so `data_listdir[0]`
# may be a test shard; the cells below inspect a train shard. Fall back to the
# first file by name when no train shard is present.
_shard_names = sorted(data_listdir)
frame_lvl_record = os.path.join(
    data_path,
    next((name for name in _shard_names if name.startswith('train')), _shard_names[0]),
)
print(frame_lvl_record)

C:\Users\liaox\yt8m\dataset\train0024.tfrecord


In [5]:
# Collect the video id and video-level labels of every record in the shard.
vid_ids = []
labels = []

# tf.python_io.tf_record_iterator is deprecated; with eager execution enabled a
# TFRecordDataset yields the same serialized records as scalar string tensors.
for record in tf.data.TFRecordDataset(frame_lvl_record):
    tf_example = tf.train.Example.FromString(record.numpy())
    features = tf_example.features.feature
    vid_ids.append(features['id'].bytes_list.value[0].decode(encoding='UTF-8'))
    # Copy into a plain list so the result does not hold on to protobuf containers.
    labels.append(list(features['labels'].int64_list.value))

print('Number of videos in this tfrecord: ',len(vid_ids))
print ('Number of labels in this tfrecord: ', len (labels))

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
Number of videos in this tfrecord:  16
Number of labels in this tfrecord:  16


In [7]:
# Decode the per-frame rgb/audio features of the FIRST video in the shard only
# (this cell is a quick look at one record, hence `.take(1)`).
feat_rgb = []
feat_audio = []

# No tf.Session is needed: eager execution is enabled, so decode_raw/cast run
# immediately and `.numpy()` returns the values. The original opened an
# InteractiveSession per record that was never used and leaked on any exception.
for record in tf.data.TFRecordDataset(frame_lvl_record).take(1):
    tf_seq_example = tf.train.SequenceExample.FromString(record.numpy())
    frame_lists = tf_seq_example.feature_lists.feature_list
    n_frames = len(frame_lists['audio'].feature)

    rgb_frame = []
    audio_frame = []
    # iterate through frames; each frame stores its features as raw bytes
    for i in range(n_frames):
        rgb_frame.append(tf.cast(
            tf.decode_raw(frame_lists['rgb'].feature[i].bytes_list.value[0], tf.uint8),
            tf.float32).numpy())
        audio_frame.append(tf.cast(
            tf.decode_raw(frame_lists['audio'].feature[i].bytes_list.value[0], tf.uint8),
            tf.float32).numpy())

    feat_audio.append(audio_frame)
    feat_rgb.append(rgb_frame)



In [8]:
# feat_rgb[0] holds one decoded array per frame of the first video.
print('The first video has %d frames' %len(feat_rgb[0]))

The first video has 215 frames


In [16]:

# Label vocabulary table.
# NOTE(review): this path differs from the vocabulary path used by convert_data —
# confirm both point at the same file.
vocabulary = pd.read_csv(r'C:\Users\liaox\yt8m\dataset\vocabulary.csv')
vocabulary.head()

Unnamed: 0,Index,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
0,3,378135,/m/01jddz,Concert,https://en.wikipedia.org/wiki/Concert,Arts & Entertainment,,,A concert is a live music performance in front...
1,7,200813,/m/0k4j,Car,https://en.wikipedia.org/wiki/Car,Autos & Vehicles,,,"A car is a wheeled, self-powered motor vehicle..."
2,8,181579,/m/026bk,Dance,https://en.wikipedia.org/wiki/Dance,Arts & Entertainment,,,Dance is a performance art form consisting of ...
3,11,135357,/m/02wbm,Food,https://en.wikipedia.org/wiki/Food,Food & Drink,,,Food is any substance consumed to provide nutr...
4,12,130835,/m/02vx4,Association football,https://en.wikipedia.org/wiki/Association_foot...,Sports,,,"Association football, more commonly known as f..."


In [17]:
# Column dtypes and non-null counts of the vocabulary table.
vocabulary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Index             1000 non-null   int64 
 1   TrainVideoCount   1000 non-null   int64 
 2   KnowledgeGraphId  1000 non-null   object
 3   Name              988 non-null    object
 4   WikiUrl           988 non-null    object
 5   Vertical1         1000 non-null   object
 6   Vertical2         153 non-null    object
 7   Vertical3         12 non-null     object
 8   WikiDescription   988 non-null    object
dtypes: int64(2), object(7)
memory usage: 70.4+ KB


# Convert data to a .npy feature array and a CSV index


In [19]:
def convert_data(prefix: str, wildcard: str,
                 vocabulary_path: str = r'C:\Users\liaox\yt8m_ml\code\youtube-8m\vocabulary.csv'
                 ) -> Tuple[str, str]:
    '''
    Extract every labelled segment from the matching .tfrecord shards and cache it on disk.

    For each annotated segment, SEGMENT_LEN consecutive frames are read starting at the
    segment start time; each frame's raw rgb and audio bytes are concatenated into one
    uint8 vector.

    Args:
        prefix: file-name prefix of the two output files.
        wildcard: glob pattern selecting the .tfrecord shards to convert.
        vocabulary_path: CSV with an `Index` column listing the label ids; a label's
            row position in this file becomes its class index.

    Returns:
        (features_path, ids_path): the .npy file holding an array of shape
        (num_segments, SEGMENT_LEN, feature_dim) and the CSV with one row per segment
        (columns all_ids, all_labels, all_scores, labels). Both are written into the
        notebook-level `data_path` folder.

    Raises:
        ValueError: if no segment was extracted (no matching files or no usable videos).
    '''
    print('converting', wildcard)
    all_files = sorted(glob(wildcard))

    all_ids = []
    all_labels = []
    all_scores = []
    all_features_list = []

    for tfrec_file in all_files:
        # TFRecordDataset replaces the deprecated tf.python_io.tf_record_iterator.
        for record in tf.data.TFRecordDataset(tfrec_file):
            # Records are SequenceExamples: video-level fields live in `context`,
            # per-frame fields in `feature_lists`. One parse serves both.
            tf_seq_example = tf.train.SequenceExample.FromString(record.numpy())
            context = tf_seq_example.context.feature
            frame_lists = tf_seq_example.feature_lists.feature_list

            video_id = context['id'].bytes_list.value[0].decode(encoding='utf-8')
            seg_start = list(context['segment_start_times'].int64_list.value)
            seg_labels = list(context['segment_labels'].int64_list.value)
            seg_scores = list(context['segment_scores'].float_list.value)

            num_frames = len(frame_lists['audio'].feature)

            # A segment needs frames [start, start + SEGMENT_LEN). Checking only
            # `start > num_frames` let segments run past the last frame and crash
            # with IndexError below, so test the segment END instead.
            if any(start + SEGMENT_LEN > num_frames for start in seg_start):
                print('skipping video', video_id, 'file', tfrec_file)
                continue

            for segment, label, score in zip(seg_start, seg_labels, seg_scores):
                features = []

                for frame in range(segment, segment + SEGMENT_LEN):
                    rgb = tf.decode_raw(
                        frame_lists['rgb'].feature[frame].bytes_list.value[0],
                        tf.uint8).numpy()
                    audio = tf.decode_raw(
                        frame_lists['audio'].feature[frame].bytes_list.value[0],
                        tf.uint8).numpy()

                    features.append(np.concatenate([rgb, audio]))

                all_ids.append(video_id)
                all_labels.append(label)
                all_scores.append(score)
                all_features_list.append(np.stack(features))

    if not all_features_list:
        raise ValueError(f'no segments were extracted for wildcard {wildcard!r}')

    all_features = np.stack(all_features_list)
    print(all_features.shape)

    # Map raw label ids to dense class indices: the position of a label id in the
    # vocabulary's `Index` column is its class index.
    labels_table = pd.read_csv(vocabulary_path).Index
    encode_table = np.zeros(np.amax(labels_table) + 1, dtype=int)
    for i, index in enumerate(labels_table):
        encode_table[index] = i

    labels = encode_table[all_labels]

    features_path=os.path.join(data_path, f'{prefix}_features.npy')
    print('writing features to the disk,',features_path)
    np.save(features_path, all_features)

    ids_path=os.path.join(data_path, f'{prefix}_ids.csv')
    print('writing labels to the disk,',ids_path)
    id_table=pd.DataFrame({'all_ids':all_ids,'all_labels':all_labels,'all_scores':all_scores,'labels':labels})
    id_table.to_csv(ids_path,index=False,sep=',')

    return features_path,ids_path



In [20]:


# Glob pattern for the shards to convert (machine-specific absolute path).
train_tfrecord=r"C:\Users\liaox\yt8m\dataset\train*.tfrecord"
print(glob(train_tfrecord))
# Writes the feature array and the CSV index to disk and keeps their paths;
# later cells read `features_path` and `ids_path` as globals.
features_path,ids_path=convert_data('train',train_tfrecord )

['C:\\Users\\liaox\\yt8m\\dataset\\train0024.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0052.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0259.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0267.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0455.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0523.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0638.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0676.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0701.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0780.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0797.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0842.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0876.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0955.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train0965.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train1116.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train1169.tfrecord', 'C:\\Users\\liaox\\yt8m\\dataset\\train1210.tfrecord', 'C:\\User

In [40]:
#tmp
# all_features_list=[np.zeros((1,5,1152))]*314
# print(len(all_features_list))
# all_features = np.concatenate(all_features_list)
# print(all_features.shape)

314
(314, 5, 1152)


## load train data

In [22]:
# Reload the per-segment index written by convert_data and peek at the first rows.
df=pd.read_csv(ids_path) 
print(df.head(5))
# Column views: video id, raw label id, segment score, dense class index.
all_ids, all_labels, all_scores,labels = df['all_ids'],df['all_labels'],df['all_scores'],df['labels']


  all_ids  all_labels  all_scores  labels
0    5way         316         1.0     167
1    5way         316         1.0     167
2    5way         316         1.0     167
3    5way         316         1.0     167
4    5way         316         1.0     167


In [23]:


def dequantize(feat_vector: np.array, max_quantized_value=2, min_quantized_value=-2) -> np.array:
    '''
    Map byte-quantized features back to floats.

    Each of the 256 byte values is treated as one bin of the interval
    [min_quantized_value, max_quantized_value]; the result is offset by
    range / 512 on top of the lower bound.
    '''
    assert max_quantized_value > min_quantized_value
    span = max_quantized_value - min_quantized_value
    step = span / 255.0
    offset = min_quantized_value + span / 512.0
    return feat_vector * step + offset


# PyTorch dataset class for numpy arrays.
class SegmentsDataset(torch.utils.data.Dataset):
    '''
    Map-style dataset over the segment feature array saved in a .npy file.

    Args:
        ids: one video id per sample; its length is the dataset length.
        dataset_mask: boolean mask over ALL rows of the feature file selecting the rows
            that belong to this split (required unless mode is 'test').
        labels: class index per selected sample (required unless mode is 'test').
        scores: segment score per selected sample (required unless mode is 'test').
        features_path: path of the .npy feature array.
        mode: 'train', 'val' or 'test'.

    In 'train' mode the label of every sample whose score is not above 0.5 is set to 0.
    '''
    def __init__(self, ids: np.array, dataset_mask: Optional[np.array], labels: Optional[np.array],
                 scores: Optional[np.array], features_path: str, mode: str) -> None:
        print(f'creating SegmentsDataset in mode {mode}')

        self.ids = ids
        self.scores = scores
        self.mode = mode
        self.labels = labels

        if self.mode != 'test':
            assert dataset_mask is not None and self.scores is not None
            # Row numbers (in the full feature file) of the samples in this split.
            self.features_indices = np.arange(dataset_mask.size)[dataset_mask]

            assert self.labels.shape[0] == self.scores.shape[0]
            assert self.features_indices.size == self.labels.shape[0]
            assert dataset_mask.size >= self.scores.shape[0]

            if self.mode == 'train':
                # Build a NEW array: the original `self.labels *= ...` rewrote the
                # caller's label array in place.
                self.labels = np.where(self.scores > 0.5, self.labels, 0)

        self.features = np.load(features_path)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
        ''' Return (dequantized float32 features, class index); the class index is 0 in test mode. '''
        if self.mode == 'test':
            row = index
        else:
            row = self.features_indices[index]

        x = torch.tensor(dequantize(self.features[row]), dtype=torch.float32)

        if self.mode == 'test':
            return x, 0
        return x, self.labels[index].item()

    def __len__(self) -> int:
        return self.ids.shape[0]


def get_train_val_split(items: List[str], fold: int) -> Tuple[np.array, np.array]:
    '''
    Split `items` into (train, validation) arrays for the given fold.

    Uses a shuffled KFold with NUM_FOLDS splits and a fixed random_state of 0, so
    the same fold number always selects the same items.
    '''
    pool = np.array(items)
    splitter = KFold(NUM_FOLDS, shuffle=True, random_state=0)
    all_folds = list(splitter.split(pool))
    train_positions, val_positions = all_folds[fold]
    return pool[train_positions], pool[val_positions]

def load_train_data(fold: int) -> Any:
    '''
    Build the train and validation DataLoaders for one cross-validation fold.

    Reads the notebook-level globals `ids_path` (CSV index) and `features_path`
    (.npy features) produced by convert_data. The split is done per VIDEO id, so all
    segments of a video land on the same side.

    Returns:
        (train_loader, val_loader). The train loader shuffles and drops the last
        incomplete batch; the validation loader does neither.
    '''
    table = pd.read_csv(ids_path)

    all_ids = np.array(table['all_ids'])
    all_labels = np.array(table['all_labels'])
    all_scores = np.array(table['all_scores'])
    all_labels_index = np.array(table['labels'])
    print(all_ids.shape)
    print(all_labels.shape)

    unique_ids = sorted(set(all_ids))
    unique_train_ids, _ = get_train_val_split(unique_ids, fold)

    # Everything that is not a training video is validation.
    train_mask = np.isin(all_ids, unique_train_ids)
    val_mask = ~train_mask

    # Only the dense class indices (`labels` column) are fed to the datasets; the
    # raw label ids were split in the original code but never used.
    train_dataset = SegmentsDataset(all_ids[train_mask], train_mask,
                                    all_labels_index[train_mask], all_scores[train_mask],
                                    features_path, mode='train')

    val_dataset = SegmentsDataset(all_ids[val_mask], val_mask,
                                  all_labels_index[val_mask], all_scores[val_mask],
                                  features_path, mode='val')

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=0, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=0, drop_last=False)

    return train_loader, val_loader


In [24]:
# Fold used for this run; train_model also reads `fold_num` as a global when
# naming the checkpoint file.
fold_num=0
train_loader, val_loader = load_train_data(fold_num)

(2469,)
(2469,)
creating SegmentsDataset in mode train
creating SegmentsDataset in mode val


## Model

In [25]:

class SwishActivation(nn.Module):
    ''' Swish non-linearity: every element is multiplied by its own sigmoid. '''
    def __init__(self) -> None:
        super().__init__()
        # Kept as a sub-module so it shows up in the printed model structure.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = self.sigmoid(x)
        return gate * x

class ClassifierModel(nn.Module):
    '''
    Segment classifier: averages the features over dimension 1 of the input and
    feeds the result through two Linear -> BatchNorm1d -> Swish stages followed by
    a Linear layer with NUM_CLASSES outputs.
    '''
    def __init__(self) -> None:
        super().__init__()

        hidden_sizes = [2765, 1662]
        stack: List[nn.Module] = []
        in_width = FEATURES_DIM

        for out_width in hidden_sizes:
            stack.extend([
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                SwishActivation(),
            ])
            in_width = out_width

        stack.append(nn.Linear(in_width, NUM_CLASSES))

        self.layers = nn.Sequential(*stack)
        self.avg_pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Swap dims 1 and 2 so the pooling (which reduces the last dim) averages
        # over what was dimension 1 of the input.
        swapped = torch.transpose(x, 1, 2)
        pooled = self.avg_pool(swapped)
        flat = pooled.view(swapped.size(0), -1)
        return self.layers(flat)



In [28]:
# Build the classifier and move it to the GPU; as the last expression of the cell,
# model.cuda() also displays the model structure.
model = ClassifierModel()
model.cuda()

ClassifierModel(
  (layers): Sequential(
    (0): Linear(in_features=1152, out_features=2765, bias=True)
    (1): BatchNorm1d(2765, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SwishActivation(
      (sigmoid): Sigmoid()
    )
    (3): Linear(in_features=2765, out_features=1662, bias=True)
    (4): BatchNorm1d(1662, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): SwishActivation(
      (sigmoid): Sigmoid()
    )
    (6): Linear(in_features=1662, out_features=1000, bias=True)
  )
  (avg_pool): AdaptiveAvgPool1d(output_size=1)
)

## Criterion, optimizer, and learning-rate scheduler

In [30]:
# Loss and optimizer for the classifier.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# mode='max': the scheduler is stepped with the validation score in train_model,
# where higher is better. train_model reads `lr_scheduler` as a global.
lr_scheduler = lr_sched.ReduceLROnPlateau(optimizer, mode='max', factor=LR_FACTOR,
                                    patience=LR_PATIENCE, threshold=LR_THRESHOLD,
                                    min_lr=LR_MINIMUM)

## train model

In [34]:

def average_precision(actuals, predictions, k=None):
    '''
    Average precision of `predictions` against `actuals`.

    Items are ranked by descending prediction; when `k` is given only the top k are
    scored. The normaliser is the total of `actuals` over ALL items (plus 1e-10 to
    avoid division by zero), not just the top k.
    '''
    positives = actuals.sum() + 1e-10

    ranking = np.argsort(predictions)[::-1]
    if k is not None:
        ranking = ranking[:k]

    hits = actuals[ranking]
    ranks = np.arange(1, len(hits) + 1)
    precision_at_rank = np.cumsum(hits) / ranks

    return (precision_at_rank * hits).sum() / float(positives)

def get_model_path(fold_num: int) -> str:
    ''' Path of the best-model checkpoint file for the given fold. '''
    checkpoint_dir = r'C:\Users\liaox\yt8m'
    checkpoint_name = 'best_model_fold_{}.pth'.format(fold_num)
    return os.path.join(checkpoint_dir, checkpoint_name)

class MeanAveragePrecisionCalculator:
    '''
    Classwise MAP@K - metric for Youtube-8M 2019 competition.

    Call accumulate() once per batch of (predictions, actuals[, masks]) arrays with
    one column per class, then call the instance to get the mean of the per-class
    average precisions.
    '''

    def __init__(self, num_classes=NUM_CLASSES, k=10 ** 5):
        self._num_classes = num_classes
        self._k = k
        # One list of per-batch arrays for every class.
        self._predictions = [[] for _ in range(num_classes)]
        self._actuals = [[] for _ in range(num_classes)]

    def accumulate(self, predictions, actuals, masks=None):
        ''' Store one batch; for each class only rows whose mask entry is > 0 are kept. '''
        if masks is None:
            masks = np.ones_like(actuals)

        for i in range(self._num_classes):
            mask = masks[:, i] > 0

            self._predictions[i].append(predictions[:, i][mask])
            self._actuals[i].append(actuals[:, i][mask])

    def __call__(self):
        ''' Mean over all classes of average_precision computed on the stored rows. '''
        # The original also built per-class positive/total counts that were never
        # read; they are dropped here.
        aps = [
            average_precision(np.concatenate(self._actuals[i]),
                              np.concatenate(self._predictions[i]),
                              self._k)
            for i in range(self._num_classes)
        ]

        return np.mean(aps)



class AverageMeter:
    ''' Tracks the latest value and the running weighted average of a series. '''
    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        ''' Forget everything recorded so far. '''
        self.val = self.avg = self.sum = 0.0
        self.count = 0

    def update(self, val: float, n: int = 1) -> None:
        ''' Record `val` with weight `n` and refresh the running average. '''
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def set_lr(optimizer: Any, lr: float) -> None:
    ''' Overwrite the learning rate of every parameter group of `optimizer`. '''
    for group in optimizer.param_groups:
        group.update(lr=lr)

def get_lr(optimizer: Any) -> float:
    ''' Learning rate of the first parameter group; fails an assertion if there is none. '''
    first_group = next(iter(optimizer.param_groups), None)
    assert first_group is not None
    return float(first_group['lr'])


def accuracy(predicts: Any, targets: Any) -> float:
    '''
    Fraction of positions where predictions and targets agree.

    Either argument may be a torch tensor or a numpy array; 2-D inputs are reduced
    to class ids with argmax over axis 1 first. Shapes must match after that.
    '''
    def _as_class_ids(values: Any) -> Any:
        if isinstance(values, torch.Tensor):
            values = values.cpu().numpy()
        if values.ndim == 2:
            values = np.argmax(values, axis=1)
        return values

    predicts = _as_class_ids(predicts)
    targets = _as_class_ids(targets)

    if predicts.shape != targets.shape:
        print(predicts.shape)
        print(targets.shape)
        assert False

    return np.mean(predicts == targets)



def train_epoch(train_loader: Any, model: Any, criterion: Any, optimizer: Any,
                epoch: int, lr_scheduler: Any) -> float:
    '''
    Run one training epoch and return the average batch accuracy.

    Args:
        train_loader: yields (input, target) batches.
        model: network to train; inputs and targets are moved to the device of its
            first parameter.
        criterion: loss called as criterion(output, target).
        optimizer: stepped once per batch.
        epoch: epoch number, used only for logging.
        lr_scheduler: not used here; kept so existing callers keep working.

    Returns:
        Unweighted mean of the per-batch accuracies.
    '''
    print(f'epoch: {epoch}')
    print(f'learning rate: {get_lr(optimizer)}')

    batch_time = AverageMeter()
    losses = AverageMeter()
    avg_score = AverageMeter()

    model.train()
    optimizer.zero_grad()

    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device
    num_steps = len(train_loader)

    end = time.time()

    for i, (input_, target) in enumerate(train_loader):
        input_ = input_.to(device)
        output = model(input_)

        loss = criterion(output, target.to(device))

        predict = torch.argmax(output.detach(), dim=-1)
        avg_score.update(accuracy(predict, target))

        # loss.item() replaces the legacy `loss.data.item()`.
        losses.update(loss.item(), input_.size(0))
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % LOG_FREQ == 0:
            print(f'{epoch} [{i}/{num_steps}]\t'
                        f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        f'loss {losses.val:.4f} ({losses.avg:.4f})\t'
                        f'acc {avg_score.val:.4f} ({avg_score.avg:.4f})')

    print(f' * average acc on train {avg_score.avg:.4f}')
    return avg_score.avg



# NOTE: get_lr is also defined earlier in this cell; this later definition is the
# one in effect once the cell has run.
def get_lr(optimizer: Any) -> float:
    '''
    Return the learning rate of the optimizer's first parameter group.

    Raises:
        ValueError: if the optimizer has no parameter groups. (An explicit raise is
            used because an `assert` is stripped when Python runs with -O, which
            would make this function silently return None.)
    '''
    for param_group in optimizer.param_groups:
        return float(param_group['lr'])

    raise ValueError('optimizer has no parameter groups')


# NOTE: set_lr is also defined earlier in this cell; this later definition is the
# one in effect once the cell has run.
def set_lr(optimizer: Any, lr: float) -> None:
    ''' Assign `lr` as the learning rate of each of the optimizer's parameter groups. '''
    for group in optimizer.param_groups:
        group.update({'lr': lr})
    


def inference(data_loader: Any, model: Any) -> np.array:
    '''
    Returns predictions array.

    Puts the model in eval mode, runs it over every (input, target) batch of
    `data_loader` without gradients, and returns the softmax outputs of all batches
    concatenated along axis 0. Targets are ignored.
    '''
    model.eval()

    # Run on whatever device the model lives on rather than assuming CUDA; a model
    # without parameters gets the inputs unchanged.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    predicts_list = []
    activation = nn.Softmax(dim=1)

    with torch.no_grad():
        for input_, _ in data_loader:
            if device is not None:
                input_ = input_.to(device)
            output = activation(model(input_))
            predicts_list.append(output.cpu().numpy())

    predicts = np.concatenate(predicts_list)
    print('predicts', predicts.shape)
    return predicts



def validate(val_loader: Any, model: Any, epoch: int) -> float:
    '''
    Score the model on the validation loader with the classwise MAP metric.

    Each sample is evaluated only for its own labelled class: the one-hot row of its
    label serves as the mask, and the same row scaled by the sample's score is the
    ground truth.
    '''
    print('validate()')
    val_pred = inference(val_loader, model)

    dataset = val_loader.dataset
    val_true, val_scores = dataset.labels, dataset.scores

    assert val_true.size == val_pred.shape[0]

    # convert to one-hot encoding
    masks = np.eye(NUM_CLASSES)[val_true]
    actuals = masks * np.expand_dims(val_scores, axis=-1)

    metric = MeanAveragePrecisionCalculator()
    metric.accumulate(val_pred, actuals, masks)
    score = metric()

    print(f' * epoch {epoch} validation score: {score:.4f}')
    return score

def train_model(model,criterion,optimizer,train_loader, val_loader) -> float:
    '''
    Train for up to NUM_EPOCHS epochs, checkpointing the best validation score.

    Besides its parameters this function reads two notebook-level globals:
    `lr_scheduler` (stepped with the validation score after every epoch) and
    `fold_num` (used to name the checkpoint file).

    Returns:
        The NEGATED best validation score
        (presumably so a minimising search can consume it — TODO confirm).
    '''
    print('=' * 80)

    last_epoch = -1
    print(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    # NOTE(review): best_epoch is assigned below but never read.
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = None

    for epoch in range(last_epoch + 1, NUM_EPOCHS):
        print('-' * 50)
        lr = get_lr(optimizer)

        # if we have just reduced LR, reload the best saved model
        if lr < last_lr - 1e-10 and best_model_path is not None:
            print(f'learning rate dropped: {lr}, reloading')
            last_checkpoint = torch.load(best_model_path)

            model.load_state_dict(last_checkpoint['state_dict'])
            optimizer.load_state_dict(last_checkpoint['optimizer'])
            print(f'checkpoint loaded: {best_model_path}')
            # The restored optimizer state carries the checkpoint's learning rate,
            # so re-apply the reduced one.
            set_lr(optimizer, lr)
            last_lr = lr

        train_epoch(train_loader, model, criterion, optimizer, epoch, lr_scheduler)
        score = validate(val_loader, model, epoch)

        lr_scheduler.step(metrics=score)

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = get_model_path(fold_num)

            # Model and optimizer state are saved together so both can be restored
            # after a learning-rate drop.
            data_to_save = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }

            torch.save(data_to_save, best_model_path)
            print(f'a snapshot was saved to {best_model_path}')

    print(f'best score: {best_score:.04f}')
    return -best_score

In [35]:
# Runs the full training loop; the cell displays the returned value, which is the
# negated best validation score.
train_model(model,criterion,optimizer,train_loader, val_loader)

training will start from epoch 0
--------------------------------------------------
epoch: 0
learning rate: 0.00011729283760398037
total batches: 92
0 [0/92]	time 3.896 (3.896)	loss 6.9039 (6.9039)	acc 0.0000 (0.0000)
 * average acc on train 0.4706
validate()
predicts (244, 1000)
 * epoch 0 validation score: 0.0334
a snapshot was saved to C:\Users\liaox\yt8m\best_model_fold_0.pth
--------------------------------------------------
epoch: 1
learning rate: 0.00011729283760398037
total batches: 92
1 [0/92]	time 0.005 (0.005)	loss 1.4420 (1.4420)	acc 0.8333 (0.8333)
 * average acc on train 0.8197
validate()
predicts (244, 1000)
 * epoch 1 validation score: 0.0323
--------------------------------------------------
epoch: 2
learning rate: 0.00011729283760398037
total batches: 92
2 [0/92]	time 0.005 (0.005)	loss 0.4533 (0.4533)	acc 0.9583 (0.9583)
 * average acc on train 0.9216
validate()
predicts (244, 1000)
 * epoch 2 validation score: 0.0327
-------------------------------------------------

-0.03341651785601382

In [37]:
# Re-score the model as it stands after training; `epoch` is only used as a label
# in the printed message.
epoch='end'
score = validate(val_loader, model, epoch)

validate()
predicts (244, 1000)
 * epoch end validation score: 0.0326


## predict


In [40]:
def unwrap_video(video_dict):
    ''' Flatten a parsed video dict into a tuple with a fixed field order. '''
    field_order = (
        'id',
        'labels',
        'features',
        'segment_start_times',
        'segment_labels',
        'segment_scores',
    )
    return tuple(video_dict[field] for field in field_order)

def wrap_segment(vid, labels, start_time, features, segment_label, segment_score):
    ''' Bundle the fields of one segment into a dict keyed by field name. '''
    keys = ('id', 'labels', 'start_time', 'features', 'segment_label', 'segment_score')
    values = (vid, labels, start_time, features, segment_label, segment_score)
    return dict(zip(keys, values))

class YouTube8MRecordParser:
    '''
    Helpers that turn .tfrecord files into a tf.data pipeline of fixed-length segments.

    get_video_dataset() yields one parsed dict per video; get_segment_dataset() cuts
    every video into consecutive SEGMENT_LEN-frame segments and yields one dict per
    segment.
    '''
    # Video-level (context) fields of each record.
    context_features = {
        "id": tf.io.FixedLenFeature((), tf.string),
        "labels": tf.io.VarLenFeature(tf.int64),
        "segment_start_times": tf.io.VarLenFeature(tf.int64),
        "segment_end_times": tf.io.VarLenFeature(tf.int64),
        "segment_labels": tf.io.VarLenFeature(tf.int64),
        "segment_scores": tf.io.VarLenFeature(tf.float32)
    }

    # Per-frame fields; each frame stores its features as one raw byte string.
    sequence_features = {
        "rgb": tf.io.FixedLenSequenceFeature([], tf.string),
        "audio": tf.io.FixedLenSequenceFeature([], tf.string),
    }

    @staticmethod
    def parse(proto):
        ''' Parse one serialized record into a dict of tensors, adding a 'features' entry. '''
        sample, sequence_parsed = tf.io.parse_single_sequence_example(
            proto,
            YouTube8MRecordParser.context_features,
            YouTube8MRecordParser.sequence_features
        )

        # Decode the raw bytes of both streams and join them along the last axis.
        sample['features'] = tf.concat([
            tf.decode_raw(sequence_parsed['rgb'], tf.uint8),
            tf.decode_raw(sequence_parsed['audio'], tf.uint8)
        ], axis=-1
        )

        # VarLenFeature fields come back sparse; keep only their values.
        for k, v in sample.items():
            if k == 'labels' or 'segment' in k:
                sample[k] = v.values

        return sample

    @staticmethod
    def to_numpy(eager_sample):
        ''' Convert every eager tensor of a sample dict to numpy. '''
        return {
            k: v.numpy()
            for k, v in eager_sample.items()
        }

    @staticmethod
    def get_video_dataset(tfrecords, num_workers=None):
        ''' Dataset of parsed videos; videos with fewer than 5 frames are dropped. '''
        return tf.data.TFRecordDataset(tfrecords, num_parallel_reads=num_workers)\
            .map(YouTube8MRecordParser.parse, num_parallel_calls=num_workers)\
            .filter(lambda video: tf.math.greater_equal(tf.shape(video['features'])[0], 5))

    @staticmethod
    def _video_to_segments_iterator(vid, labels, features, segment_start_times, segment_labels, segment_scores):
        '''
        Yield one tuple per consecutive SEGMENT_LEN-frame window of the video:
        (vid, labels, start_time, window features, segment_label, segment_score).
        Windows without an annotation at their start time get label and score -1.
        '''
        n_samples = len(features) // SEGMENT_LEN

        # NOTE(review): this requires at least 5 SEGMENTS, while the dataset filter
        # only guarantees at least 5 FRAMES — confirm which is intended.
        assert n_samples >= 5

        for idx in range(n_samples):
            start_time = SEGMENT_LEN * idx

            segment_label = segment_score = -1
            if start_time in segment_start_times:
                # Only the first annotation at this start time is used.
                i = np.where(segment_start_times == start_time)[0][0]
                segment_label = segment_labels[i]
                segment_score = segment_scores[i]

            yield (
                vid,
                labels,
                start_time,
                features[start_time: start_time + SEGMENT_LEN],
                segment_label,
                np.float32(segment_score)
            )

    # Not decorated as a staticmethod; it is only ever called through the class,
    # so it behaves as a plain function.
    def _video_to_segments(*args):
        ''' Collect the segment tuples of one video into six parallel lists (one per field). '''
        result = [[] for _ in range(6)]

        for segment in YouTube8MRecordParser._video_to_segments_iterator(*args):
            for i, value in enumerate(segment):
                result[i].append(value)

        return result

    @staticmethod
    def get_segment_dataset(tfrecords: List[str]) -> Any:
        '''
        Dataset with one dict per segment (see wrap_segment for the keys).

        Each video is split into segments by the Python helper above via tf.py_func;
        the resulting per-field lists are then sliced so that every segment becomes
        its own dataset element.
        '''
        return YouTube8MRecordParser.get_video_dataset(tfrecords, None)\
            .map(lambda video: tf.py_func(
                YouTube8MRecordParser._video_to_segments,
                unwrap_video(video),
                Tout=[tf.string, tf.int64, tf.int32, tf.uint8, tf.int32, tf.float32]),
                num_parallel_calls=None
            )\
            .flat_map(lambda *args: tf.data.Dataset.zip(tuple(
                tf.data.Dataset.from_tensor_slices(k)
                for k in args))
            )\
            .map(
                wrap_segment,
                num_parallel_calls=None
            )

class YouTube8MSegmentDataset(torch.utils.data.IterableDataset):
    '''
    Iterable PyTorch dataset over the segments of a list of .tfrecord files.

    Iterating yields (features, video_id) pairs: `features` is the dequantized
    float32 feature array of one segment and `video_id` the decoded id string.
    '''
    def __init__(self, tfrecords: List[str]) -> None:
        self._dataset = YouTube8MRecordParser.get_segment_dataset(tfrecords)

    # This is a generator, so the original `-> None` annotation was wrong.
    def __iter__(self) -> Any:
        for segment in map(YouTube8MRecordParser.to_numpy, self._dataset):
            features = dequantize(segment['features']).astype(np.float32)
            yield features, segment['id'].decode()

def inference_for_testset(test_predicts: np.array, data_loader: Any, model: Any) -> np.array:
    '''
    Run the model over the test loader and accumulate its predictions in place.

    Args:
        test_predicts: buffer with one row per test sample; the softmax output of
            each sample is ADDED to its row (rows are filled in loader order).
        data_loader: yields (input, ids) batches.
        model: network to evaluate; inputs are moved to the device of its first
            parameter.

    Returns:
        Array with the id of every processed sample, in loader order.

    Raises:
        ValueError: if the loader yields more samples than `test_predicts` has rows.
    '''
    model.eval()

    # Run on whatever device the model lives on rather than assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    ids_list: List[str] = []
    activation = nn.Softmax(dim=1)
    # Running row offset; unlike `i * batch_size` it does not depend on the
    # loader exposing a batch size.
    offset = 0

    with torch.no_grad():
        for input_, ids in data_loader:
            if device is not None:
                input_ = input_.to(device)
            pred = activation(model(input_)).cpu().numpy()

            stop = offset + pred.shape[0]
            if stop > test_predicts.shape[0]:
                raise ValueError(
                    f'test_predicts has {test_predicts.shape[0]} rows but at least {stop} are needed')

            ids_list.extend(ids)
            test_predicts[offset:stop] += pred
            offset = stop

    ids = np.array(ids_list)
    print('ids', ids.shape)
    return ids



def load_test_data(wildcard: str) -> Any:
    ''' DataLoader (unshuffled, BATCH_SIZE per batch) over the segments of all files matching `wildcard`. '''
    tfrecord_paths = glob(wildcard)
    segments = YouTube8MSegmentDataset(tfrecord_paths)
    return torch.utils.data.DataLoader(
        segments, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [39]:
# Loader over every test shard matching the pattern (machine-specific absolute path).
test_loader = load_test_data(r'C:\Users\liaox\yt8m_ml\3\frame\test\test*.tfrecord')


Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


In [44]:
def predict_with_model(model, test_loader, num_segments: int = 2038114,
                       num_classes: int = 1000) -> Tuple[np.array, np.array]:
    '''
    Predict class probabilities for every segment served by `test_loader`.

    Args:
        model: trained classifier.
        test_loader: yields (input, ids) batches.
        num_segments: number of rows of the prediction buffer. The default allocates
            2038114 x 1000 float16 values (about 4 GB); pass the real number of test
            segments when working with a smaller set.
        num_classes: number of columns of the prediction buffer.

    Returns:
        (ids, predictions): the id of every processed segment and the float16 buffer
        of shape (num_segments, num_classes); rows beyond the processed segments
        stay zero.
    '''
    print('predicting on the test set')
    test_predicts = np.zeros((num_segments, num_classes), dtype=np.float16)

    ids = inference_for_testset(test_predicts, test_loader, model)
    return ids, test_predicts

In [45]:
# Segment ids (in loader order) and the matching prediction buffer.
test_ids,test_predicts = predict_with_model(model,test_loader)

predicting on the test set
ids (1976,)
