# Video Summarizer 모델 학습/평가 코드

In [1]:
import os
import math
import numpy as np

## 경로 설정

In [15]:
dataset_dir = 'dataset14_sl0.5_vsr2_vw80_vh80_asr44100_mfcc'
ckpt_dir = 'checkpoints'

## 데이터 로더 생성

In [16]:
from data_loader import DataLoader

data_loader = DataLoader(dataset_dir, x_includes=['video', 'audio'])

data_config = data_loader.get_metadata()['config']
input_shape_dict = data_loader.get_metadata()['data_shape']
class_counts = data_loader.all_segment_df['label'].value_counts(sort=False)

In [17]:
video_input_shape = input_shape_dict['video']
video_input_shape

[1, 80, 80, 3]

## 하이퍼파라미터 설정

In [8]:
learning_rate = 1e-3
epochs = 20
batch_size = 256
class_weights = (1, 8)

In [13]:
from tensorflow.keras.layers import Dense, Dropout, TimeDistributed, Conv3D, Conv2D, Input, MaxPool2D, MaxPool3D, Flatten, Activation, concatenate, LSTM, Reshape
from tensorflow.keras.backend import expand_dims
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

def build_model(input_shape_dict):
    video_input_shape = input_shape_dict['video']
    audio_input_shape = input_shape_dict['audio'] 
    weight_decay = 0.005
    
    video_input = Input(video_input_shape)
    x = video_input
    x = TimeDistributed(Conv3D(8, (3, 3, 3), strides=(1, 1, 1), padding='same', activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay)))(x)
    x = TimeDistributed(MaxPool3D((2, 2, 2), strides=(2, 2, 2), padding='same'))(x)
    video_output = TimeDistributed(Flatten())(x)
    
    # Audio 2D Conv layers
    audio_input = Input(audio_input_shape)
    x = expand_dims(audio_input)    # add channel dim
    x = TimeDistributed(Conv2D(4, (3, 3), strides=(1, 1), padding='same', activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay)))(x)
    x = TimeDistributed(MaxPool2D((2, 2), strides=(2, 2), padding='same'))(x)
    audio_output = TimeDistributed(Flatten())(x)
    
    # LSTM layers
    x = concatenate([video_output, audio_output])
    x = Bidirectional(LSTM(16, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay)))(x)

    # Fully-connected layers
    x = Dense(16, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay))(x)
    #     x = Dropout(0.2)(x)
    fc_output = Dense(1, activation='sigmoid', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay))(x)

    model = Model(inputs=[video_input, audio_input], outputs=fc_output)

In [14]:
model = build_model(input_shape_dict)
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


ValueError: input tensor must have rank 5

## 모델 생성

**Video 2D data**

- video_input_shape이 (1, 100, 100, 3)이므로 Xception에 넣기 위해 (100, 100, 3)으로 Reshape 레이어 사용해서 reshape

- reshape된 것을 Xception에 집어넣음

- 그 결과를 TimeDistributed에 넣을 수 있도록 (1, 3, 3, 2048)로 reshape함.

**Audio 2D data**

- mfcc가 2차원 데이터이므로 일단 채널 차원을 Reshape 레이어 이용해서 만듦

- Xception에 넣기 위해서 Conv2D 1x1 covolution을 해줌, 이걸 해주면 크기는 안바뀌고 채널 수만 바뀜 이때 RGB 3채널로 바꾸기 위해 필터 수 = 3

- 그 다음 Xception에 넣어줌. 그 결과는 input이 (100, 100, 3)일때 (3, 3, 2048).

- 그걸 TimeDistributed에 넣을 수 있도록 (1, 3, 3, 2048)로 reshape함.

In [11]:
from tensorflow.keras.layers import Dense, Dropout, TimeDistributed, Conv2D, Input, MaxPool2D, Flatten, Activation, concatenate, LSTM, Reshape
from tensorflow.keras.backend import expand_dims
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

def build_model(input_shape_dict):
    video_input_shape = input_shape_dict['video']
    audio_input_shape = input_shape_dict['audio'] 
    weight_decay = 0.005
    
   # Video 2D Conv layers
    base_model_video = Xception(weights='imagenet', include_top=False, input_shape=(80, 80, 3))
    base_model_video.trainable = False

    video_input = Input(shape=video_input_shape)
    x = Reshape((80, 80, 3))(video_input)
    x = base_model_video(x, training=False)
    x = Reshape((1, 3, 3, 2048))(x)
    x = TimeDistributed(Conv2D(8, (3, 3), strides=(1, 1), padding='same', activation='relu', 
                        kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay)))(x)
    x = TimeDistributed(MaxPool2D((2, 2), strides=(2, 2), padding='same'))(x)
    video_output = Flatten()(x)

    # Audio 2D Conv layers    
    base_model_audio = VGG16(weights='imagenet', include_top=False, input_shape=(44, 44, 3)) 
    base_model_audio.trainable = False

    audio_input = Input(shape=audio_input_shape)
    y = Reshape((44, 44, 1))(audio_input)
    y = Conv2D(3, (1,1), strides=(1, 1), padding='same', activation='relu')(y) 
    y = base_model_audio(y, training=False)
    y = Reshape((1, 1, 1, 512))(y)
    y = TimeDistributed(Conv2D(8, (3, 3), strides=(1, 1), padding='same', activation='relu', 
                        kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay)))(y)
    y = TimeDistributed(MaxPool2D((2, 2), strides=(2, 2), padding='same'))(y)
    audio_output = Flatten()(y)

    # Fully-connected layers & LSTM
    fc_input = concatenate([video_output, audio_output])
    fc_input = expand_dims(fc_input)
    z = LSTM(128)(fc_input)
    z = Dense(16, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay))(z)
    #z = Dropout(0.2)(z)
    fc_output = Dense(1, activation='sigmoid', kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay))(z)

    model = Model(inputs=[video_input, audio_input], outputs=fc_output)
    
    return model

In [12]:
model = build_model(input_shape_dict)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 44, 44)]     0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1, 80, 80, 3 0                                            
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 44, 44, 1)    0           input_6[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 80, 80, 3)    0           input_4[0][0]                    
______________________________________________________________________________________________

## 모델 학습

In [14]:
from trainer import Trainer
from tensorflow.keras.optimizers import Adam

# 학습 시작
trainer = Trainer(model, data_loader, ckpt_dir)
trainer.train(Adam(learning_rate), epochs, batch_size, class_weights)

Training started at 20200822-114705
optimizer: {'name': 'Adam', 'learning_rate': 0.001, 'decay': 0.0, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}
epochs: 20
batch size: 256
class weights: (1, 8)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
normalized class weights: [0.61443784 4.91550269]



HBox(children=(IntProgress(value=0, description='Train 1/20', max=282, style=ProgressStyle(description_width='…


Train stopped

Top5 models


KeyError: "None of ['epoch'] are in the columns"

##  가중치 복원
모델이 선언되어 있을 때 저장된 가중치를 복원

In [26]:
checkpoint_name = 'ckpt-20200820-040849-0002-0.2604'
model.load_weights(os.path.join(ckpt_dir, checkpoint_name + '.h5'))

## 모델 테스트

In [27]:
loss, accuracy, precision, recall, f1score = trainer.test(batch_size)
print(f'loss: {loss:.4f}, accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1score: {f1score:.4f}')

HBox(children=(IntProgress(value=0, description='Test', max=130, style=ProgressStyle(description_width='initia…


loss: 0.5368, accuracy: 0.7639, precision: 0.2064, recall: 0.4145, f1score: 0.2756


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

y_true, y_pred = trainer.test_prediction(batch_size)

print(f'test data count: {len(y_true)}')
print(f'true_1, pred_1: {y_true.sum(), y_pred.sum()}')
print()
print('Confusion Matrix:')
print(confusion_matrix(y_true, y_pred))
print()
print('Report:')
print(classification_report(y_true, y_pred))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'


## 모델의 모든 정보를 온전하게 저장 / 복원
모델의 가중치 뿐만아니라 모든 레이어 구성 정보를 저장하여 추후 모델 선언부가 없어도 불러와서 사용 가능

### 모델 저장

In [None]:
checkpoint_name = 'ckpt-20200819-224303-0019-0.4714'
model_name = checkpoint_name + '_model'
model_path = os.path.join(ckpt_dir, model_name + '.h5')
print(model_path)

In [16]:
model.save(model_path)

# 모델 복원

In [17]:
checkpoint_name = 'ckpt-20200819-224303-0019-0.4714'
model_name = checkpoint_name + '_model'
model_path = os.path.join(ckpt_dir, model_name + '.h5')
print(model_path)

checkpoints/ckpt-20200819-224303-0019-0.4714_model.h5


In [18]:
from tensorflow.keras.models import load_model

model_restored = load_model(model_path)

In [19]:
model_restored.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 40, 130)]    0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 6, 64, 64, 3 0                                            
__________________________________________________________________________________________________
tf_op_layer_ExpandDims (TensorF (None, 40, 130, 1)   0           input_2[0][0]                    
__________________________________________________________________________________________________
conv3d (Conv3D)                 (None, 6, 64, 64, 8) 656         input_1[0][0]                    
______________________________________________________________________________________________

In [None]:
from trainer import Trainer
trainer = Trainer(model_restored, data_loader, ckpt_dir)

loss, accuracy, precision, recall, f1score = trainer.test(batch_size)
print(f'loss: {loss:.4f}, accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1score: {f1score:.4f}')