In [None]:
## 비디오 인코딩 모델 생성
  ## 베이스 모델: InceptionV3
  ## 데이터
    # x_data: video files
    # y_data: sentence_kwd

  ## 모델 목표
    # InceptionV3를 사용해 각 장면에서 추출한 특징을 sentence_kwd와 매칭해 해당 장면은 kwd 에 관련된다는 패턴 생성

  ## 목표 추론 결과
    # valid data 입력시, 해당 장면에 대한 올바른 키워드를 예측하는지 확인

In [1]:
# basic import

import pandas as pd
import numpy as np
# from google.colab import drive
import os


In [2]:
# data load and unzip

drive.mount('/content/drive')
zip_file_path = '/content/drive/My Drive/pjt_3_data/drive_upload_sample.zip'
%mkdir 'pjt_3_sample'

!cp "{zip_file_path}" "/content/"

!unzip -q "/content/drive_upload_sample.zip" -d "/content/pjt_3_sample/"

print('unizipped files: ', os.listdir('/content/pjt_3_sample'))

Mounted at /content/drive
unizipped files:  ['df_main.csv', 'D3_DA_0527_000042.mp4', 'D3_DA_0603_000001.mp4', 'D3_DA_0601_000001.csv', 'D3_DA_0603_000001.csv', 'D3_DA_0609_000001.csv', 'D3_DA_0601_000001.mp4', 'D3_DA_0610_000001.mp4', 'D3_DA_0609_000001.mp4', 'D3_DA_0610_000001.csv', 'D3_DA_0527_000042.csv']


In [2]:
import cv2
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.applications.inception_v3 import preprocess_input

In [4]:
# preprocessing: video, txt data

main_df = pd.read_csv('./try_first/df_main.csv', encoding='utf-8')

tokenizer = Tokenizer()

video_data = []
context_data = []

def convert_time_to_seconds(time_str):
  if isinstance(time_str, float):
    return time_str

  minutes, seconds = time_str.split(':')
  return int(minutes) * 60 + float(seconds)

def load_video_frames(video_path, time_start, time_end, max_frames=60, resize=(299,299)):
  cap = cv2.VideoCapture(video_path)
  frames = []
  frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
  frame_rate = cap.get(cv2.CAP_PROP_FPS)

  start_frame = int(convert_time_to_seconds(time_start) * frame_rate)
  end_frame = int(min(convert_time_to_seconds(time_end) * frame_rate, frame_count))

  current_frame = 0
  while current_frame < start_frame:
    ret, frame = cap.read()
    if not ret:
      break
    current_frame += 1

  while current_frame < end_frame and len(frames) < max_frames:
    ret, frame = cap.read()
    if not ret:
      break
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.resize(frame, resize)
    frame = preprocess_input(frame)
    frames.append(frame)
    current_frame += 1

  while len(frames) < max_frames:
    black_frame = np.zeros((resize[0], resize[1], 3), np.uint8)
    frames.append(black_frame)

  cap.release()
  return np.array(frames)

def load_and_process_video_and_text(main_df, folder_path):
  for label_file_id in main_df['id']:
    label_file_path = os.path.join(folder_path, label_file_id + '.csv')
    labels_df = pd.read_csv(label_file_path, encoding='utf-8')

    for _, row in labels_df.iterrows():
      video_path = os.path.join(folder_path, row['video_name'])
      time_start, time_end = row['time_start'], row['time_end']
      keywords =  row['sentence_kwd']
      keywords_str = ' '.join(keywords)

      video_frames = load_video_frames(video_path, time_start, time_end)
      video_data.append(video_frames)

      context_data.append(keywords_str)

In [12]:
np.__version__

'1.26.2'

In [5]:
# train, test split

folder_path = './try_first'
load_and_process_video_and_text(main_df, folder_path)

tokenizer.fit_on_texts(context_data)
context_sequences = tokenizer.texts_to_sequences(context_data)

x = video_data
y = context_sequences

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (24,) + inhomogeneous part.

In [10]:
y_train

[[1, 9, 5, 7, 2, 4, 1, 1, 15, 10, 16, 8, 14, 3, 8, 1, 1, 6, 3, 3, 6, 14, 1],
 [1, 9, 5, 7, 2, 4, 1, 1, 6, 2, 11, 13, 12, 4, 17, 1],
 [1,
  9,
  5,
  7,
  2,
  4,
  1,
  1,
  9,
  2,
  11,
  13,
  8,
  1,
  1,
  21,
  10,
  5,
  4,
  6,
  1,
  1,
  10,
  3,
  21,
  10,
  12,
  17,
  3,
  10,
  2,
  6,
  5,
  10,
  1],
 [1,
  9,
  5,
  7,
  2,
  4,
  1,
  1,
  17,
  5,
  3,
  8,
  1,
  1,
  8,
  3,
  2,
  6,
  1,
  1,
  6,
  16,
  7,
  15,
  11,
  3,
  10,
  1,
  1,
  9,
  2,
  6,
  3,
  10,
  1],
 [1, 9, 5, 7, 2, 4, 1, 1, 11, 5, 5, 13, 8, 1, 1, 18, 14, 5, 4, 3, 1],
 [1, 9, 5, 7, 2, 4, 1, 1, 8, 12, 6, 8, 1, 1, 18, 11, 2, 19, 3, 1],
 [1,
  9,
  5,
  7,
  2,
  4,
  1,
  1,
  6,
  2,
  11,
  13,
  8,
  1,
  1,
  18,
  14,
  5,
  4,
  3,
  1,
  1,
  6,
  2,
  11,
  13,
  8,
  1],
 [1, 9, 5, 7, 2, 4, 1, 1, 20, 10, 12, 4, 13, 8, 1, 1, 9, 2, 6, 3, 10, 1],
 [1, 9, 5, 7, 2, 4, 1, 1, 15, 10, 16, 8, 14, 3, 8, 1, 1, 6, 3, 3, 6, 14, 1],
 [1,
  9,
  5,
  7,
  2,
  4,
  1,
  1,
  10,
  3,
  7,
  5,
  2

In [10]:
## train, test set check
  # x_train, x_test는 비디오 데이터이므로 (샘플수, 프레임수, 높이, 너비, 차원수)로 나와야 함
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(24, 60, 299, 299, 3)


AttributeError: 'list' object has no attribute 'shape'

In [25]:
x_train[0].shape

(120, 299, 299, 3)

In [26]:
## x_train or x_test 데이터의 크기가 올바르지 않을 때 각 비디오 데이터의 크기 확인 코드
 ## 데이터 크기가 올바르지 않을 경우: 추출된 프레임 수가 불일치해서 np.array로 변환되지 못했기 때문
  ## 추출되는 프레임 수가 다른 이유: 타임스탬프의 길이 차이
    ## 예시: 초당프레임: 30, 최대 추출 프레임수: 120, 스탬프1: 5초, 스탬프2: 3초
      # 스탬프 1은 총 150프레임을 구성되므로(5*30) 150중 120프레임 추출됨
      # 스탬프 2는 총 90프레임으로 구성되므로(3*30) 90중 90프레임이 추출됨

    ## 결론: 최대 프레임을 기준으로 패딩하거나, 타임스탬프 길이를 동일하게 정규화해야 함

for i, video in enumerate(x_train):
  print(f"video {i}: shape = {video.shape}")

video 0: shape = (120, 299, 299, 3)
video 1: shape = (120, 299, 299, 3)
video 2: shape = (120, 299, 299, 3)
video 3: shape = (120, 299, 299, 3)
video 4: shape = (120, 299, 299, 3)
video 5: shape = (120, 299, 299, 3)
video 6: shape = (120, 299, 299, 3)
video 7: shape = (120, 299, 299, 3)
video 8: shape = (120, 299, 299, 3)
video 9: shape = (120, 299, 299, 3)
video 10: shape = (120, 299, 299, 3)
video 11: shape = (90, 299, 299, 3)
video 12: shape = (120, 299, 299, 3)
video 13: shape = (120, 299, 299, 3)
video 14: shape = (120, 299, 299, 3)
video 15: shape = (120, 299, 299, 3)
video 16: shape = (120, 299, 299, 3)
video 17: shape = (120, 299, 299, 3)
video 18: shape = (120, 299, 299, 3)
video 19: shape = (120, 299, 299, 3)
video 20: shape = (120, 299, 299, 3)
video 21: shape = (120, 299, 299, 3)
video 22: shape = (120, 299, 299, 3)
video 23: shape = (120, 299, 299, 3)


In [13]:
# model define

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input, Dense, Embedding
from tensorflow.keras.layers import Dropout, TimeDistributed
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.applications.inception_v3 import InceptionV3

base_model = InceptionV3(weights='imagenet', include_top=False,\
                         pooling='avg')

input_shape = (None, 299, 299, 3)
video_input = Input(shape=input_shape)
encoded_frames = TimeDistributed(base_model)(video_input)

video_lstm = LSTM(256)(encoded_frames)
video_dense = Dense(126, activation='relu')(video_lstm)

num_keywords = 10
embedding_dim = 100
max_num_keywords = 1000
keyword_input = Input(shape=(None,), dtype='int32')
keyword_embedding = Embedding(max_num_keywords, embedding_dim)(keyword_input)
keyword_lstm = LSTM(126)(keyword_embedding)

keyword_dense = Dense(64, activation='relu')(keyword_lstm)

combined = Concatenate()([video_dense, keyword_dense])
predictions = Dense(units=num_keywords, activation='softmax')(combined)

model = Model(inputs=[video_input, keyword_input], outputs=predictions)

In [14]:
# model compile

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, None, 299, 299, 3)   0         []                            
                             ]                                                                    
                                                                                                  
 input_6 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 time_distributed_1 (TimeDi  (None, None, 2048)           2180278   ['input_5[0][0]']             
 stributed)                                               4                                       
                                                                                              

In [None]:
## 12월 21일 저녁 과제
  ## 전처리 함수 재검토(일단 프레임 패딩 추가함)
  ## 전처리 데이터 재생성
  ## 모델 학습
  ## 모델 결과 확인 및 데이터 증강(5개 -> 최대 100개까지)
  ## 위의 작업을 반드시 집 데스크탑으로 실행해볼 것!!(눕지마!!)

In [None]:
# model train

In [None]:
# model validation