In [None]:
## 이미지 캡셔닝 1차 모델 구축
  ## 베이스 아키텍처: Show, Attend and Tell
  ## 추가 확보 데이터: vocab(from pre-trained model)
  ## 프로세스
    ## 데이터 로드, 체크
    ## 텍스트 데이터 추가 전처리(정규화, 토큰화, 패딩)
    ## 이미지 데이터 전처리
    ## 모델 정의: CNN, Attention, LSTM
    ## 기타 사용자 정의 함수 생성(손실함수 등)
    ## 모델 컴파일 및 훈련
    ## 모델 검증(테스트 데이터 확인, 학습결과 시각화, 메트릭 출력)

In [None]:
!pip install nltk rouge

In [1]:
!pip install pillow



In [None]:
## Data load
# This cell imports google.colab, so it only runs inside Colab: it mounts Google Drive,
# copies the sample archive to /content and extracts it there.

from google.colab import drive
import os

drive.mount('/content/drive')
zip_file_path = '/content/drive/My Drive/data-team3-imagecaption/train_sample_2.zip'
# Relative path: the directory is created under the current working directory.
%mkdir 'train_sample_2'

# IPython substitutes {zip_file_path} with the Python variable before running the shell command.
!cp "{zip_file_path}" "/content/"

!unzip -q "/content/train_sample_2.zip" -d "/content/train_sample_2/"

# List the extracted entries as a quick sanity check.
print('unizipped files: ', os.listdir('/content/train_sample_2'))

In [2]:
## text data additional preprocessing for model train
import tensorflow as tf

tf.get_logger().setLevel('ERROR')
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import re
from PIL import Image

In [3]:
# Load the caption metadata; later cells read the `file_name` and `sentence_en` columns.
df = pd.read_csv('./train_sample_2000.csv', encoding='utf-8')
df.head(2)

Unnamed: 0,id,height,width,file_name,category,sentence_en
0,446250,1920,1440,IMG_0446250_person(person).jpg,person,a man is holding an umbrella
1,446251,1920,1080,IMG_0446251_person(person).jpg,person,a man is turning on a gas stove


In [4]:
## text data preprocessing

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

def preprocess_and_tokenize(df, text_column='sentence_en'):
  """Clean captions, tokenize them and build teacher-forcing input/target arrays.

  Side effect: adds a 'cleaned_sentence' column to `df`.

  Args:
    df: DataFrame holding one caption string per row in `text_column`.
    text_column: name of the caption column.

  Returns:
    A tuple (x_data_text, y_data_one_hot, y_data, vocab_size, tokenizer,
    max_sequence_length) where
      * x_data_text is the padded id sequence without its last position,
      * y_data is the same sequence shifted left by one position,
      * y_data_one_hot is y_data one-hot encoded over vocab_size classes,
      * tokenizer is the fitted Keras Tokenizer with the special tokens
        registered in both word_index and index_word,
      * max_sequence_length is the length of x_data_text / y_data rows.
  """
  def clean_text(text):
    # Lower-case, then drop everything except ASCII letters, digits and whitespace.
    return re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())

  start_token = '<start>'
  end_token = '<end>'
  pad_token = '<pad>'
  unk_token = '<unk>'

  df['cleaned_sentence'] = df[text_column].apply(clean_text)

  tokenizer = Tokenizer(oov_token=unk_token)
  tokenizer.fit_on_texts(df['cleaned_sentence'])

  # fit_on_texts() has already reserved index 1 for the OOV token, and
  # texts_to_sequences() looks that id up in word_index. Re-assigning it to
  # len(word_index) + 1 (as the previous version did) produced an id equal to
  # vocab_size, i.e. outside the embedding/softmax range, and left index_word
  # disagreeing with word_index. The OOV entry is therefore left untouched.
  first_free_index = len(tokenizer.word_index) + 1
  special_tokens = {
      pad_token: 0,                       # id 0 is also what mask_zero / padding use
      start_token: first_free_index,
      end_token: first_free_index + 1,
  }
  # Keep both lookup directions in sync so ids can be decoded back to tokens.
  for token, index in special_tokens.items():
    tokenizer.word_index[token] = index
    tokenizer.index_word[index] = token

  start_id = tokenizer.word_index[start_token]
  end_id = tokenizer.word_index[end_token]

  sequences = tokenizer.texts_to_sequences(df['cleaned_sentence'])
  sequences = [[start_id] + seq + [end_id] for seq in sequences]

  max_length = max(len(sequence) for sequence in sequences)
  sequences_padded = pad_sequences(sequences, maxlen=max_length, padding='post', value=tokenizer.word_index[pad_token])

  # word_index now also contains the pad entry (id 0), so this is one larger
  # than the highest id + 1. The formula is kept so the returned vocab_size
  # (and therefore model shapes) match what earlier runs produced.
  vocab_size = len(tokenizer.word_index) + 1

  # Teacher forcing: the model reads tokens [0 .. T-2] and predicts tokens [1 .. T-1].
  x_data_text = sequences_padded[:, :-1]
  y_data = sequences_padded[:, 1:]

  y_data_one_hot = to_categorical(y_data, num_classes=vocab_size)
  max_sequence_length = max_length - 1

  return x_data_text, y_data_one_hot, y_data, vocab_size, tokenizer, max_sequence_length


In [5]:
## image data preprocessing for model train

from tensorflow.keras.preprocessing.image import load_img, img_to_array
from PIL import Image
import numpy as np
import os

# Directory that process_image() joins with each file name to locate an image on disk.
image_folder = './image'

In [6]:
def process_image(file_name, target_size=(299,299)):
  """Load one image from `image_folder` and scale it for the InceptionResNetV2 encoder.

  Args:
    file_name: image file name, joined onto the module-level `image_folder`.
    target_size: (height, width) the image is resized to while loading.

  Returns:
    float32 array of shape target_size + (channels,) with values in [-1, 1].
  """
  image_path = os.path.join(image_folder, file_name)
  image = load_img(image_path, target_size=target_size)
  image = img_to_array(image).astype('float32')
  # The ImageNet weights of InceptionResNetV2 were trained on pixels scaled to
  # [-1, 1] (what inception_resnet_v2.preprocess_input does). The previous
  # [0, 1] scaling fed the frozen encoder inputs outside its training range.
  image = image / 127.5 - 1.0
  return image

In [7]:
# Decode every image listed in the metadata into a single array; the same array
# is exposed under both names used by later cells.
x_data_image = image_data = np.array([process_image(name) for name in df['file_name']])

# Caption tensors, vocabulary size, fitted tokenizer and padded sequence length.
(x_data_text, y_data_one_hot, y_data,
 vocab_size, tokenizer, max_sequence_length) = preprocess_and_tokenize(df)

In [8]:
# Sanity check: one entry per CSV row, each sized by process_image's target_size.
print(image_data.shape)

(2002, 299, 299, 3)


In [9]:
## training arrays: the full dataset is used; the 200-sample slice from the first trial is kept commented out

# x_train_image = x_data_image[:200]
# x_train_text = x_data_text[:200]
# y_train = y_data_one_hot[:200]
# y_train_int = y_data[:200]

# y_train holds the one-hot targets, y_train_int the same targets as integer ids.
x_train_image = x_data_image
x_train_text = x_data_text
y_train = y_data_one_hot
y_train_int = y_data

In [10]:
## train/test split (80/20, fixed seed) applied jointly to images, input sequences,
## one-hot labels and integer labels so that rows stay aligned

(train_image_1, test_image_1,
 train_sequences_1, test_sequences_1,
 train_y_1, test_y_1,
 train_y_int, test_y_int) = train_test_split(
    x_train_image, x_train_text, y_train, y_train_int,
    test_size=0.2, random_state=42)

In [11]:
# Print the shape of every split, in the order: image/sequence inputs for train,
# then for test, then one-hot labels, then integer labels.
for split in (train_image_1, train_sequences_1,
              test_image_1, test_sequences_1,
              train_y_1, test_y_1,
              train_y_int, test_y_int):
  print(split.shape)

(1601, 299, 299, 3)
(1601, 39)
(401, 299, 299, 3)
(401, 39)
(1601, 39, 1061)
(401, 39, 1061)
(1601, 39)
(401, 39)


In [12]:
# Vocabulary size returned by preprocess_and_tokenize; it is passed to create_model,
# which uses it for the embedding input dimension and the softmax width.
print(vocab_size)

1061


In [12]:
## BLEU, ROUGE score metrics function

import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
nltk.download('punkt')

def calculate_bleu(references, candidates):
  """Average sentence-level BLEU-4 of `candidates` against `references`.

  Args:
    references: sequence of reference sentences (strings).
    candidates: sequence of generated sentences, aligned with `references`.

  Returns:
    Mean of the per-pair sentence BLEU scores (uniform 1- to 4-gram weights),
    or 0.0 when there are no pairs to score.

  Raises:
    ValueError: if the two sequences differ in length. zip() would otherwise
      drop the unmatched tail while the average was still taken over
      len(candidates), silently skewing the score.
  """
  if len(references) != len(candidates):
    raise ValueError(
        "references and candidates must have the same length "
        f"(got {len(references)} and {len(candidates)})")
  if len(candidates) == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0

  score = 0.0
  for ref, cand in zip(references, candidates):
    ref_tokens = nltk.word_tokenize(ref.lower())
    cand_tokens = nltk.word_tokenize(cand.lower())
    score += sentence_bleu([ref_tokens], cand_tokens, weights=(0.25, 0.25, 0.25, 0.25))

  return score / len(candidates)

def calculate_rouge(references, candidates):
  """Return the averaged ROUGE scores of `candidates` against `references`.

  The `rouge` scorer is called with hypotheses first and references second,
  with avg=True so a single aggregated result is returned.
  """
  return Rouge().get_scores(candidates, references, avg=True)

[nltk_data] Downloading package punkt to /home/kkm_lnx22/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
## define model(use cross-attention only)
  ## cross-attention은 그냥 attention이랑 같은거라고 보면 됨

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout
from tensorflow.keras.layers import AdditiveAttention, MultiHeadAttention, Lambda, Masking, RepeatVector, Reshape
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.optimizers import Adam

In [None]:
## 학습 준비: basic model compile

In [14]:
## model2: (cross)attention + self_attention

def create_model(vocab_size, lstm_units, max_sequence_length):
  """Build the image-captioning network: frozen CNN encoder + LSTM decoder with two attention blocks.

  Args:
    vocab_size: input dimension of the embedding and width of the final softmax.
    lstm_units: units of each LSTM, of the image projection, and key_dim of the
      multi-head attention.
    max_sequence_length: length of the token-id input; also the number of times
      the projected image feature is repeated along the time axis.

  Returns:
    An uncompiled Keras Model taking [image, token ids] and producing a softmax
    over vocab_size for every timestep.
  """

  # ImageNet-pretrained encoder without its classifier head; pooling='avg'
  # yields one feature vector per image. The encoder is frozen.
  inception_resnet_model = InceptionResNetV2(include_top=False,\
                                            weights='imagenet',\
                                            pooling='avg')
  inception_resnet_model.trainable = False

  image_input = Input(shape=(299,299,3))
  image_features = inception_resnet_model(image_input)

  sequence_input = Input(shape=(max_sequence_length,))

  # mask_zero=True makes id 0 a padding id that downstream layers can mask.
  embedding = Embedding(input_dim=vocab_size,\
                        output_dim=256, mask_zero=True)(sequence_input)

  # NOTE(review): a bidirectional layer also reads tokens that come after the
  # current position. If the targets are the inputs shifted by one step, the
  # backward pass may expose the token being predicted — confirm this is intended.
  bidirectional_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True))

  lstm_output = bidirectional_lstm(embedding)

  dropout = Dropout(0.5)
  lstm_output = dropout(lstm_output)

  # return_state=True also returns the final hidden and cell states; they are discarded.
  second_lstm = LSTM(lstm_units,\
                    return_sequences=True,\
                    return_state=True)
  lstm_output, _, _ = second_lstm(lstm_output)

  # Project the image vector to lstm_units and repeat it once per timestep so it
  # has the same (time, features) layout as the LSTM output.
  image_features_dense = Dense(lstm_units)(image_features)
  image_features_dense = Reshape((1, lstm_units))(image_features_dense)
  image_features_dense = Lambda(lambda x: tf.tile(x, [1, max_sequence_length, 1]))(image_features_dense)

  print("LSTM output shape: ", lstm_output.shape)
  print("image feature shape: ", image_features_dense.shape)

  # Additive attention with the LSTM output as query and the tiled image feature
  # as value. attention_weights is returned but not used afterwards.
  # NOTE(review): every value timestep is a copy of the same vector, so the
  # weighted sum presumably cannot vary with the attention scores — verify that
  # this block adds anything beyond injecting the image vector.
  attention = AdditiveAttention()
  context_vector, attention_weights = attention([lstm_output, image_features_dense], return_attention_scores=True)

  # Self-attention over the LSTM output (query = key = value).
  # NOTE(review): no causal mask is passed here — confirm future positions
  # should be visible during training.
  self_attention = MultiHeadAttention(num_heads=8, key_dim=lstm_units)
  self_attention_output = self_attention(query=lstm_output, value=lstm_output, key=lstm_output)

  # Concatenate both attention outputs along the feature axis.
  combined_attention_output = tf.keras.layers.Concatenate(axis=-1)([context_vector, self_attention_output])

  # Per-timestep distribution over the vocabulary.
  dense = Dense(vocab_size, activation='softmax')
  output = dense(combined_attention_output)

  return Model(inputs=[image_input, sequence_input], outputs=output)


# Build the captioning model with 256 LSTM units and snapshot its freshly
# initialised weights so that later cells can restore them with set_weights().
model_2 = create_model(vocab_size, 256, max_sequence_length)
initial_weights = model_2.get_weights()
model_2.summary()

2024-01-16 11:06:56.844579: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 11:06:56.864937: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 11:06:56.864985: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 11:06:56.867046: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 11:06:56.867121: I external/local_xla/xla/stream_executor

LSTM output shape:  (None, 39, 256)
image feature shape:  (None, 39, 256)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 39)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 299, 299, 3)]        0         []                            
                                                                                                  
 embedding (Embedding)       (None, 39, 256)              271616    ['input_3[0][0]']             
                                                                                                  
 inception_resnet_v2 (Funct  (None, 1536)                 5433673   ['input_2[0][0]']             
 ional)             

In [16]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# NOTE(review): the recorded output of this cell is "Physical devices cannot be
# modified after being initialized"; presumably it was executed after the model
# had already been built. Consider moving it ahead of any TensorFlow work — confirm.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Allocate GPU memory dynamically as it is needed
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Raised when the setting is changed after the devices were initialised.
        print(e)

Physical devices cannot be modified after being initialized


In [15]:
# Optimizer with an explicit learning rate. It is passed to compile() as an
# object: the string 'adam' would build a separate default optimizer and leave
# this configuration unused.
adam = Adam(learning_rate=0.001)

# sparse_categorical_crossentropy expects integer class ids as targets.
model_2.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Start training from the weights captured right after the model was created.
model_2.set_weights(initial_weights)

In [16]:
# Re-check that the sequence inputs have the length the model was built with.
print(max_sequence_length)
print(train_sequences_1.shape)
print(test_sequences_1.shape)

39
(1601, 39)
(401, 39)


In [17]:
## train model_2

# Targets are the integer-encoded labels (train_y_int / test_y_int), not the
# one-hot arrays; the held-out split is used as validation data.
history_1 = model_2.fit([train_image_1, train_sequences_1], train_y_int,\
                      validation_data=([test_image_1, test_sequences_1], test_y_int),\
                      epochs=20, batch_size=32, verbose=1)

Epoch 1/20


W0000 00:00:1705370849.763971    1392 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 4060 Ti" frequency: 2610 num_cores: 34 environment { key: "architecture" value: "8.9" } environment { key: "cuda" value: "12020" } environment { key: "cudnn" value: "8904" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 33554432 shared_memory_size_per_multiprocessor: 102400 memory_size: 14171504640 bandwidth: 288032000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2024-01-16 11:07:32.139966: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-01-16 11:07:32.372168: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected 



W0000 00:00:1705370871.028131    1392 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 4060 Ti" frequency: 2610 num_cores: 34 environment { key: "architecture" value: "8.9" } environment { key: "cuda" value: "12020" } environment { key: "cudnn" value: "8904" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 33554432 shared_memory_size_per_multiprocessor: 102400 memory_size: 14171504640 bandwidth: 288032000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
## BLEU, ROUGE score check

# The trained network is `model_2`; no variable named `model` is defined in the
# cells above, so the previous call failed on a fresh kernel.
# Note: the ground-truth test sequences are fed as decoder input here.
predicted_sequences = model_2.predict([test_image_1, test_sequences_1])

In [None]:
# Reverse lookup (token id -> word) built from the tokenizer's word_index.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

def sequences_to_text(sequences, index_to_word_map=None):
  """Decode per-timestep score vectors into sentences.

  Args:
    sequences: iterable of 2-D arrays shaped (timesteps, vocabulary), e.g. the
      output of model.predict().
    index_to_word_map: optional {token id: word} dict. When omitted, the
      module-level `index_to_word` is used, as before.

  Returns:
    One space-joined string per input sequence. Ids without a vocabulary entry
    are skipped, matching one_hot_to_text.
  """
  lookup = index_to_word if index_to_word_map is None else index_to_word_map

  text_output = []
  for sequence in sequences:
    # Highest-scoring token id at every timestep of THIS sequence. The previous
    # version looped over `sequences` (the whole batch) here, so argmax ran on a
    # flattened (timesteps x vocabulary) matrix and returned meaningless ids.
    token_ids = np.argmax(sequence, axis=-1)
    words = (lookup.get(int(token_id), '') for token_id in token_ids)
    text_output.append(' '.join(word for word in words if word))
  return text_output

# Turn the predicted per-timestep score vectors into caption strings.
predicted_texts = sequences_to_text(predicted_sequences)

In [None]:
## 원-핫 인코딩된 텍스트를 다시 문장으로 변환 function

def one_hot_to_text(one_hot_sequences, index_to_word):
  """Convert one-hot encoded sequences back into space-joined sentences.

  For each timestep vector the position of the largest value is looked up in
  `index_to_word`; ids with no entry (or an empty word) are left out.
  """
  def decode(sequence):
    looked_up = (index_to_word.get(np.argmax(word_vec), '') for word_vec in sequence)
    return ' '.join(word for word in looked_up if word)

  return [decode(sequence) for sequence in one_hot_sequences]

In [None]:
# Decode the one-hot reference targets back to text, then score the predicted
# captions against them.
test_y_texts = one_hot_to_text(test_y_1, index_to_word)

bleu_score = calculate_bleu(test_y_texts, predicted_texts)
rouge_score = calculate_rouge(test_y_texts, predicted_texts)

print("BLEU Score: ", bleu_score)
print("ROUGE Score: ", rouge_score)

In [None]:
## retrain from the initial weights using integer-encoded labels instead of one-hot labels

# Restore the weights captured right after model creation. Calling
# model_2.get_weights() again at this point (as the previous version did) would
# snapshot the already-trained weights and turn the reset into a no-op.
model_2.set_weights(initial_weights)
# Re-compiling also gives the run a fresh optimizer state.
model_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history_3 = model_2.fit([train_image_1, train_sequences_1], train_y_int,\
                      validation_data=([test_image_1, test_sequences_1], test_y_int),\
                      epochs=30, batch_size=32, verbose=1)