# First model outline

In [178]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import RepeatVector

from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pandas as pd
import zipfile
import nltk
import re
import json
import zipfile
import os
import io

In [179]:
zip_file_path = 'C:/Users/Admin/Desktop/instagram_data.zip'
img_folder = 'img'
csv_file_name = 'captions_csv.csv'

## Filtering and captions preprocessing

In [180]:
bad_words = ['fuck', 'bitch', 'hoe', 'motherfucker']

with zipfile.ZipFile(zip_file_path, 'r') as z:
    with z.open(csv_file_name) as f:
        df = pd.read_csv(f)

def tokenize_caption(text):
    text = text.lower()
    text = re.sub(r' +|\t', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = ['<start>'] + tokens + ['<end>']
    return tokens

df = df.dropna(subset=['Caption']) 
df = df[['Image File', 'Caption']]
df = df.drop_duplicates(subset=['Caption'])
df = df[df['Caption'].apply(len) < 60]
df = df[~df['Caption'].str.contains(r'http\S+|www\S+', regex=True)]
df = df[~df['Caption'].str.contains('|'.join(bad_words), case=False)]
df['Tokenized Caption'] = df['Caption'].apply(tokenize_caption)

## Tokenization

In [181]:
'''
data = pd.read_csv('caption_data.csv')

special_tokens = ['<start>', '<end>']

tokenized_captions = data['Tokenized Caption'].tolist()

tokenizer = Tokenizer(filters='.,', oov_token='<unk>')

tokenizer.fit_on_texts(special_tokens)
tokenizer.fit_on_texts(tokenized_captions)
encoded_tokens = tokenizer.texts_to_sequences(tokenized_captions)

data['Encoded Tokens'] = encoded_tokens

data.to_csv('caption_data.csv', sep=',', index=False, encoding='utf-8')

'''
special_tokens = ['<start>', '<end>']

tokenized_captions = df['Tokenized Caption'].tolist()

tokenizer = Tokenizer(filters='.,', oov_token='<unk>')

tokenizer.fit_on_texts(special_tokens + tokenized_captions)

encoded_tokens = tokenizer.texts_to_sequences(tokenized_captions)

df['Encoded Tokens'] = encoded_tokens


## Image features extraction

In [182]:
model_incep = InceptionV3(include_top=False, weights='imagenet')
image_feature_extractor = Model(model_incep.input, model_incep.layers[-1].output)

def extract_image_features(image_path):
    img = load_img(image_path, target_size=(299, 299))

    img_array = image.img_to_array(img)

    img_array = np.expand_dims(img_array, axis=0)

    img_array = preprocess_input(img_array)

    features = image_feature_extractor.predict(img_array)

    return features

'''
with zipfile.ZipFile(zip_file_path, 'r') as z:
    for file in z.namelist():
        if file.startswith(img_folder):
            img_name = os.path.basename(file)
            if img_name in images_names[:20]:
                print(file)
        
'''

images_names = df['Image File'].values
images_names = [os.path.basename(img) + '.jpg' for img in images_names]

image_features = {}
with zipfile.ZipFile(zip_file_path, 'r') as z:
    for file in z.namelist():
        if file.startswith(img_folder):
            img_name = os.path.basename(file)
            if img_name in images_names[:2000]:
                print(f"Processing {file}")
                with z.open(file) as f:
                    image_data = io.BytesIO(f.read())
                    features = extract_image_features(image_data) 
                    image_features[img_name] = features.flatten()

Processing img/insta10.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Processing img/insta1000.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
Processing img/insta1001.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
Processing img/insta1002.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
Processing img/insta1003.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
Processing img/insta1005.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
Processing img/insta1006.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Processing img/insta1008.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
Processing img/insta1014.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
Processing img/insta1015.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

## Training data

In [183]:
df['Image File'] = df['Image File'].apply(lambda x: os.path.basename(x) + '.jpg')

df['Image Features'] = df['Image File'].map(image_features.get)
df = df.dropna(subset = ['Image Features'])

max_caption_len = max(len(caption) for caption in df['Encoded Tokens'])
captions = pad_sequences(df['Encoded Tokens'], maxlen = max_caption_len, padding='post')

image_features_array = np.array(df['Image Features'].tolist())

X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    image_features_array, captions[:, :-1], captions[:, 1:], test_size = 0.2, random_state = 42
)

vocab_size = len(tokenizer.word_index) + 1

y_train_cat = to_categorical(y_train, num_classes = vocab_size)
y_val_cat = to_categorical(y_val, num_classes = vocab_size)

y_train_cat = y_train_cat.reshape(y_train.shape[0], y_train.shape[1], vocab_size)
y_val_cat = y_val_cat.reshape(y_val.shape[0], y_val.shape[1], vocab_size)

## Model

In [184]:
def build_model(max_caption_len, vocab_size, image_features_array, learning_rate):
    input_image = Input(shape=(image_features_array.shape[1],))
    cnn_layer1 = Dropout(0.4)(input_image)
    cnn_layer2 = Dense(256, activation = 'relu')(cnn_layer1)

    input_text = Input(shape=(max_caption_len -1,))
    lstm_layer1 = Embedding(vocab_size, 256, mask_zero = True)(input_text)
    lstm_layer2 = Dropout(0.4)(lstm_layer1)
    lstm_layer3 = LSTM(256, activation = 'tanh', return_sequences = True)(lstm_layer2)

    merging_layer = Add()([
        RepeatVector(max_caption_len - 1)(cnn_layer2), 
        lstm_layer3
    ]) 
    final_dense = Dense(256, activation = 'relu')(merging_layer)
    output = Dense(vocab_size, activation = 'softmax')(final_dense)

    model = Model(inputs = [input_image, input_text], outputs = output)

    optimizer = Adam(learning_rate = learning_rate)
    model.compile(loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

    return model


max_caption_len = max_caption_len
vocab_size = len(tokenizer.word_index) + 1

captioning_model = build_model(max_caption_len, vocab_size, image_features_array, 0.001)

captioning_model.summary()

## Training

In [185]:
epochs = 50
batch_size = 32

history = captioning_model.fit(
    [X1_train, X2_train], 
    y_train_cat,
    validation_data = ([X1_val, X2_val], y_val_cat),
    epochs = epochs,
    batch_size = batch_size,
    verbose = 1
)

Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 894ms/step - accuracy: 0.5682 - loss: 6.2029 - val_accuracy: 0.6994 - val_loss: 2.3214
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 617ms/step - accuracy: 0.7239 - loss: 1.9408 - val_accuracy: 0.7494 - val_loss: 2.0451
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 606ms/step - accuracy: 0.7405 - loss: 1.7893 - val_accuracy: 0.7492 - val_loss: 2.0299
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 596ms/step - accuracy: 0.7539 - loss: 1.6022 - val_accuracy: 0.7494 - val_loss: 2.1126
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 595ms/step - accuracy: 0.7575 - loss: 1.4618 - val_accuracy: 0.7501 - val_loss: 2.1599
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 611ms/step - accuracy: 0.7609 - loss: 1.3084 - val_accuracy: 0.7498 - val_loss: 2.3507
Epoch 7/50
[1m50/50[

## Generating captions

In [1]:
def generate_caption(image_feature, max_len = max_caption_len):
    input_seq = np.zeros((1, max_len - 1), dtype = np.int32)
    input_seq[0, 0] = tokenizer.word_index['<start>']
    
    captions = []
    for i in range(1, max_len-1): 

        pred = captioning_model.predict([
            np.array([image_feature]), 
            input_seq
        ])
        
        pred_word_index = np.argmax(pred[0, i-1, :])
        
        pred_word = tokenizer.index_word.get(pred_word_index, '')
        
        if pred_word and pred_word != '<unk>':
            captions.append(pred_word)
        
        input_seq[0, i] = pred_word_index
        
        if pred_word == '<end>':
            break
    
    return ' '.join(captions)


sample_image_feature = image_features_array[299]

generated_caption = generate_caption(sample_image_feature)

print("\nWygenerowany podpis:", generated_caption)

original_tokens = df['Tokenized Caption'].iloc[299]
original_caption = ' '.join(original_tokens)

print("\nOryginalny podpis:", original_caption)

NameError: name 'max_caption_len' is not defined