### Data filtering and tokenization

In [None]:
import nltk
import re
import pandas as pd
import spacy
import numpy as np

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
df = pd.read_csv("C:\\Users\\sonia\\Desktop\\ig-caption-gen\\instagram_data\\captions_csv.csv")
bad_words = ['fuck', 'bitch', 'hoe', 'motherfucker']

start_token = '<start>'
end_token = '<end>'

def contains_date(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'DATE':
            return True
    return False

def remove_emojis(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def remove_unwanted_chars(text):
    text = text.replace(r',|.', '')
    text = text.replace(r' +|\t', ' ')
    text = text.replace(r'*', '')
    text = text.replace(r':)', '')
    text = text.replace(r'@', '')
    text = text.replace(r'#', '')
    text = text.replace(r'"', '')
    return text

def tokenize_caption(text):
    text = text.lower()
    text = text.replace(r' +|\t', ' ')
    text = text.replace(r'*', '')
    text = text.replace(r':)', '')
    tokens = nltk.word_tokenize(text)
    tokens = [start_token] + tokens + [end_token]
    return tokens


data = df.dropna(subset=['Caption'])
data = data[['Image File', 'Caption']]
data = data.drop_duplicates(subset=['Caption'])
data = data[data['Caption'].apply(len) < 60]
data['Caption'] = data['Caption'].apply(remove_emojis)
data['Caption'] = data['Caption'].apply(remove_unwanted_chars)
data = data[~data['Caption'].str.contains(r'http\S+|www\S+|#|@|\d{2,}|\n', regex=True)]
data = data[~data['Caption'].apply(contains_date)]
data = data[data['Caption'].str.strip() != '']
data = data[~data['Caption'].str.contains('|'.join(bad_words), case=False)]
data['Tokenized Caption'] = data['Caption'].apply(tokenize_caption)

data.to_csv('caption_data_tokenized.csv', sep=',', index=False, encoding='utf-8')

### Captions encoding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import ast

In [None]:
data = pd.read_csv("caption_data_tokenized.csv")
data['Caption'] = data['Caption'].apply(lambda x: '<start> ' + x + ' <end>')

raw_text = data['Caption'].tolist()
tokenized_captions = data['Tokenized Caption'].apply(ast.literal_eval)

tokenizer = Tokenizer(num_words=15000, filters=',', oov_token='<unk>')

tokenizer.fit_on_texts(raw_text)
encoded_tokens = tokenizer.texts_to_sequences(tokenized_captions)

max_length = max([len(tokens) for tokens in encoded_tokens])

data['Encoded Tokens'] = encoded_tokens

data.to_csv('caption_data_encoded_all.csv', sep=',', index=False, encoding='utf-8')

### Getting encoded captions

In [18]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
captions = pd.read_csv("caption_data_encoded_all.csv")
captions['Encoded Tokens'] = captions['Encoded Tokens'].apply(ast.literal_eval)

### Image features

In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img

In [7]:
model_incep = InceptionV3(include_top=False, weights='imagenet')

In [8]:
def extract_features(image_path):
    img = load_img(image_path, target_size=(299, 299))

    img_array = image.img_to_array(img)
    print(img_array.shape)

    img_array = np.expand_dims(img_array, axis=0)

    img_array = preprocess_input(img_array)

    features = model_incep.predict(img_array)

    return features

In [9]:
images_features = {}
for i in range(len(captions)):
    path = captions.loc[i]['Image File']

    image_path = "C:/Users/sonia/Desktop/ig-caption-gen/instagram_data/" + path + ".jpg"
    features = extract_features(image_path)
    images_features[path] = features

(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━

In [9]:
import csv

fields = ['Image File', 'Features']
with open('images_features.csv', 'w', encoding='utf-8', newline="") as f:
    w= csv.writer(f)
    w.writerow(fields)
    for key, val in images_features.items():
        w.writerow([key, val])

### Batch data processing

In [10]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, add, Dropout, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [11]:
import random

def generate_data(captions, image_features, max_length, vocab_size, batch_size):
    while True:
        indices = list(range(len(captions)))
        random.shuffle(indices)
        
        input1 = []
        input2 = []
        output = []
        for i in indices:
            tokens = captions.loc[i]['Encoded Tokens']
            img_id = captions.loc[i]['Image File']
            img_features = image_features[img_id]
            img_features = img_features.reshape(img_features.shape[1], img_features.shape[2], img_features.shape[3])
            for index in range(1, len(tokens)):
                in_seq, out_seq = tokens[:index], tokens[index]
                in_seq = pad_sequences([in_seq], maxlen=max_length, padding='pre')[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                input1.append(img_features)
                input2.append(in_seq)
                output.append(out_seq)

                if len(input1) == batch_size:
                    yield (np.array(input1), np.array(input2)), np.array(output)
                    input1 = []
                    input2 = []
                    output = []

### Model

In [12]:
def captions_model(vocab, max_length):
    features_inputs = Input(shape=(8, 8, 2048))
    features_layer1 = Dropout(0.5)(features_inputs)
    features_flatten = Flatten()(features_layer1)
    features_layer2 = Dense(256, activation='relu')(features_flatten)

    captions_inputs = Input(shape=(max_length,))
    captions_embeddings = Embedding(vocab, 256, mask_zero=True)(captions_inputs)
    captions_lstm = LSTM(256, activation = 'tanh', return_sequences = False)(captions_embeddings)

    decoder1 = Concatenate()([features_layer2, captions_lstm])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab, activation='softmax')(decoder2)

    model = Model(inputs=[features_inputs, captions_inputs], outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
    

In [19]:
tokenizer = pickle.load(open('tokenizer.pickle', 'rb'))

In [20]:
len_vocab = len(tokenizer.word_index) + 1
max_length = 60
model = captions_model(len_vocab, max_length)
print(model.summary())

None


### Splitting dataset

In [21]:
from sklearn.model_selection import train_test_split

captions_train, captions_val = train_test_split(captions, test_size=0.2, random_state=42)
captions_train = captions_train.reset_index(drop=True)
captions_val = captions_val.reset_index(drop=True)
images_features_train = {k: v for k, v in images_features.items() if k in captions_train['Image File'].values}
images_features_val = {k: v for k, v in images_features.items() if k in captions_val['Image File'].values}

### Training

In [22]:
batch_size = 64
data_generator_train = generate_data(captions_train, images_features_train, max_length, len_vocab, batch_size)
data_generator_val = generate_data(captions_val, images_features_val, max_length, len_vocab, batch_size)
steps_per_epoch = len(captions_train) // batch_size
validation_steps = len(captions_val) // batch_size

model.fit(data_generator_train, epochs=50, steps_per_epoch=steps_per_epoch, validation_data=data_generator_val, validation_steps=validation_steps, verbose=1)

Epoch 1/50




[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 437ms/step - loss: 7.0835 - val_loss: 6.2421
Epoch 2/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 444ms/step - loss: 6.2360 - val_loss: 6.0297
Epoch 3/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 459ms/step - loss: 6.1470 - val_loss: 6.0426
Epoch 4/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 489ms/step - loss: 6.0845 - val_loss: 5.9940
Epoch 5/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 499ms/step - loss: 5.9274 - val_loss: 6.0149
Epoch 6/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 522ms/step - loss: 5.9513 - val_loss: 5.8930
Epoch 7/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 513ms/step - loss: 5.6580 - val_loss: 6.0105
Epoch 8/50
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 569ms/step - loss: 5.7737 - val_loss: 5.9146
Epoch 9/50
[1m232/

<keras.src.callbacks.history.History at 0x171270ffb10>

### Prediction

In [None]:
def top_k_sampling(predictions, k=5):
    k = min(k, len(predictions))
    top_k_indices = np.argsort(predictions)[-k:]
    top_k_probs = predictions[top_k_indices]
    top_k_probs = top_k_probs / np.sum(top_k_probs)
    chosen_index = np.random.choice(top_k_indices, p=top_k_probs)
    return chosen_index

def generate_caption(model, tokenizer, photo, max_length, k=5):
    start_token = '<start>'
    end_token = '<end>'
    in_text = start_token
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        predictions = model.predict([photo, sequence], verbose=0)[0]
        
        prediction = top_k_sampling(predictions, k)
        word = tokenizer.index_word.get(prediction, None)

        if word == end_token and in_text == start_token:
            continue
        if word == in_text[-1]:
            break
        if word is None:
            break
        in_text += ' ' + word
        if word == end_token:
            break
        
    return in_text

(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
Image: C:/Users/sonia/Desktop/ig-caption-gen/instagram_data/img/my_img10.jpg
Caption: <start> my day matching <end>



In [None]:
image_path = "C:/Users/sonia/Desktop/ig-caption-gen/instagram_data/img/my_img10.jpg"
photo = extract_features(image_path)
caption = generate_caption(model, tokenizer, photo, 60)
print(f"Image: {image_path}")   
print(f"Caption: {caption}")
print()

In [24]:
model.save('models/caption_model5.keras')