### Loading data

In [None]:
from datasets import load_from_disk

dataset = load_from_disk("C:/Users/sonia/Desktop/ig-caption-gen/huggingface_dataset2")

### Loading data from Hugging Face dataset

In [None]:
train_data = dataset["train"]
test_data = dataset["test"]

train_data = train_data.select(range(10000))

# Preview the first few examples
print(train_data[:5])
print(test_data[:5])

DatasetDict({
    train: Dataset({
        features: ['image', 'item_id', 'image_path', 'caption'],
        num_rows: 28360
    })
    test: Dataset({
        features: ['image', 'item_id', 'image_path', 'caption'],
        num_rows: 3152
    })
})
{'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=612x612 at 0x28D282C7B90>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1080x810 at 0x28D7BF4A890>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=840x941 at 0x28D282C73D0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1080x1284 at 0x28D282FE690>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x640 at 0x28D282AADD0>], 'item_id': [23177, 28216, 31221, 8318, 1749], 'image_path': ['ig_image_with_caption/insta23177.jpg', 'ig_image_with_caption/insta28216.jpg', 'ig_image_with_caption/insta31221.jpg', 'ig_image_with_caption/insta8318.jpg', 'ig_image_with_caption/insta1749.jpg'], 'caption': ['Rows on rows on rows', '@deepikapadukone poses with h

### Converting to a dataframe

In [None]:
import pandas as pd

def create_csv(dataset,  output_csv_path):
    # extracting relevant columns
    image_paths = dataset["image_path"]
    captions = dataset["caption"]
    
    # Create a DataFrame
    df = pd.DataFrame({
        "Image File": image_paths,
        "Caption": captions
    })
    
    df.to_csv(output_csv_path, index=False)
    print(f"CSV file saved at: {output_csv_path}")

create_csv(train_data, "train_data.csv")
create_csv(test_data, "test_data.csv")

                             Image File  \
0  ig_image_with_caption/insta23177.jpg   
1  ig_image_with_caption/insta28216.jpg   
2  ig_image_with_caption/insta31221.jpg   
3   ig_image_with_caption/insta8318.jpg   
4   ig_image_with_caption/insta1749.jpg   

                                             Caption  
0                               Rows on rows on rows  
1  @deepikapadukone poses with her team, in Madri...  
2                                   "eyes up here" 💋  
3          I guess she wasn’t feeling this fitting!   
4  Is this a good pic? HAHHAHHH DO YOU GET IT OH ...  
CSV file saved at: train_data.csv
                             Image File  \
0   ig_image_with_caption/insta1344.jpg   
1   ig_image_with_caption/insta1564.jpg   
2   ig_image_with_caption/insta4161.jpg   
3  ig_image_with_caption/insta20507.jpg   
4  ig_image_with_caption/insta27437.jpg   

                                             Caption  
0                                       After hours   
1  DIREC

### Filtering and tokenization

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
import json
import nltk
import re
import pandas as pd

In [None]:
df = pd.read_csv("C:/Users/sonia/Desktop/ig-caption-gen/src/train_data.csv")
bad_words = ['fuck', 'bitch', 'hoe', 'motherfucker']

start_token = '<start>'
end_token = '<end>'

def contains_date(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'DATE':
            return True
    return False

def remove_emojis(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def remove_unwanted_chars(text):
    text = text.replace(r',|.', '')
    text = text.replace(r' +|\t', ' ')
    text = text.replace(r'*', '')
    text = text.replace(r':)', '')
    text = text.replace(r'@', '')
    text = text.replace(r'#', '')
    text = text.replace(r'"', '')
    return text

def tokenize_caption(text):
    text = text.lower()
    text = text.replace(r' +|\t', ' ')
    text = text.replace(r'*', '')
    text = text.replace(r':)', '')
    tokens = nltk.word_tokenize(text)
    tokens = [start_token] + tokens + [end_token]
    return tokens


data = df.dropna(subset=['Caption'])
data = data[['Image File', 'Caption']]
data = data.drop_duplicates(subset=['Caption'])
data = data[data['Caption'].apply(len) < 60]
data['Caption'] = data['Caption'].apply(remove_emojis)
data['Caption'] = data['Caption'].apply(remove_unwanted_chars)
data = data[~data['Caption'].str.contains(r'http\S+|www\S+|#|@|\d{2,}|\n', regex=True)]
data = data[~data['Caption'].apply(contains_date)]
data = data[data['Caption'].str.strip() != '']
data = data[~data['Caption'].str.contains('|'.join(bad_words), case=False)]
data['Tokenized Caption'] = data['Caption'].apply(tokenize_caption)

data.to_csv('caption_data_tokenized_train.csv', sep=',', index=False, encoding='utf-8')

### Encoding using the saved tokenizer

In [1]:
import pickle
tokenizer = pickle.load(open('tokenizer.pickle', 'rb'))

In [2]:
import ast

In [5]:
captions = pd.read_csv("caption_data_tokenized_train.csv")
tokenized_captions = captions['Tokenized Caption'].apply(ast.literal_eval)
encoded_captions = tokenizer.texts_to_sequences(tokenized_captions)
captions['Encoded Caption'] = encoded_captions
captions.to_csv('caption_data_encoded_train.csv', sep=',', index=False, encoding='utf-8')

In [6]:
captions = pd.read_csv("caption_data_encoded_train.csv")
captions['Encoded Caption'] = captions['Encoded Caption'].apply(ast.literal_eval)

### Extracting features

In [9]:
import numpy as np
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img

In [10]:
model_incep = InceptionV3(include_top=False, weights='imagenet')

In [11]:
def extract_features(img):
    img_array = keras_image.img_to_array(img)
    print(img_array.shape)

    img_array = np.expand_dims(img_array, axis=0)

    img_array = preprocess_input(img_array)

    features = model_incep.predict(img_array)

    return features

In [16]:
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

images_features = {}
images = train_data['image']
img_paths = train_data['image_path']

In [17]:
for image, img_path in zip(images, img_paths):
    try:
        img_resized = image.resize((299, 299))
        features = extract_features(img_resized)
        images_features[img_path] = features

    except Exception as e:
        print(f"Corrupted or invalid image at {img_path}: {e}")

(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
(299, 299, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━

### Batch processing function

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
import random

def generate_data(captions, image_features, max_length, vocab_size, batch_size):
    while True:
        # Shuffle the data
        indices = list(range(len(captions)))
        random.shuffle(indices)
        
        input1 = []
        input2 = []
        output = []
        for i in indices:
            tokens = captions.loc[i]['Encoded Caption']
            img_id = captions.loc[i]['Image File']
            img_features = image_features[img_id]
            img_features = img_features.reshape(img_features.shape[1], img_features.shape[2], img_features.shape[3])
            for index in range(1, len(tokens)):
                in_seq, out_seq = tokens[:index], tokens[index]
                in_seq = pad_sequences([in_seq], maxlen=max_length, padding='pre')[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                input1.append(img_features)
                input2.append(in_seq)
                output.append(out_seq)

                if len(input1) == batch_size:
                    yield (np.array(input1), np.array(input2)), np.array(output)
                    input1 = []
                    input2 = []
                    output = []

### Loading model

In [24]:
import tensorflow
model = tensorflow.keras.models.load_model('models/caption_model5.keras')

### Splitting dataset

In [19]:
from sklearn.model_selection import train_test_split

captions_train, captions_val = train_test_split(captions, test_size=0.2, random_state=42)
captions_train = captions_train.reset_index(drop=True)
captions_val = captions_val.reset_index(drop=True)
images_features_train = {k: v for k, v in images_features.items() if k in captions_train['Image File'].values}
images_features_val = {k: v for k, v in images_features.items() if k in captions_val['Image File'].values}

In [22]:
max_length = 60
len_vocab = len(tokenizer.word_index) + 1

### Fine-tunning

In [27]:
batch_size = 64
data_generator_train = generate_data(captions_train, images_features_train, max_length, len_vocab, batch_size)
data_generator_val = generate_data(captions_val, images_features_val, max_length, len_vocab, batch_size)
steps_per_epoch = len(captions_train) // batch_size
validation_steps = len(captions_val) // batch_size

model.fit(data_generator_train, epochs=50, steps_per_epoch=steps_per_epoch, validation_data=data_generator_val, validation_steps=validation_steps, verbose=1)

Epoch 1/50




[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 690ms/step - loss: 9.3067 - val_loss: 6.7107
Epoch 2/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 656ms/step - loss: 6.6214 - val_loss: 6.2205
Epoch 3/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 610ms/step - loss: 6.3485 - val_loss: 6.1209
Epoch 4/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 602ms/step - loss: 6.1283 - val_loss: 6.2606
Epoch 5/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 622ms/step - loss: 6.0641 - val_loss: 6.1190
Epoch 6/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 616ms/step - loss: 6.1469 - val_loss: 6.1759
Epoch 7/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 612ms/step - loss: 5.8106 - val_loss: 6.1791
Epoch 8/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 638ms/step - loss: 5.5674 - val_loss: 5.9498
Epoch 9/50
[1m74/74[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x28d345d6150>

### Saving the model

In [34]:
model.save('models/caption_model6.keras')