In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet18
# import matplotlib.pyplot as plt
from PIL import Image


from torch.autograd import Variable
from tqdm import tqdm
import os
import math
import pickle
import os

In [2]:
df = pd.read_csv(r"C:\Users\Shaikh Irfan\Documents\Ai Adeventures\Image_Captioning\Raw Data\captions.txt",sep=',')

In [3]:
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [4]:
df['cleaned_caption'] = df['caption'].apply(lambda x: ['<start>'] + [word.lower() if word.isalpha else ' ' for word in x.split(' ')] + ['<end>'])

In [5]:
df.head()

Unnamed: 0,image,caption,cleaned_caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,"[<start>, a, child, in, a, pink, dress, is, cl..."
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,"[<start>, a, girl, going, into, a, wooden, bui..."
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,"[<start>, a, little, girl, climbing, into, a, ..."
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,"[<start>, a, little, girl, climbing, the, stai..."
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,"[<start>, a, little, girl, in, a, pink, dress,..."


In [6]:
df['seq'] = df['cleaned_caption'].apply(lambda x: len(x))

In [7]:
max_seq = df['seq'].max()
max_seq

np.int64(40)

In [8]:
df['cleaned_caption'] = df['cleaned_caption'].apply(lambda x: x + ['pad']*(max_seq - len(x)))

In [9]:
df.drop('seq',axis=1,inplace=True)

In [10]:
word_list = df['cleaned_caption'].apply(lambda x : " ".join(x)).str.cat(sep = ' ').split(' ')
len(word_list)

1618200

In [11]:
word_dict = Counter(word_list)
len(word_dict)

8920

In [12]:
word_dict = sorted(word_dict,key = word_dict.get,reverse=True)


In [13]:
vocab_size = len(word_dict)
print(vocab_size)

8920


In [14]:
idx_to_word = {idx:word for idx,word in enumerate(word_dict)}
word_to_idx = {word:idx for idx,word in enumerate(word_dict)}

In [15]:
print(len(idx_to_word),len(word_to_idx))

8920 8920


In [16]:
df['seq'] = df['cleaned_caption'].apply(lambda x: [word_to_idx[word] for word in x])


In [17]:
df = df.sort_values(by='image')
train = df.iloc[:int(0.8*len(df))]
val = df.iloc[int(0.8*len(df)):int(0.9*len(df))]
test = df.iloc[int(0.9*len(df)):]
train.shape,val.shape,test.shape

((32364, 4), (4045, 4), (4046, 4))

In [18]:
unq_train_imgs = train[['image']].drop_duplicates()
unq_valid_imgs = val[['image']].drop_duplicates()
print(len(unq_train_imgs), len(unq_valid_imgs))

6473 810


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [20]:
class ImageTransform():
    def __init__(self, image_files, resize=(224, 224), normalize_mean=(0.485, 0.456, 0.406), normalize_std=(0.229, 0.224, 0.225)):
        self.image_files = image_files
        self.transform = transforms.Compose([
            transforms.Resize(resize),                          # Resize the image
            transforms.ToTensor(),                              # Convert to tensor
            transforms.Normalize(mean=normalize_mean, std=normalize_std)  # Normalize
        ])

    def __len__(self):
        return len(self.image_files)  # Return the number of images

    def __getitem__(self, idx):
        img_name = self.image_files.iloc[idx]['image']
        img_path = os.path.join('\..Raw Data\Images', img_name)  # Get the image path
        image = Image.open(img_path)  # Open the image
        transformed_image = self.transform(image)  # Apply the transformations
        return img_name, transformed_image

In [21]:
train_image = ImageTransform(unq_train_imgs)
train_image = DataLoader(train_image, batch_size=12, shuffle=True)

In [22]:
len(train_image)

540

In [23]:
val_image = ImageTransform(unq_valid_imgs)
val_image = DataLoader(val_image, batch_size=12, shuffle=True)
len(val_image)

68

In [24]:
resnet18 = resnet18(pretrained=True).to(device)
resnet18.eval()
list(resnet18._modules)



['conv1',
 'bn1',
 'relu',
 'maxpool',
 'layer1',
 'layer2',
 'layer3',
 'layer4',
 'avgpool',
 'fc']

In [25]:
resNet18Layer4 = resnet18._modules.get('layer4').to(device)
resNet18Layer4

Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Sequential(
      (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
      (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (1): BasicBlock(
    (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(512, eps=1

In [26]:
resNet18Layer4.eval()

Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Sequential(
      (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
      (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (1): BasicBlock(
    (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(512, eps=1

In [27]:
def get_vector(t_img):
    
    t_img = Variable(t_img)
    my_embedding = torch.zeros(1, 512, 7, 7)
    def copy_data(m, i, o):
        my_embedding.copy_(o.data)
    
    h = resNet18Layer4.register_forward_hook(copy_data)
    resnet18(t_img)
    
    h.remove()
    return my_embedding

In [28]:
import matplotlib.pyplot as plt
plt.imshow(train_image.dataset[0][1].permute(1, 2, 0))


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Shaikh Irfan\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Shaikh Irfan\AppData\Roaming\Python\Python311\site-packages\traitlets\config\application.py", line 1053, in launch_instance
    app.start()
  File "C:\Users\Shaikh Irfan\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelapp.py", line 737, in

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
extract_imgFtr_ResNet_train = {}
for image_name, t_img in tqdm(train_image):
    t_img = t_img.to(device)
    print(t_img)
    embdg = get_vector(t_img)
    print(embdg)
    
    extract_imgFtr_ResNet_train[image_name[0]] = embdg
    break
extract_imgFtr_ResNet_train

  0%|          | 0/540 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '\\..Raw Data\\Images\\3562169000_6aa7f1043d.jpg'