# IMAGE CAPTIONING

In [None]:
# Installs only torch. The cells below also import torchvision, PIL (Pillow),
# nltk and tensorflow, so those packages must already be available.
!pip install torch

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Load Pretrained ResNet Model (Feature Extractor)
# NOTE(review): newer torchvision releases replace `pretrained=True` with a
# `weights=` argument -- confirm against the installed version.
resnet = models.resnet50(pretrained=True)
# Rebuild the network from every child module except the final one, so a
# forward pass returns the activations that would have fed that final module
# (presumably the classification head -- confirm) rather than its output.
resnet = torch.nn.Sequential(*(list(resnet.children())[:-1]))  # Remove last layer
# Switch to evaluation mode; this model is only used for inference below.
resnet.eval()

# Tokenizer for captions
class SimpleTokenizer:
    """Word-level tokenizer mapping caption words to contiguous integer ids.

    Id layout, with ``n = min(vocab_size, number of distinct words)``:

    * ``0``        -> ``"<pad>"`` (also returned by ``encode`` for unknown words)
    * ``1 .. n``   -> the ``n`` most frequent words, most frequent first
    * ``n + 1``    -> ``"<start>"``
    * ``n + 2``    -> ``"<end>"``

    Because the ids are contiguous, ``len(word2idx)`` is a valid size for an
    embedding table or output layer: every id is strictly smaller than it.

    Attributes:
        word2idx: dict mapping token string -> id.
        idx2word: dict mapping id -> token string (inverse of ``word2idx``).
    """

    def __init__(self, captions, vocab_size=5000):
        """Build the vocabulary from an iterable of caption strings.

        Args:
            captions: iterable of caption strings; each is lower-cased and
                split with ``word_tokenize``.
            vocab_size: maximum number of distinct words to keep (the most
                frequent ones win). Special tokens are added on top of this.
        """
        words = [
            word
            for caption in captions
            for word in word_tokenize(caption.lower())
        ]
        self.word2idx = self._build_vocab(words, vocab_size)
        self.idx2word = {i: w for w, i in self.word2idx.items()}

    @staticmethod
    def _build_vocab(words, vocab_size):
        """Return a word -> id dict for ``words`` using the contiguous id layout."""
        most_common = Counter(words).most_common(vocab_size)
        word2idx = {"<pad>": 0}
        for idx, (word, _) in enumerate(most_common, start=1):
            word2idx[word] = idx
        # Number the markers after the words that were actually kept. Using
        # ``vocab_size`` here would leave a gap of unused ids whenever the
        # corpus has fewer distinct words than ``vocab_size``.
        word2idx["<start>"] = len(most_common) + 1
        word2idx["<end>"] = len(most_common) + 2
        return word2idx

    def encode(self, text):
        """Return the list of ids for ``text``; unknown words map to ``0``.

        NOTE(review): marker strings such as ``"<start>"`` embedded in ``text``
        go through ``word_tokenize`` like any other text and may not come back
        as a single token -- confirm. Prefer adding ``word2idx["<start>"]`` /
        ``word2idx["<end>"]`` to the id list directly.
        """
        return [self.word2idx.get(word, 0) for word in word_tokenize(text.lower())]

    def decode(self, indices):
        """Return the space-joined tokens for ``indices``.

        Ids that are not in the vocabulary contribute an empty string.
        """
        return " ".join([self.idx2word.get(i, "") for i in indices])

# Captioning Model (LSTM)
class ImageCaptioningModel(nn.Module):
    """LSTM caption decoder conditioned on a pre-extracted image feature vector.

    Token ids are turned into dense vectors by an embedding layer before they
    reach the LSTM (``nn.LSTM`` consumes float vectors of size ``embed_size``,
    not integer ids). The image features are projected to ``hidden_size`` and
    used as the LSTM's initial hidden state, so the predictions depend on the
    image.

    Args:
        embed_size: dimensionality of the word embeddings / LSTM input.
        hidden_size: dimensionality of the LSTM hidden state.
        vocab_size: number of distinct token ids; every id passed to
            ``forward`` must be in ``[0, vocab_size)``.
        feature_size: length of the flattened image feature vector per sample.
            Defaults to 2048.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, feature_size=2048):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.init_hidden = nn.Linear(feature_size, hidden_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Score the next token at every position of ``captions``.

        Args:
            features: float tensor holding one feature vector per sample; any
                shape that flattens to ``(batch, feature_size)``.
            captions: integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, seq_len, vocab_size)`` with
            unnormalised scores (logits).
        """
        batch_size = captions.size(0)
        # Accept (batch, C), (batch, C, 1, 1) or a single unbatched vector.
        flat_features = features.reshape(batch_size, -1)
        # nn.LSTM expects initial states shaped (num_layers, batch, hidden).
        h0 = torch.tanh(self.init_hidden(flat_features)).unsqueeze(0)
        c0 = torch.zeros_like(h0)
        embedded = self.embedding(captions)
        lstm_out, _ = self.lstm(embedded, (h0, c0))
        return self.fc(lstm_out)

# Feature Extraction Function
def extract_features(image_path):
    """Return the module-level ``resnet``'s output for one image as a NumPy array.

    The image is converted to RGB, resized to 224x224, scaled to ``[0, 1]`` by
    ``ToTensor`` and normalised with the ImageNet channel statistics that
    torchvision's pretrained classification models are trained with. It is
    then run through ``resnet`` without gradient tracking.

    Args:
        image_path: path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        ``numpy.ndarray`` with all size-1 dimensions of the network output
        removed.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        # Without this the pretrained weights see inputs on a different scale
        # from the one they were trained on.
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    # Context manager closes the underlying file once the pixels are read.
    with Image.open(image_path) as img:
        batch = transform(img.convert('RGB')).unsqueeze(0)
    with torch.no_grad():
        features = resnet(batch)
    return features.squeeze().numpy()

# Generate Caption Function
def generate_caption(model, image_features, tokenizer, max_length=20):
    """Greedily decode a caption for one image.

    Starting from the ``"<start>"`` id, the model is called repeatedly on the
    ids generated so far. The highest-scoring id at the last time step is
    appended each time, until ``"<end>"`` is predicted or ``max_length`` tokens
    have been produced.

    Args:
        model: callable ``model(features, captions)`` returning scores shaped
            ``(batch, seq_len, vocab)``; must provide ``eval()``.
        image_features: feature vector for a single image (array-like or
            tensor); a batch dimension is added here.
        tokenizer: object exposing ``word2idx`` (with ``"<start>"`` and
            ``"<end>"`` entries) and ``decode(ids)``.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated words joined by single spaces, without the start/end
        markers. Empty string if nothing was generated.
    """
    model.eval()
    # Work with ids directly. Re-encoding the caption text every step would
    # push the "<start>" marker through the word tokenizer, which is not
    # guaranteed to hand it back as one token.
    start_id = tokenizer.word2idx["<start>"]
    end_id = tokenizer.word2idx["<end>"]
    features = torch.as_tensor(image_features).unsqueeze(0)
    token_ids = [start_id]
    with torch.no_grad():  # inference only; no autograd graph needed
        for _ in range(max_length):
            scores = model(features, torch.tensor([token_ids]))
            # Only the last time step predicts the *next* token. A bare
            # argmax() would index into the flattened (batch, seq, vocab)
            # tensor instead.
            next_id = scores[0, -1].argmax().item()
            if next_id == end_id:
                break
            token_ids.append(next_id)
    return tokenizer.decode(token_ids[1:])

# Example Usage
# Three in-line captions, used here only to build a vocabulary.
captions_dataset = ["A cat sitting on a mat.", "A dog running in the park.", "A person riding a bicycle."]
tokenizer = SimpleTokenizer(captions_dataset)
# The model's vocabulary size is taken from the number of entries in the
# tokenizer's word -> id table.
model = ImageCaptioningModel(embed_size=256, hidden_size=512, vocab_size=len(tokenizer.word2idx))

# NOTE: `model` is constructed above and no training step runs in this cell,
# so the caption printed below comes from untrained weights.
image_path = "example.jpg"
image_features = extract_features(image_path)
generated_caption = generate_caption(model, image_features, tokenizer)
print("Generated Caption:", generated_caption)


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, add

# Load Pre-trained ResNet50 model
# NOTE: this rebinds the module-level name `resnet`; if the PyTorch cell has
# been run, its model stored under the same name is replaced.
resnet = ResNet50(weights='imagenet')
# Build a model that shares ResNet50's input but stops at its second-to-last
# layer, so predictions are that layer's activations rather than the final
# layer's output.
feature_extractor = Model(inputs=resnet.input, outputs=resnet.layers[-2].output)

def extract_features(image_path):
    """Return the ``feature_extractor`` prediction for the image at ``image_path``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, passed through ResNet50's ``preprocess_input`` and finally fed
    to ``feature_extractor.predict`` (silently, ``verbose=0``).
    """
    pixels = img_to_array(load_img(image_path, target_size=(224, 224)))
    batch = pixels[np.newaxis, ...]  # add the batch dimension
    prepared = tf.keras.applications.resnet50.preprocess_input(batch)
    return feature_extractor.predict(prepared, verbose=0)

# Sample text processing functions
def preprocess_text(text):
    """Lower-case ``text`` and drop every character that is not a-z or whitespace.

    ``str.replace`` matches its first argument literally, so a pattern such as
    ``'[^a-z]'`` must go through ``re.sub`` to act as a character class.
    Whitespace is kept so that words stay separated.

    Args:
        text: the raw caption string.

    Returns:
        The cleaned string; digits and punctuation are removed, spacing is
        left as is.
    """
    import re  # local import keeps this block self-contained

    return re.sub(r'[^a-z\s]', '', text.lower())

def create_tokenizer(captions):
    """Return a new Keras ``Tokenizer`` fitted on ``captions``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)
    return fitted

# Define captioning model
def build_model(vocab_size, max_length):
    """Build and compile the two-input caption model.

    Inputs:
        * an image feature vector of shape ``(2048,)``
        * a token-id sequence of shape ``(max_length,)``

    The image branch (Dense 256 + Dropout 0.5) and the text branch
    (Embedding with ``mask_zero=True`` + LSTM 256) are summed element-wise,
    passed through a Dense(256, relu) layer and a softmax over ``vocab_size``.

    Returns:
        The model, compiled with categorical cross-entropy loss and the Adam
        optimizer.
    """
    # Image branch
    image_input = tf.keras.Input(shape=(2048,))
    image_dense = Dense(256, activation='relu')(image_input)
    image_branch = Dropout(0.5)(image_dense)

    # Text branch
    caption_input = tf.keras.Input(shape=(max_length,))
    caption_embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_branch = LSTM(256)(caption_embedded)

    # Merge both branches and predict the next word
    merged = add([image_branch, caption_branch])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    captioner = Model(inputs=[image_input, caption_input], outputs=word_probs)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')
    return captioner

# Example Usage
# Runs only when this code is executed as the main module.
if __name__ == "__main__":
    image_path = "sample.jpg"  # Replace with an actual image path
    # Extract features for the image and report the shape of the result.
    features = extract_features(image_path)
    print("Extracted Features Shape:", features.shape)
