In [8]:
pip install pycocoevalcap

Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pycocotools>=2.0.2 (from pycocoevalcap)
  Downloading pycocotools-2.0.8-cp39-cp39-win_amd64.whl.metadata (1.1 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
   ---------------------------------------- 0.0/104.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/104.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/104.3 MB ? eta -:--:--
   ---------------------------------------- 0.3/104.3 MB ? eta -:--:--
   ---------------------------------------- 0.5/104.3 MB 728.2 kB/s eta 0:02:23
   ---------------------------------------- 0.5/104.3 MB 728.2 kB/s eta 0:02:23
   ---------------------------------------- 0.5/104.3 MB 728.2 kB/s eta 0:02:23
   ---------------------------------------- 0.8/104.3 MB 558.9 kB/s eta 0:03:06
   ---------------------------------------- 0.8/104.3 MB 558.9 kB/s eta 0:03:06
   --------------------------------------

In [1]:
import string 
import os
from PIL import Image
import numpy as np 
import tensorflow as tf
from tensorflow.keras.applications.resnet import  ResNet50
from tensorflow.keras.applications.xception import  Xception
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model ,load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import add
import matplotlib.pyplot as plt
import pickle
from googletrans import Translator
from gtts import gTTS

In [6]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

In [9]:
from pycocoevalcap.cider.cider import Cider  # Requires `pycocoevalcap`

# Testing:

In [2]:
def img_feature(path, cnn_model, target_size=(299, 299)):
    """Load an image and return the CNN feature vector for it.

    The image is converted to 3-channel RGB, resized to ``target_size``,
    given a leading batch axis and scaled from [0, 255] to [-1, 1] before
    being passed to ``cnn_model.predict``.

    Args:
        path: Location of the image file to open.
        cnn_model: Object exposing ``predict(batch)``; the notebook passes
            an Xception model.
        target_size: ``(width, height)`` the image is resized to. Defaults
            to the 299x299 size this notebook has always used.

    Returns:
        Whatever ``cnn_model.predict`` returns for the single-image batch,
        or ``None`` when the image cannot be loaded or processed.
    """
    try:
        # The context manager closes the file handle even if decoding fails.
        with Image.open(path) as raw_image:
            # Grayscale, palette and RGBA files would otherwise produce an
            # array with the wrong number of channels.
            rgb_image = raw_image.convert('RGB').resize(target_size)

        batch = np.expand_dims(np.asarray(rgb_image), axis=0)
        batch = batch / 127.5 - 1.0

        return cnn_model.predict(batch)
    except Exception as exc:
        # Still best-effort, but KeyboardInterrupt/SystemExit are no longer
        # swallowed and the underlying cause is reported.
        print(f"cannot read the image: {exc}")
        return None

def get_word(indx, tokenizer):
    """Look up the vocabulary word stored under a token index.

    Returns the entry of ``tokenizer.index_word`` for ``indx``, or an empty
    string when the index has no entry.
    """
    vocabulary = tokenizer.index_word
    word = vocabulary.get(indx, '')
    return word

def generate_captions(model, tokenizer, imgFeature, longest_cap,
                      start_token='start', end_token='end'):
    """Greedily decode a caption for one image.

    Starting from ``start_token``, the text generated so far is encoded with
    ``tokenizer.texts_to_sequences``, padded to ``longest_cap`` and passed to
    ``model.predict`` together with ``imgFeature``; the arg-max index of the
    prediction is mapped back to a word with ``get_word``.

    Args:
        model: Object exposing ``predict([imgFeature, seq])``.
        tokenizer: Object exposing ``texts_to_sequences`` and ``index_word``.
        imgFeature: Image features passed unchanged to ``model.predict``.
        longest_cap: Maximum number of decoding steps; also the padded
            sequence length.
        start_token: Word that seeds the sequence (not part of the result).
        end_token: Word that terminates decoding (not part of the result).

    Returns:
        The generated words joined by single spaces; ``''`` if nothing was
        generated.
    """
    words = []
    for _ in range(longest_cap):
        text_so_far = ' '.join([start_token] + words)
        seq = tokenizer.texts_to_sequences([text_so_far])[0]
        seq = pad_sequences([seq], maxlen=longest_cap)
        predict = model.predict([imgFeature, seq])
        word = get_word(np.argmax(predict), tokenizer)
        # An index with no vocabulary entry yields ''. Appending it would
        # leave the encoded input unchanged, so every remaining step would
        # repeat the same prediction and only add trailing spaces.
        if not word or word == end_token:
            break
        words.append(word)
    return ' '.join(words)

def translate_and_speak(text, dest_lang="ar", audio_file="D:\\test\\translated_audio.mp3"):
    """Translate ``text``, print the translation, and play it as speech.

    Args:
        text: Text to translate.
        dest_lang: Language code passed to both the translator and gTTS.
        audio_file: Path the synthesized MP3 is written to. Defaults to the
            location this notebook has always used.

    Returns:
        None. The translation is printed, saved as audio and opened with
        the platform's default handler.
    """
    # Nothing to translate or speak for an empty / whitespace-only caption.
    if not text or not text.strip():
        print("No text to translate.")
        return

    # Translate text
    translator = Translator()
    translation = translator.translate(text, dest=dest_lang)
    translated_text = translation.text
    print(f"Translated Text: {translated_text}")

    # Convert translated text to speech
    tts = gTTS(translated_text, lang=dest_lang)
    tts.save(audio_file)

    # Play the audio file without building a shell command string, so paths
    # containing spaces or shell metacharacters are handled correctly.
    if hasattr(os, "startfile"):
        os.startfile(audio_file)  # Windows
    else:
        import subprocess
        import sys

        opener = "open" if sys.platform == "darwin" else "xdg-open"
        subprocess.run([opener, audio_file], check=False)

def generate_cap(img_path, max_cap=35):
    """Caption an image, print the caption, then translate and speak it.

    Uses the notebook-level ``cnn_model``, ``cap_model`` and ``tokenizer``
    objects, which must be defined before this function is called.

    Args:
        img_path: Path of the image to caption.
        max_cap: Maximum caption length passed to ``generate_captions``.

    Returns:
        None. Results are printed; a failure message is printed when the
        image features could not be extracted.
    """
    imgFeature = img_feature(img_path, cnn_model)
    if imgFeature is None:
        print("Failed to generate caption.")
        return

    caption = generate_captions(cap_model, tokenizer, imgFeature, max_cap)
    print("Generated Caption (English):", caption)
    # Translate and speak the caption
    translate_and_speak(caption)



In [3]:
# Feature extractor: Xception without its classification head, with global
# average pooling applied to the final feature map.
cnn_model = Xception(include_top=False, pooling='avg')
# Trained captioning model saved in HDF5 format.
cap_model = load_model('D:\\test\\model_15.h5')
# SECURITY NOTE: pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source. The with-block closes the handle.
with open('D:\\test\\tokenizer', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [5]:
# Caption one sample image. On success generate_cap prints the English
# caption and hands it to translate_and_speak; otherwise it prints a
# failure message.
img_path = 'D:\\Image Caption Test\\p.jpg'
generate_cap(img_path)

Generated Caption (English): man in red shirt and glasses is standing next to an unpainted table
Translated Text: رجل يرتدي قميصًا ونظارات حمراء يقف بجانب طاولة غير مصممة


In [None]:
# Evaluate the captioning model with corpus-level BLEU-4 and CIDEr.
def evaluate_model(model, features, tokens, max_length, tokenizer=None):
    """Score generated captions against reference captions.

    Args:
        model: Captioning model passed to ``generate_captions``.
        features: Mapping from image id to stored image features; the first
            element of each entry is used.
        tokens: Mapping from image id to an iterable of reference caption
            strings; ``[CLS]`` / ``[SEP]`` markers are stripped from them.
        max_length: Maximum caption length passed to ``generate_captions``.
        tokenizer: Tokenizer passed to ``generate_captions``. When omitted,
            the notebook-level ``bert_tokenizer`` is used, as before.

    Returns:
        ``(bleu4, cider_score)``: the ``corpus_bleu`` result with its default
        weights and the corpus score from ``Cider.compute_score``.

    Raises:
        ValueError: If no image id in ``tokens`` has both an entry in
            ``features`` and at least one reference caption.
    """
    # Only images with features AND at least one reference can be scored.
    scorable = [
        img for img, captions in tokens.items()
        if img in features and len(captions) > 0
    ]
    if not scorable:
        raise ValueError(
            "evaluate_model: no image in `tokens` has both extracted "
            "features and reference captions"
        )

    if tokenizer is None:
        # NOTE(review): bert_tokenizer is not defined in the visible cells;
        # it must exist in the kernel, or pass `tokenizer=` explicitly.
        tokenizer = bert_tokenizer

    hypotheses = {}
    references = {}
    for img in scorable:
        # generate_captions feeds the features to model.predict next to a
        # batch-of-one sequence, so keep (or restore) a leading batch axis.
        # NOTE(review): assumes features[img] is stored as (1, dim); confirm.
        image_vector = np.atleast_2d(features[img][0])
        caption = generate_captions(model, tokenizer, image_vector, max_length)
        hypotheses[img] = [caption]
        # Prepare references (remove [CLS] and [SEP])
        references[img] = [
            cap.replace('[CLS]', '').replace('[SEP]', '').strip()
            for cap in tokens[img]
        ]

    # Compute BLEU score; both lists follow the order of `scorable` so each
    # hypothesis lines up with its own reference set.
    refs_for_bleu = [[ref.split() for ref in references[img]] for img in scorable]
    hyps_for_bleu = [hypotheses[img][0].split() for img in scorable]
    bleu4 = corpus_bleu(refs_for_bleu, hyps_for_bleu)

    # Compute CIDEr score
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(references, hypotheses)

    return bleu4, cider_score