In [23]:
import os
import random
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu

# Fetch the pretrained captioning checkpoint: the model first, then its image
# processor, then its tokenizer.
model, processor, tokenizer = (
    component.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    for component in (VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer)
)

# Generation-related settings stored on the model config: the two special-token
# ids are copied from the tokenizer, then max_length and num_beams are set.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length, model.config.num_beams = 16, 4

image_directory = "Images/"
ground_truth_file = "captions.txt/captions_8k.txt"

# Maps image file name -> list of tokenised reference captions. Every caption
# listed for an image is kept (instead of letting a later line overwrite an
# earlier one), so BLEU can score against all references that are available.
ground_truth_captions = {}
try:
    with open(ground_truth_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first comma only; the caption text may contain commas.
            image_name, separator, caption = line.partition(",")
            if not separator:
                print(f"Skipping malformed line: {line}")
                continue
            # Lower-cased so reference tokens compare equal to the hypothesis
            # tokens, which are lower-cased the same way below.
            reference_tokens = caption.lower().split()
            if reference_tokens:
                ground_truth_captions.setdefault(image_name, []).append(reference_tokens)
except FileNotFoundError:
    print(f"Ground truth captions file not found: {ground_truth_file}")
    # Re-raise rather than calling exit(): the cell stops with a traceback
    # and the kernel session stays usable.
    raise

image_files = [f for f in os.listdir(image_directory) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
# Sorted input + a fixed seed: re-running the cell evaluates the same images.
random_images = random.Random(42).sample(sorted(image_files), min(100, len(image_files)))

generated_captions = []   # every generated caption (str), in processing order
scored_captions = []      # tokenised captions that have at least one reference
reference_captions = []   # reference lists, index-aligned with scored_captions

for image_file in random_images:
    image_path = os.path.join(image_directory, image_file)

    try:
        # convert("RGB") gives the processor 3-channel input even for
        # grayscale / palette / RGBA files; the `with` closes the file handle.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").pixel_values
        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort: report the failure and move on to the next image.
        print(f"Error processing {image_file}: {e}")
        continue

    print(f"Image: {image_file}")
    print(f"Generated Caption: {caption}")
    print("-" * 50)

    generated_captions.append(caption)

    # Only images with ground truth take part in scoring. Hypothesis and
    # references are appended together so both lists always line up.
    references = ground_truth_captions.get(image_file)
    if references:
        scored_captions.append(caption.lower().split())
        reference_captions.append(references)

# Calculate BLEU score
if scored_captions:
    bleu_score = corpus_bleu(reference_captions, scored_captions)
    print(f"\nBLEU Score: {bleu_score:.2f}")
else:
    print("No captions generated or ground-truth captions available for evaluation.")


Skipping malformed line: caption




Image: 3442272060_f9155194c2.jpg
Generated Caption: two dogs are playing with each other in a field 
--------------------------------------------------
Image: 482353373_03a9d5e8bc.jpg
Generated Caption: a woman standing next to a man in a restaurant 
--------------------------------------------------
Image: 947969010_f1ea572e89.jpg
Generated Caption: a dog in the water with a frisbee in its mouth 
--------------------------------------------------
Image: 1659358133_95cd1027bd.jpg
Generated Caption: a black and white elephant jumping over a log 
--------------------------------------------------
Image: 2646540383_343e1ec9a4.jpg
Generated Caption: a man sitting on top of a surfboard in a pool 
--------------------------------------------------
Image: 3351360323_91bb341350.jpg
Generated Caption: a man in a suit and tie playing a guitar 
--------------------------------------------------
Image: 2273028514_d7b584f73d.jpg
Generated Caption: a woman is looking at her cell phone 
-------------

In [3]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import os
import random
from PIL import Image
import string

# Fetch the pretrained captioning checkpoint: the model first, then its image
# processor, then its tokenizer.
model, processor, tokenizer = (
    component.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    for component in (VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer)
)

# Generation-related settings stored on the model config: special-token ids
# come from the tokenizer; max_length, num_beams and early_stopping follow.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length, model.config.num_beams = 32, 10
model.config.early_stopping = True

image_directory = "Images/"
ground_truth_file = "captions_8k.txt"

# Built once and reused for every caption (it was previously rebuilt per call).
punctuation_table = str.maketrans("", "", string.punctuation)

# Maps image file name -> list of normalised, tokenised reference captions.
# All captions listed for an image are kept rather than only the last one.
ground_truth_captions = {}
with open(ground_truth_file, "r", encoding="utf-8") as f:
    for line in f:
        if "," in line:
            # Split on the first comma only; the caption text may contain commas.
            image_name, caption = line.strip().split(",", 1)
            reference_tokens = caption.lower().translate(punctuation_table).split()
            if reference_tokens:
                ground_truth_captions.setdefault(image_name, []).append(reference_tokens)

image_files = [f for f in os.listdir(image_directory) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
# Sorted input + a fixed seed: re-running the cell evaluates the same images.
random_images = random.Random(42).sample(sorted(image_files), min(10, len(image_files)))

# Generate captions and prepare for BLEU score
generated_captions = []   # tokenised caption for every processed image
scored_captions = []      # tokenised captions that have at least one reference
reference_captions = []   # reference lists, index-aligned with scored_captions

for image_file in random_images:
    image_path = os.path.join(image_directory, image_file)

    try:
        # The `with` closes the file handle; convert() returns a separate RGB image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").pixel_values
        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().translate(punctuation_table)
    except Exception as e:
        # Best effort: report the failure and move on to the next image.
        print(f"Error processing {image_file}: {e}")
        continue

    print(f"Image: {image_file}")
    print(f"Generated Caption: {caption}")
    print("-" * 50)

    caption_tokens = caption.split()
    generated_captions.append(caption_tokens)

    # Hypothesis and references are appended together so corpus_bleu always
    # receives two lists of equal length that refer to the same images.
    references = ground_truth_captions.get(image_file)
    if references:
        scored_captions.append(caption_tokens)
        reference_captions.append(references)

# Calculate BLEU score
if scored_captions:
    smoothing_function = SmoothingFunction().method1
    bleu_score = corpus_bleu(reference_captions, scored_captions, smoothing_function=smoothing_function)
    # Reported as returned by corpus_bleu (a value between 0 and 1); the former
    # "*10" scaling produced a number that was neither a fraction nor a percentage.
    print(f"\nBLEU Score: {bleu_score:.4f}")
else:
    print("No captions generated or ground-truth captions available for evaluation.")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

Image: 3674521435_89ff681074.jpg
Generated Caption: a man flying through the air while riding a skateboard 
--------------------------------------------------
Image: 2483993827_243894a4f9.jpg
Generated Caption: a man riding on the back of a white horse 
--------------------------------------------------
Image: 1207159468_425b902bfb.jpg
Generated Caption: a man standing on top of a snow covered slope 
--------------------------------------------------
Image: 3098707588_5096d20397.jpg
Generated Caption: two men in suits posing for a picture 
--------------------------------------------------
Image: 2273105617_7c73d2d2d3.jpg
Generated Caption: a man that is standing in front of a door 
--------------------------------------------------
Image: 2213113526_beeb4f9bdc.jpg
Generated Caption: a woman sitting on a bench next to a dog 
--------------------------------------------------
Image: 528498076_43f0ef36b5.jpg
Generated Caption: a young boy standing on top of a bed 
-----------------------

In [None]:
# %pip install transformers torch torchvision pillow

from PIL import Image
import os
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import logging

# Only ERROR and above from the "transformers" logger is shown from here on.
logging.getLogger("transformers").setLevel(level=logging.ERROR)

# Fetch the pretrained captioning checkpoint: the model first, then its image
# processor, then its tokenizer.
model, processor, tokenizer = (
    component.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    for component in (VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer)
)

# Generation-related settings stored on the model config: the two special-token
# ids are copied from the tokenizer, then max_length and num_beams are set.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length, model.config.num_beams = 16, 4

# Braille cells for lowercase letters, the space and the ten digits. Each digit
# entry is the number sign "⠼" followed by the cell of the matching a-j letter.
braille_dict = {
    'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑',
    'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚',
    'k': '⠅', 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕',
    'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞',
    'u': '⠥', 'v': '⠧', 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵',
    ' ': ' ', '1': '⠼⠁', '2': '⠼⠃', '3': '⠼⠉', '4': '⠼⠙',
    '5': '⠼⠑', '6': '⠼⠋', '7': '⠼⠛', '8': '⠼⠓', '9': '⠼⠊', '0': '⠼⠚'
}

_NUMBER_SIGN = '⠼'        # prefix carried by every digit entry above
_GRADE1_INDICATOR = '⠰'   # marks a following a-j cell as a letter, not a digit
_DIGITS = frozenset('0123456789')
_DIGIT_SHAPED_LETTERS = frozenset('abcdefghij')


def text_to_braille(text):
    """Translate ``text`` into braille cells, one character at a time.

    Characters are lower-cased before lookup in ``braille_dict``; anything
    without an entry (punctuation, accented letters, ...) becomes ``'?'``.

    A run of consecutive digits gets a single number sign in front of the
    first digit instead of one per digit. Because digits reuse the a-j cells,
    a letter a-j that directly follows a digit is prefixed with the grade 1
    indicator so it cannot be read as a continuation of the number.

    Args:
        text: The string to translate.

    Returns:
        The braille string; ``''`` for empty input.
    """
    cells = []
    in_number = False
    for char in text:
        key = char.lower()
        cell = braille_dict.get(key, '?')
        if key in _DIGITS:
            # Inside a digit run the number sign has already been written.
            if in_number and cell.startswith(_NUMBER_SIGN):
                cell = cell[len(_NUMBER_SIGN):]
            in_number = True
        else:
            if in_number and key in _DIGIT_SHAPED_LETTERS:
                cell = _GRADE1_INDICATOR + cell
            in_number = False
        cells.append(cell)
    return ''.join(cells)

def generate_caption_and_braille(image_path):
    """Caption one image file and print the caption with its braille rendering.

    The image is opened, converted to RGB, preprocessed, shown with
    ``image.show()`` and captioned with the module-level ``processor``,
    ``model`` and ``tokenizer``. The caption is then passed through
    ``text_to_braille``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        None. Results are printed; any exception is caught and reported as a
        printed message instead of being raised.
    """
    try:
        # convert("RGB") hands the processor 3-channel input even when the file
        # is grayscale, palette-based or has an alpha channel. The `with`
        # closes the file handle; the converted copy remains usable afterwards.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").pixel_values
        image.show()

        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)

        braille_text = text_to_braille(caption)

        print(f"Generated Caption for {os.path.basename(image_path)}: {caption}")
        print(f"Braille Output: {braille_text}")
    except Exception as e:
        print(f"Error processing the image {os.path.basename(image_path)}: {e}")

# Ask for the image location. Surrounding whitespace and quote characters are
# removed because a pasted path often carries them and would then fail to open.
local_image_path = input("Enter the relative or absolute path to the image (e.g., 'image.jpg'): ").strip().strip("\"'")

generate_caption_and_braille(local_image_path)








Generated Caption for imgg3.jpg: a green plastic bottle sitting on top of a wooden table 
Braille Output: ⠁ ⠛⠗⠑⠑⠝ ⠏⠇⠁⠎⠞⠊⠉ ⠃⠕⠞⠞⠇⠑ ⠎⠊⠞⠞⠊⠝⠛ ⠕⠝ ⠞⠕⠏ ⠕⠋ ⠁ ⠺⠕⠕⠙⠑⠝ ⠞⠁⠃⠇⠑ 


In [3]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pretrained captioning checkpoint (model, image processor, tokenizer
# in that order), then move the model to the selected device.
model, processor, tokenizer = (
    component.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    for component in (VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer)
)
model = model.to(device)

class ImageCaptionDataset(Dataset):
    """Dataset of (preprocessed image tensor, caption) pairs read from a caption file.

    The caption file is parsed as a header-less, comma-separated file whose two
    columns are named ``filename`` and ``caption``. Rows that cannot be turned
    into a usable sample are returned as ``(None, None)`` so that a collate
    function can drop them.

    Args:
        captions_file: Path of the comma-separated caption file.
        image_dir: Directory the ``filename`` column is resolved against.
        processor: Callable used as ``processor(images=..., return_tensors="pt")``
            whose result exposes ``.pixel_values``.
    """

    def __init__(self, captions_file, image_dir, processor):
        self.image_dir = image_dir
        self.processor = processor
        self.captions_df = pd.read_csv(captions_file, names=["filename", "caption"])

    def __len__(self):
        """Return the number of rows in the caption file."""
        return len(self.captions_df)

    def __getitem__(self, idx):
        """Return ``(pixel_values, caption)`` for row ``idx`` or ``(None, None)``.

        ``(None, None)`` is returned when the row has no usable file name or
        caption, when the image file does not exist, or when loading or
        preprocessing the image fails (the failure is logged as a warning).
        """
        row = self.captions_df.iloc[idx]
        filename, caption = row['filename'], row['caption']

        # Rows with a missing field hold NaN (a float) in that column. A
        # non-string file name would make os.path.join raise, and a non-string
        # caption would end up in the results, so both are treated as unusable.
        if not isinstance(filename, str) or not isinstance(caption, str):
            return None, None

        image_path = os.path.normpath(os.path.join(self.image_dir, filename))
        if not os.path.exists(image_path):
            return None, None

        try:
            # Load and preprocess the image; the `with` closes the file handle.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
            # squeeze(0) removes the leading dimension of size 1, if present.
            return pixel_values.squeeze(0), caption
        except Exception as exc:
            # Still best effort, but no longer silent about what was skipped.
            import logging  # function-scope: this cell does not import logging
            logging.getLogger(__name__).warning("Skipping %s: %s", image_path, exc)
            return None, None

def custom_collate_fn(batch):
    """Return the (image, caption) pairs of ``batch`` in which neither part is None.

    The result is a plain list of tuples in the original order; nothing is
    stacked or converted here.
    """
    kept = []
    for img, cap in batch:
        if img is None or cap is None:
            continue
        kept.append((img, cap))
    return kept

def generate_captions(model, dataloader, tokenizer):
    """Caption every batch from ``dataloader`` and pair each result with its original caption.

    Args:
        model: Model whose ``generate`` is called on the stacked image tensors.
        dataloader: Iterable yielding batches of ``(image_tensor, caption)`` pairs.
        tokenizer: Used to decode each generated sequence.

    Returns:
        A list of dicts with the keys ``"generated_caption"`` and
        ``"original_caption"``, one per usable sample.
    """
    model.eval()
    all_results = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing Batches"):
            # Keep only pairs in which both the image and the caption are present.
            usable = [(img, cap) for img, cap in batch if not (img is None or cap is None)]
            if len(usable) == 0:
                continue

            pixel_batch = torch.stack([img for img, _ in usable]).to(device)
            reference_texts = [cap for _, cap in usable]

            token_ids = model.generate(pixel_batch, max_length=64, num_beams=4)
            decoded = [tokenizer.decode(ids, skip_special_tokens=True) for ids in token_ids]

            all_results.extend(
                {"generated_caption": generated, "original_caption": reference}
                for reference, generated in zip(reference_texts, decoded)
            )
    return all_results

captions_file = r"captions_8k.txt"
# Resolved against the working directory, like the caption file above, instead
# of a machine-specific absolute path that only exists on one computer.
image_dir = "Images"

# Dataset and DataLoader
dataset = ImageCaptionDataset(captions_file, image_dir, processor)
# shuffle=False: this is an inference pass, so keeping the caption file's order
# makes the rows of the output CSV reproducible between runs.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)

results = generate_captions(model, dataloader, tokenizer)

output_csv = "generated_captions.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")


  from .autonotebook import tqdm as notebook_tqdm
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop":

Results saved to generated_captions.csv





In [5]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import subprocess

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pretrained captioning checkpoint (model, image processor, tokenizer
# in that order), then move the model to the selected device.
model, processor, tokenizer = (
    component.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    for component in (VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer)
)
model = model.to(device)

# Dataset class
class ImageCaptionDataset(Dataset):
    """Dataset of (preprocessed image tensor, caption) pairs read from a caption file.

    The caption file is parsed as a header-less, comma-separated file whose two
    columns are named ``filename`` and ``caption``. Rows that cannot be turned
    into a usable sample are returned as ``(None, None)`` so that a collate
    function can drop them.

    Args:
        captions_file: Path of the comma-separated caption file.
        image_dir: Directory the ``filename`` column is resolved against.
        processor: Callable used as ``processor(images=..., return_tensors="pt")``
            whose result exposes ``.pixel_values``.
    """

    def __init__(self, captions_file, image_dir, processor):
        self.image_dir = image_dir
        self.processor = processor
        self.captions_df = pd.read_csv(captions_file, names=["filename", "caption"])

    def __len__(self):
        """Return the number of rows in the caption file."""
        return len(self.captions_df)

    def __getitem__(self, idx):
        """Return ``(pixel_values, caption)`` for row ``idx`` or ``(None, None)``.

        ``(None, None)`` is returned when the row has no usable file name or
        caption, when the image file does not exist, or when loading or
        preprocessing the image fails (the failure is logged as a warning).
        """
        row = self.captions_df.iloc[idx]
        filename, caption = row['filename'], row['caption']

        # Rows with a missing field hold NaN (a float) in that column. A
        # non-string file name would make os.path.join raise, and a non-string
        # caption would end up in the results, so both are treated as unusable.
        if not isinstance(filename, str) or not isinstance(caption, str):
            return None, None

        image_path = os.path.normpath(os.path.join(self.image_dir, filename))
        if not os.path.exists(image_path):
            return None, None

        try:
            # Load and preprocess the image; the `with` closes the file handle.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
            # squeeze(0) removes the leading dimension of size 1, if present.
            return pixel_values.squeeze(0), caption
        except Exception as exc:
            # Skip problematic images, but say which one and why.
            import logging  # function-scope: this cell does not import logging
            logging.getLogger(__name__).warning("Skipping %s: %s", image_path, exc)
            return None, None

def custom_collate_fn(batch):
    """Filter ``batch`` down to the (image, caption) tuples where both parts are set.

    Order is preserved and the samples are returned as a list of tuples.
    """
    def _is_complete(img, cap):
        return img is not None and cap is not None

    return [(img, cap) for (img, cap) in batch if _is_complete(img, cap)]

def generate_captions(model, dataloader, tokenizer):
    """Run caption generation over ``dataloader`` and collect generated/original pairs.

    Args:
        model: Model whose ``generate`` is called on the stacked image tensors.
        dataloader: Iterable yielding batches of ``(image_tensor, caption)`` pairs.
        tokenizer: Used to decode each generated sequence.

    Returns:
        A list of dicts with the keys ``"generated_caption"`` and
        ``"original_caption"``, one per usable sample.
    """
    model.eval()
    all_results = []

    with torch.no_grad():
        progress = tqdm(dataloader, desc="Processing Batches")
        for batch in progress:
            # Drop any pair that is missing its image or its caption.
            complete_pairs = [
                (img, cap) for img, cap in batch
                if img is not None and cap is not None
            ]
            if complete_pairs:
                image_tensors, source_captions = zip(*complete_pairs)
                stacked = torch.stack(image_tensors).to(device)
                sequences = model.generate(stacked, max_length=64, num_beams=4)
                texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]

                for source_caption, text in zip(source_captions, texts):
                    record = {"generated_caption": text, "original_caption": source_caption}
                    all_results.append(record)
    return all_results

import sys  # local import: needed for sys.executable below

captions_file = r"captions_8k.txt"
# Resolved against the working directory, like the caption file above, instead
# of a machine-specific absolute path that only exists on one computer.
image_dir = "Images"

dataset = ImageCaptionDataset(captions_file, image_dir, processor)

# Run the helper script with the interpreter that runs this kernel; a bare
# "python" is whatever happens to be first on PATH. A non-zero exit status is
# reported instead of being ignored.
# NOTE(review): changes.py is not visible here. If it rewrites the caption
# file, the dataset above has already been read - confirm the intended order.
changes_run = subprocess.run([sys.executable, "changes.py"])
if changes_run.returncode != 0:
    print(f"changes.py exited with status {changes_run.returncode}")

# shuffle=False: this is an inference pass, so keeping the caption file's order
# makes the rows of the output CSV reproducible between runs.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)

results = generate_captions(model, dataloader, tokenizer)

output_csv = "generated_captions.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

Results saved to generated_captions.csv





In [1]:
import pickle

# Save model, processor, and tokenizer.
# The payload is assembled BEFORE the file is opened. This cell needs `model`,
# `processor` and `tokenizer` to exist in the kernel already; if one is missing,
# the NameError is now raised before open(..., "wb") has had a chance to
# truncate an existing "oo.pkl".
artifacts = {
    "model": model,
    "processor": processor,
    "tokenizer": tokenizer
}

with open("oo.pkl", "wb") as f:
    pickle.dump(artifacts, f)

print("Model, processor, and tokenizer saved to 'oo.pkl'")


NameError: name 'model' is not defined

In [2]:
%pip install flask

Note: you may need to restart the kernel to use updated packages.




In [None]:
from flask import Flask, request, jsonify
from PIL import Image
import pickle
import os
import base64
from io import BytesIO

# SECURITY: pickle.load can execute arbitrary code while loading. Only load an
# "oo.pkl" that was produced by this project itself.
try:
    with open("oo.pkl", "rb") as f:
        data = pickle.load(f)
    # The /generate route uses these three module-level names, so they are
    # unpacked here from the keys the pickle was saved with.
    model = data["model"]
    processor = data["processor"]
    tokenizer = data["tokenizer"]
except Exception as e:
    print(f"Error loading .pkl file: {e}")
    # Same effect as sys.exit(1), without needing `sys`, which this cell never imports.
    raise SystemExit(1)
print("Model, processor, and tokenizer loaded successfully.")

# Braille cells for lowercase letters, the space and the ten digits. Each digit
# entry is the number sign "⠼" followed by the cell of the matching a-j letter.
braille_dict = {
    'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑',
    'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚',
    'k': '⠅', 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕',
    'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞',
    'u': '⠥', 'v': '⠧', 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵',
    ' ': ' ', '1': '⠼⠁', '2': '⠼⠃', '3': '⠼⠉', '4': '⠼⠙',
    '5': '⠼⠑', '6': '⠼⠋', '7': '⠼⠛', '8': '⠼⠓', '9': '⠼⠊', '0': '⠼⠚'
}

_NUMBER_SIGN = '⠼'        # prefix carried by every digit entry above
_GRADE1_INDICATOR = '⠰'   # marks a following a-j cell as a letter, not a digit
_DIGITS = frozenset('0123456789')
_DIGIT_SHAPED_LETTERS = frozenset('abcdefghij')


def text_to_braille(text):
    """Translate ``text`` into braille cells, one character at a time.

    Characters are lower-cased before lookup in ``braille_dict``; anything
    without an entry (punctuation, accented letters, ...) becomes ``'?'``.

    A run of consecutive digits gets a single number sign in front of the
    first digit instead of one per digit. Because digits reuse the a-j cells,
    a letter a-j that directly follows a digit is prefixed with the grade 1
    indicator so it cannot be read as a continuation of the number.

    Args:
        text: The string to translate.

    Returns:
        The braille string; ``''`` for empty input.
    """
    cells = []
    in_number = False
    for char in text:
        key = char.lower()
        cell = braille_dict.get(key, '?')
        if key in _DIGITS:
            # Inside a digit run the number sign has already been written.
            if in_number and cell.startswith(_NUMBER_SIGN):
                cell = cell[len(_NUMBER_SIGN):]
            in_number = True
        else:
            if in_number and key in _DIGIT_SHAPED_LETTERS:
                cell = _GRADE1_INDICATOR + cell
            in_number = False
        cells.append(cell)
    return ''.join(cells)

app = Flask(__name__)


@app.route("/", methods=["GET"])
def home():
    """Answer GET requests on the root URL with a plain-text welcome message."""
    welcome_message = "Welcome to the Image Captioning and Braille Generation API!"
    return welcome_message

@app.route('/generate', methods=['POST'])
def generate_caption_and_braille():
    """Caption a base64-encoded image sent as JSON and return caption plus braille.

    Expects a JSON object with an ``image`` field holding the base64-encoded
    image bytes. Uses the module-level ``processor``, ``model`` and
    ``tokenizer`` and the ``text_to_braille`` helper.

    Returns:
        200 with ``{"caption": ..., "braille": ...}`` on success;
        400 with ``{"error": ...}`` when the request body or the image data is
        unusable; 500 with ``{"error": ...}`` when captioning itself fails.
    """
    # silent=True yields None instead of raising when the body is not valid
    # JSON. The local is named `payload` so it does not shadow the
    # module-level `data` that holds the unpickled objects.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'image' not in payload:
        return jsonify({"error": "Request body must be a JSON object with a base64-encoded 'image' field."}), 400

    # Problems with what the client sent are reported as 400.
    try:
        image_data = base64.b64decode(payload['image'])
        # convert("RGB") gives the processor 3-channel input for grayscale,
        # palette or RGBA uploads as well.
        with Image.open(BytesIO(image_data)) as uploaded:
            image = uploaded.convert("RGB")
    except Exception as e:
        return jsonify({"error": str(e)}), 400

    # Failures past this point are server-side, hence 500 rather than 400.
    try:
        inputs = processor(images=image, return_tensors="pt").pixel_values

        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)

        braille_text = text_to_braille(caption)

        return jsonify({
            "caption": caption,
            "braille": braille_text
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    # Debug mode turns on the interactive debugger, which allows code execution
    # from the browser, so it is opt-in (FLASK_DEBUG=1) instead of always on.
    # The auto-reloader is disabled because it restarts the launching process,
    # which does not work when this cell is run from a notebook kernel.
    debug_enabled = os.environ.get("FLASK_DEBUG") == "1"
    app.run(debug=debug_enabled, use_reloader=False, port=5001)


In [17]:
# SECURITY: pickle.load can execute arbitrary code while loading. Only load an
# "oo.pkl" that was produced by this project itself.
with open("oo.pkl", "rb") as f:
    data = pickle.load(f)

# Unpack all three objects. `processor` was previously left out even though the
# message below reports it as loaded and the function below uses it.
model = data["model"]
processor = data["processor"]
tokenizer = data["tokenizer"]
print("Model, processor, and tokenizer loaded successfully.")


def generate_caption_and_braille(image_path):
    """Caption one image file and print the caption with its braille rendering.

    The image is opened, converted to RGB, preprocessed, shown with
    ``image.show()`` and captioned with the module-level ``processor``,
    ``model`` and ``tokenizer``. The caption is then passed through
    ``text_to_braille``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        None. Results are printed; any exception is caught and reported as a
        printed message instead of being raised.
    """
    try:
        # convert("RGB") hands the processor 3-channel input even when the file
        # is grayscale, palette-based or has an alpha channel. The `with`
        # closes the file handle; the converted copy remains usable afterwards.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").pixel_values
        image.show()

        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)

        braille_text = text_to_braille(caption)

        print(f"Generated Caption for {os.path.basename(image_path)}: {caption}")
        print(f"Braille Output: {braille_text}")
    except Exception as e:
        print(f"Error processing the image {os.path.basename(image_path)}: {e}")

generate_caption_and_braille("imgg2.jpg")


Model, processor, and tokenizer loaded successfully.
Generated Caption for imgg2.jpg: a green book is sitting on top of a wooden table 
Braille Output: ⠁ ⠛⠗⠑⠑⠝ ⠃⠕⠕⠅ ⠊⠎ ⠎⠊⠞⠞⠊⠝⠛ ⠕⠝ ⠞⠕⠏ ⠕⠋ ⠁ ⠺⠕⠕⠙⠑⠝ ⠞⠁⠃⠇⠑ 
