In [1]:
import torch
import torch.nn as nn 
from transformers import BlipProcessor, BlipForConditionalGeneration, BertTokenizer, BertForQuestionAnswering
from PIL import Image

In [2]:
# Load the pretrained YOLOv5-small detector from PyTorch Hub
# (served from the local hub cache once it has been downloaded).
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Single source of truth for the input image: the captioning cell below
# also reads `img_path`, so detection must use the same variable rather
# than a duplicated string literal.
img_path = "maxresdefault.jpg"
results = model(img_path)

# Keep only the class-name column of the detections for the first (only) image.
detected_objects = results.pandas().xyxy[0]['name'].tolist()
print("Detected Objects:", detected_objects)

Using cache found in /home/naren/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-9-11 Python-3.12.4 torch-2.4.0+cu121 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Detected Objects: ['horse']


  with amp.autocast(autocast):


In [3]:
# Load the BLIP captioning processor (image preprocessing + token decoding)
# and the matching conditional-generation model.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Open the file through a context manager so the handle is released, and
# convert to RGB so palette / RGBA / grayscale files are fed to the processor
# as 3-channel images. `convert` returns an in-memory copy, so `img` stays
# usable after the file is closed.
with Image.open(img_path) as image_file:
    img = image_file.convert("RGB")

inputs = processor(img, return_tensors="pt")
outputs = model.generate(**inputs)

# Decode the generated token ids of the first (only) sequence into text
caption = processor.decode(outputs[0], skip_special_tokens=True)
print("Generated Caption:", caption)

Generated Caption: a black horse running through a field


In [4]:
# Load a BERT model fine-tuned on SQuAD for extractive question answering.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Example Query
query = "What the horse doing?"

# Tokenize as a (question, context) pair: the query comes first and the
# caption produced by the captioning cell is the context to search.
# Calling the tokenizer directly replaces the deprecated `encode_plus`.
inputs = tokenizer(query, caption, return_tensors="pt")

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    start_scores, end_scores = model(**inputs, return_dict=False)

# Highest-scoring start/end token positions; +1 because the slice end is exclusive.
# If the predicted end precedes the start, the slice is empty and the answer is "".
start_idx = int(torch.argmax(start_scores))
end_idx = int(torch.argmax(end_scores)) + 1

# Decode the answer span back into a plain string
answer_ids = inputs["input_ids"][0][start_idx:end_idx].tolist()
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
print("Answer:", answer)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: running


In [None]:
import os
import tempfile

import torch
from flask import Flask, request, jsonify, render_template
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Hugging Face checkpoint shared by the BLIP processor and model.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Flask application object; routes below are registered on it.
app = Flask(__name__)

# Object detector: YOLOv5-small fetched through PyTorch Hub.
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Image captioner: BLIP processor (pre/post-processing) plus generation model.
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Landing page: renders the HTML front end
@app.route('/')
def index():
    """Render and return the `index.html` template for the root URL."""
    page = render_template('index.html')
    return page

# Endpoint to handle image upload and caption generation
@app.route('/generate_caption', methods=['POST'])
def generate_caption():
    """Detect objects in an uploaded image and generate a caption for it.

    Expects a multipart POST with the file under the ``image`` form field.

    Returns:
        JSON ``{'detected_objects': [...], 'caption': '...'}`` on success.
        JSON ``{'error': '...'}`` with HTTP 400 when no file was sent or the
        upload cannot be decoded as an image.
    """
    image = request.files.get('image')
    # A form submitted without choosing a file yields a part with an empty filename.
    if image is None or not image.filename:
        return jsonify({'error': 'No image uploaded'}), 400

    # Store the upload under a server-chosen name in a private temp directory:
    # the client-supplied filename never reaches the filesystem (no path
    # traversal), no pre-existing 'uploads' folder is required, and the file
    # is removed even if one of the models raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, 'upload')
        image.save(image_path)

        # Validate and decode up front. The context manager closes the file
        # handle; `convert` returns an in-memory RGB copy for the captioner.
        try:
            with Image.open(image_path) as img:
                rgb_img = img.convert('RGB')
        except OSError:
            # Pillow's UnidentifiedImageError is an OSError subclass.
            return jsonify({'error': 'Uploaded file is not a valid image'}), 400

        # Perform object detection using YOLOv5
        results = yolo_model(image_path)
        detected_objects = results.pandas().xyxy[0]['name'].tolist()

        # Perform image captioning using BLIP
        inputs = processor(rgb_img, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        caption = processor.decode(outputs[0], skip_special_tokens=True)

    return jsonify({'detected_objects': detected_objects, 'caption': caption})

if __name__ == '__main__':
    # Ensure 'uploads' directory exists
    os.makedirs('uploads', exist_ok=True)
    # debug=True also turns on the auto-reloader by default, which restarts
    # the launching process in a child; that restart does not work when the
    # host process is a Jupyter kernel. Keep the debugger, disable the reloader.
    app.run(debug=True, use_reloader=False)
