In [93]:
import os
import torch
from torchvision import models, transforms
from PIL import Image
import requests
import json
from io import BytesIO
from tqdm import tqdm

In [94]:

class ObjectDetection:
    """Detect objects in an image with torchvision's pretrained Faster R-CNN.

    Detections whose score reaches ``threshold`` are returned as plain dicts
    (bounding box plus category name) that can be serialised with ``json``.
    """

    # Category names indexed by the integer label the model predicts.
    # 'N/A' entries are ids that have no name in this table.
    COCO_INSTANCE_CATEGORY_NAMES = [
        '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
        'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
        'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
        'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork',
        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
        'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
        'bed', 'dining table', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
        'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush'
    ]

    def __init__(self, threshold=0.8):
        """Load the pretrained detector and put it in inference mode.

        Args:
            threshold: Minimum score a detection needs to be reported.
        """
        self.threshold = threshold
        # NOTE(review): newer torchvision releases prefer a `weights=` argument
        # over `pretrained=True`; left unchanged so the required torchvision
        # version stays the same - confirm against the installed version.
        self.model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.model.eval()
        self.transform = transforms.Compose([
            transforms.ToTensor()
        ])

    def load_image(self, image_path):
        """Open an image from a local path or an http(s) URL as an RGB PIL image.

        Args:
            image_path: File system path, or a URL starting with ``http://`` or
                ``https://``.

        Returns:
            The image converted to RGB mode.

        Raises:
            requests.HTTPError: If the URL answers with an error status.
        """
        # Match the full scheme so a local file such as "http_cat.jpg" is not
        # mistaken for a URL.
        if image_path.startswith(('http://', 'https://')):
            # The timeout keeps a stalled server from hanging the whole batch.
            response = requests.get(image_path, timeout=30)
            # Fail with a clear HTTP error instead of letting PIL choke on an
            # error page.
            response.raise_for_status()
            source = BytesIO(response.content)
        else:
            source = image_path
        # convert() returns a fully loaded image, so the source can be closed
        # as soon as the conversion is done.
        with Image.open(source) as img:
            return img.convert("RGB")

    def transform_image(self, image):
        """Apply the preprocessing pipeline built in ``__init__`` to ``image``."""
        return self.transform(image)

    def detect_objects(self, image_path):
        """Run the detector on one image and describe every confident detection.

        Args:
            image_path: Local path or http(s) URL of the image.

        Returns:
            A list of dicts shaped like
            ``{"bb": {"topLeft": {"x", "y"}, "size": {"width", "height"}}, "name"}``.
            When nothing reaches the threshold, the list holds a single
            placeholder entry with a zero-sized box and the name ``"None"``.
        """
        image = self.load_image(image_path)
        # The model takes a batch, so add a leading batch dimension of size 1.
        image_tensor = self.transform_image(image).unsqueeze(0)

        with torch.no_grad():
            predictions = self.model(image_tensor)[0]

        names = self.COCO_INSTANCE_CATEGORY_NAMES
        detected_objects = []
        for label, score, box in zip(predictions['labels'], predictions['scores'], predictions['boxes']):
            if score >= self.threshold:
                label_id = int(label)
                # Explicit bounds check: ids outside the table (including
                # negative ones, which would otherwise wrap around) map to
                # "others".
                name = names[label_id] if 0 <= label_id < len(names) else "others"
                # The box holds two corners (box[0], box[1]) and (box[2], box[3]);
                # the output format wants a corner plus width/height.
                detected_objects.append({
                    "bb": {
                        "topLeft": {
                            "x": box[0].item(),
                            "y": box[1].item()
                        },
                        "size": {
                            "width": (box[2] - box[0]).item(),
                            "height": (box[3] - box[1]).item()
                        }
                    },
                    "name": name
                })

        if not detected_objects:
            # Keep the output non-empty: emit one zero-sized placeholder entry.
            return [
                {
                    "bb": {
                        "topLeft": {
                            "x": 0,
                            "y": 0
                        },
                        "size": {
                            "width": 0,
                            "height": 0
                        }
                    },
                    "name": "None"
                }
            ]
        return detected_objects
    
    



In [95]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

class ImageCaptioning:
    """Generate a one-sentence caption for an image.

    Wraps the "nlpconnect/vit-gpt2-image-captioning" checkpoint: a feature
    extractor that prepares pixel values, an encoder-decoder model that
    generates token ids, and a tokenizer that decodes them to text.
    """

    def __init__(self):
        """Load the model, feature extractor and tokenizer for the checkpoint."""
        self.model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        self.feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        self.tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    def _load_image(self, image_path):
        """Open ``image_path`` and return it as an RGB PIL image."""
        # Normalise to three channels, as ObjectDetection.load_image does, so
        # grayscale / RGBA / palette files reach the feature extractor in the
        # same form as ordinary RGB ones. convert() yields a loaded image, so
        # the file can be closed right away.
        with Image.open(image_path) as img:
            return img.convert("RGB")

    def generate_caption(self, image_path):
        """Return the caption the model generates for the image at ``image_path``.

        Args:
            image_path: Path of the image file to describe.

        Returns:
            The decoded caption with special tokens removed.
        """
        image = self._load_image(image_path)
        pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
        output_ids = self.model.generate(pixel_values)
        # generate() returns one id sequence per input image; only one image
        # is passed in, so decode the first sequence.
        caption = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return caption


In [96]:
# Build both models once for the whole notebook; detect_and_save_to_jsonl
# reads these two module-level names directly.
# ObjectDetection() uses its default score threshold of 0.8.
object_detector = ObjectDetection()
image_captioner = ImageCaptioning()

In [97]:
def detect_and_save_to_jsonl(images, output_file):
    """Write one JSON line per image holding its detections and its caption.

    Relies on the notebook-level ``object_detector`` and ``image_captioner``
    instances. Records are written in the order of ``images`` and contain no
    image identifier, so line N of the file belongs to ``images[N]``.

    Args:
        images: Iterable of image paths to process.
        output_file: Path of the JSON Lines file to create (overwritten if it
            already exists).
    """
    with open(output_file, 'w') as sink:
        for image_path in tqdm(images):
            # Detection runs before captioning; the key order matches too.
            record = {
                'objects': object_detector.detect_objects(image_path),
                'description': image_captioner.generate_caption(image_path),
            }
            sink.write(json.dumps(record) + '\n')

In [101]:
# Folder holding the images to process.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
image_dir = 'E:\\Hackathons\\TopCoder\\AI-Powered Image Classification\\provisional\\provisional'
# os.listdir returns entries in arbitrary order. The JSONL written later has no
# file-name field, so line order is the only link between a record and its
# image; sorting makes that order reproducible from run to run.
images = sorted(os.listdir(image_dir))

In [102]:
# Turn the bare file names into full paths. Taking the basename first makes
# this cell safe to re-run: an entry that already carries the directory prefix
# is reduced to its file name again instead of being prefixed a second time.
# (image_dir is a Windows path; os.path splits on backslashes on Windows.)
images = [os.path.join(image_dir, os.path.basename(entry)) for entry in images]

In [103]:
# Destination folder for the submission. The trailing separator is required
# because the file name is appended by plain string concatenation.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_path = "E:\\Hackathons\\TopCoder\\AI-Powered Image Classification\\submission\\solution\\"
detect_and_save_to_jsonl(
    images=images,
    output_file=output_path + 'submission.jsonl',
)

100%|██████████| 40/40 [04:53<00:00,  7.33s/it]
