In [1]:
# d1
from flask import Flask, request, jsonify, render_template
from transformers import pipeline
from PIL import Image
import os

app = Flask(__name__)

# Load pre-trained model
model_path = "./models/helmet_vit"
helmet_detector = pipeline("image-classification", model=model_path)

# Create upload folder
UPLOAD_FOLDER = "./Media"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/predict", methods=["POST"])
def predict():
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    if file.filename == "":
        return jsonify({"error": "No file selected"}), 400

    # Save the uploaded image
    file_path = os.path.join(app.config["UPLOAD_FOLDER"], file.filename)
    file.save(file_path)

    # Load and predict
    image = Image.open(file_path).convert("RGB")
    result = helmet_detector(image)

    # Delete the file after prediction
    os.remove(file_path)

    return jsonify({"prediction": result})

if __name__ == "__main__":
    app.run(debug=True)


  from .autonotebook import tqdm as notebook_tqdm





Device set to use cpu


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
#D2
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image, ImageDraw
import cv2
import numpy as np

# Load model and processor
model_path = "./models/helmet_vit"
model = ViTForImageClassification.from_pretrained(model_path, output_hidden_states=True)
image_processor = ViTImageProcessor.from_pretrained(model_path)

def detect_helmets(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    original_width, original_height = image.size
    inputs = image_processor(images=image, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract hidden states and generate bounding boxes
    hidden_states = outputs.hidden_states[-1]  # Get the last layer's hidden states
    logits = outputs.logits[0].softmax(dim=-1)  # Confidence scores for each class

    # Generate bounding boxes dynamically
    threshold = 0.5  # Confidence threshold for filtering predictions
    results = []
    for idx, logit in enumerate(logits):
        if logit.max() >= threshold:
            label = "With Helmet" if logit.argmax() == 0 else "Without Helmet"
            x1, y1, x2, y2 = np.random.randint(0, original_width // 2), np.random.randint(
                0, original_height // 2
            ), np.random.randint(
                original_width // 2, original_width
            ), np.random.randint(
                original_height // 2, original_height
            )  # Replace with proper bounding box logic based on hidden_states
            results.append({"bbox": [x1, y1, x2, y2], "label": label, "score": logit.max().item()})

    return results

def draw_bounding_boxes(image_path, predictions):
    # Open the image
    image = cv2.imread(image_path)
    for pred in predictions:
        x1, y1, x2, y2 = pred['bbox']
        label = f"{pred['label']} ({pred['score']:.2f})"

        # Draw bounding box
        cv2.rectangle(image, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)

        # Draw text
        font_scale = 0.5
        font_thickness = 1
        (text_width, text_height), baseline = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
        )
        cv2.rectangle(
            image,
            (x1, y1 - text_height - baseline),
            (x1 + text_width, y1),
            (0, 255, 0),
            thickness=cv2.FILLED,
        )
        cv2.putText(
            image,
            label,
            (x1, y1 - baseline),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 0),
            font_thickness,
        )

    return image

# Main execution
if __name__ == "__main__":
    input_image_path = "./Media/riders_1.jpg"
    output_image_path = "./output/detected_riders.jpg"

    try:
        # Detect helmets
        predictions = detect_helmets(input_image_path)

        # Draw bounding boxes
        output_image = draw_bounding_boxes(input_image_path, predictions)

        # Save and display output
        cv2.imwrite(output_image_path, output_image)
        cv2.imshow("Helmet Detection", output_image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except Exception as e:
        print(f"Error: {e}")


In [3]:
#D3
import cv2
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image
import numpy as np

# Load the ViT model and image processor
model_path = "./models/helmet_vit"
model = ViTForImageClassification.from_pretrained(model_path)
image_processor = ViTImageProcessor.from_pretrained(model_path)

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def detect_faces(image):
    """Detect faces in an image using Haar Cascade."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    return faces

def classify_helmet(image, face_coordinates):
    """Classify whether the face detected has a helmet."""
    predictions = []
    for (x, y, w, h) in face_coordinates:
        face_img = image[y:y+h, x:x+w]  # Crop the face region
        face_pil = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)).resize((224, 224))
        inputs = image_processor(images=face_pil, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits[0].softmax(dim=-1)
        label = "With Helmet" if logits.argmax() == 0 else "Without Helmet"
        score = logits.max().item()
        predictions.append({"bbox": (x, y, w, h), "label": label, "score": score})
    return predictions

def draw_results(image, results):
    """Draw bounding boxes and labels on the image."""
    for result in results:
        x, y, w, h = result['bbox']
        label = f"{result['label']} ({result['score']:.2f})"
        color = (0, 255, 0) if "With Helmet" in result['label'] else (0, 0, 255)

        # Draw bounding box
        cv2.rectangle(image, (x, y), (x+w, y+h), color, 2)

        # Draw label
        cv2.putText(image, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return image

# Main execution
if __name__ == "__main__":
    input_image_path = "./Media/riders_1.jpg"
    output_image_path = "./output/detected_faces_helmets.jpg"

    # Read the input image
    img = cv2.imread(input_image_path)

    # Detect faces
    face_boxes = detect_faces(img)

    # Classify each detected face
    results = classify_helmet(img, face_boxes)

    # Draw results
    output_image = draw_results(img, results)

    # Save and display the output
    cv2.imwrite(output_image_path, output_image)
    cv2.imshow("Helmet Detection", output_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [4]:
#D4
from mtcnn import MTCNN
import cv2
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image

# Load Vision Transformer model and image processor
model_path = "./models/helmet_vit"
model = ViTForImageClassification.from_pretrained(model_path)
image_processor = ViTImageProcessor.from_pretrained(model_path)

# Initialize MTCNN detector
mtcnn = MTCNN()

def detect_faces_with_mtcnn(image):
    """Detect faces using MTCNN."""
    results = mtcnn.detect_faces(image)
    faces = []
    for res in results:
        x, y, w, h = res['box']
        faces.append((x, y, w, h))
    return faces

def classify_helmet(image, face_coordinates):
    """Classify whether the detected face has a helmet."""
    predictions = []
    for (x, y, w, h) in face_coordinates:
        # Crop and preprocess the face region
        face_img = image[y:y+h, x:x+w]
        face_pil = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)).resize((224, 224))
        inputs = image_processor(images=face_pil, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits[0].softmax(dim=-1)
        label = "With Helmet" if logits.argmax() == 0 else "Without Helmet"
        score = logits.max().item()
        predictions.append({"bbox": (x, y, w, h), "label": label, "score": score})
    return predictions

def draw_results(image, results):
    """Draw bounding boxes and labels on the image."""
    for result in results:
        x, y, w, h = result['bbox']
        label = f"{result['label']} ({result['score']:.2f})"
        color = (0, 255, 0) if "With Helmet" in result['label'] else (0, 0, 255)

        # Draw bounding box
        cv2.rectangle(image, (x, y), (x+w, y+h), color, 2)

        # Draw label
        cv2.putText(image, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return image

# Main execution
if __name__ == "__main__":
    input_image_path = "./Media/riders_1.jpg"
    output_image_path = "./output/detected_faces_helmets.jpg"

    # Read the input image
    img = cv2.imread(input_image_path)

    # Detect faces
    face_boxes = detect_faces_with_mtcnn(img)

    # Classify each detected face
    results = classify_helmet(img, face_boxes)

    # Draw results
    output_image = draw_results(img, results)

    # Save and display the output
    cv2.imwrite(output_image_path, output_image)
    cv2.imshow("Helmet Detection", output_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()






In [None]:
#D5
import cv2
import torch
import math
from mtcnn import MTCNN  # Install with: pip install mtcnn
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image
import cvzone  # For stylish bounding boxes

# Load the Vision Transformer model
model_path = "./models/helmet_vit"
vit_model = ViTForImageClassification.from_pretrained(model_path)
image_processor = ViTImageProcessor.from_pretrained(model_path)

# Define class labels for Vision Transformer
class_labels = ['With Helmet', 'Without Helmet']

# Initialize MTCNN for face detection
mtcnn = MTCNN()

# Load the image
image_path = "Media/riders_1.jpg"
img = cv2.imread(image_path)

# Detect faces using MTCNN
detections = mtcnn.detect_faces(img)

# Function to preprocess and classify a cropped image
def classify_helmet(cropped_img):
    """Classify whether a cropped image contains a helmet."""
    # Convert the image to PIL format and resize
    face_pil = Image.fromarray(cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB)).resize((224, 224))
    inputs = image_processor(images=face_pil, return_tensors="pt")
    
    # Perform inference
    with torch.no_grad():
        outputs = vit_model(**inputs)
    logits = outputs.logits[0].softmax(dim=-1)
    
    # Get the predicted label and confidence score
    cls_idx = logits.argmax().item()
    confidence = logits[cls_idx].item()
    return class_labels[cls_idx], confidence

# Process each detected face
for detection in detections:
    box = detection['box']  # Bounding box
    x, y, w, h = box
    x, y, w, h = int(x), int(y), int(w), int(h)
    
    # Crop the face region
    face_crop = img[y:y + h, x:x + w]
    
    # Classify the cropped face
    label, confidence = classify_helmet(face_crop)
    
    # Draw bounding box and label on the image
    color = (0, 255, 0) if label == "With Helmet" else (0, 0, 255)
    cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=color)
    label_text = f"{label} {confidence:.2f}"
    cv2.putText(img, label_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Display the image with detections
cv2.imshow("Helmet Detection", img)

# Close window when 'q' button is pressed
while True:
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()


In [None]:
#D6

import cv2
from mtcnn import MTCNN  # Install with: pip install mtcnn
import cvzone  # For stylish bounding boxes

# Initialize MTCNN for face detection
mtcnn = MTCNN()

# Load the image
image_path = "Media/riders_2.jpg"  # Replace with the path to your image
img = cv2.imread(image_path)

# Detect faces using MTCNN
detections = mtcnn.detect_faces(img)

# Process each detected face
if len(detections) == 0:
    print("No faces detected.")
else:
    for i, detection in enumerate(detections):
        # Extract bounding box coordinates
        box = detection['box']  # [x, y, width, height]
        x, y, w, h = box
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers

        # Draw bounding box around the face
        cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=(0, 255, 0))

        # Add label with face index
        label_text = f"Face {i + 1}"
        cv2.putText(img, label_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Display the image with face detections
cv2.imshow("Face Detection", img)

# Wait for a key press to close the window
cv2.waitKey(0)
cv2.destroyAllWindows()


In [None]:
#D7
import cv2
from mtcnn import MTCNN  # Install with: pip install mtcnn
import cvzone  # For stylish bounding boxes

# Initialize MTCNN for face detection
mtcnn = MTCNN()

# Load the image
image_path = "Media/riders_5.jpg"  # Replace with the path to your image
img = cv2.imread(image_path)

# Detect faces using MTCNN
detections = mtcnn.detect_faces(img)

# Process each detected face
if len(detections) == 0:
    print("No faces detected.")
else:
    for i, detection in enumerate(detections):
        # Extract bounding box coordinates
        box = detection['box']  # [x, y, width, height]
        x, y, w, h = box
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers

        # Apply slight padding around the face for better visibility
        padding = 10
        x = max(x - padding, 0)
        y = max(y - padding, 0)
        w += padding * 2
        h += padding * 2

        # Draw bounding box around the face
        cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=(0, 255, 0))

        # Add confidence score and face index as a label
        confidence = detection['confidence']  # Confidence score
        label_text = f"Face {i + 1} ({confidence:.2f})"
        cv2.putText(img, label_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Display the image with face detections
cv2.imshow("Precise Face Detection", img)

# Wait for a key press to close the window
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
#D8

import cv2
import torch
from transformers import ViTImageProcessor, ViTForImageClassification
import cvzone  # For stylish bounding boxes
import numpy as np
from mtcnn import MTCNN  # Ensure you have MTCNN installed

# Initialize the MTCNN for face detection
mtcnn = MTCNN()

# Initialize the Vision Transformer (ViT) model and image processor
model_name = "google/vit-base-patch16-224-in21k"  # You can replace this with a custom-trained model for faces/helmets
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name)

# Load the image
image_path = "Media/riders_3.jpg"  # Replace with the path to your image
img = cv2.imread(image_path)

# Convert the image to RGB (ViT expects RGB)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Detect faces using MTCNN
mtcnn_detections = mtcnn.detect_faces(img)

# Initialize a blank list to collect the face/helmet boxes
all_detections = []

# Process each detected face from MTCNN
if len(mtcnn_detections) == 0:
    print("No faces detected.")
else:
    for i, detection in enumerate(mtcnn_detections):
        # Extract bounding box coordinates for face
        box = detection['box']  # [x, y, width, height]
        x, y, w, h = box
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers

        # Apply slight padding around the face for better visibility
        padding = 10
        x = max(x - padding, 0)
        y = max(y - padding, 0)
        w += padding * 2
        h += padding * 2

        # Add face detection to the list
        all_detections.append((x, y, w, h))

# Now, process detected faces/helmets with Vision Transformer
for i, (x, y, w, h) in enumerate(all_detections):
    # Crop the detected region (face or helmet)
    cropped_img = img_rgb[y:y+h, x:x+w]

    # Preprocess the cropped image for ViT model
    inputs = processor(images=cropped_img, return_tensors="pt")

    # Make prediction using ViT model
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class label
    predicted_class = logits.argmax(-1).item()

    # Assuming '0' = 'Face' and '1' = 'Helmet' in your model classes
    if predicted_class == 0:
        label = "Face"
    elif predicted_class == 1:
        label = "Helmet"
    else:
        label = "Unknown"

    # Draw bounding box and label for each detection
    cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=(0, 255, 0))
    label_text = f"{label} {i + 1}"
    cv2.putText(img, label_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Display the image with face/helmet detections
cv2.imshow("Face and Helmet Detection using ViT", img)

# Wait for a key press to close the window
cv2.waitKey(0)
cv2.destroyAllWindows()


In [None]:
#D9
import cv2
import torch
from transformers import ViTImageProcessor, ViTForImageClassification
import cvzone  # For stylish bounding boxes
import numpy as np
from mtcnn import MTCNN  # Ensure you have MTCNN installed

# Initialize the MTCNN for face detection
mtcnn = MTCNN()

# Initialize the Vision Transformer (ViT) model and image processor
# Use your custom-trained model for helmet detection
model_name = "./models/helmet_vit"  # Replace with a custom-trained model if you have one
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name)

# Load the image
image_path = "Media/riders_3.jpg"  # Replace with the path to your image
img = cv2.imread(image_path)

# Convert the image to RGB (ViT expects RGB)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Detect faces using MTCNN (detect regions that may contain a person)
mtcnn_detections = mtcnn.detect_faces(img)

# Initialize a blank list to collect the face/helmet boxes
all_detections = []

# Process each detected face from MTCNN
if len(mtcnn_detections) == 0:
    print("No faces detected.")
else:
    for i, detection in enumerate(mtcnn_detections):
        # Extract bounding box coordinates for the detected face
        box = detection['box']  # [x, y, width, height]
        x, y, w, h = box
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers

        # Apply slight padding around the face for better visibility
        padding = 10
        x = max(x - padding, 0)
        y = max(y - padding, 0)
        w += padding * 2
        h += padding * 2

        # Add face detection to the list
        all_detections.append((x, y, w, h))

# Now, process detected faces/helmet regions with Vision Transformer
for i, (x, y, w, h) in enumerate(all_detections):
    # Crop the detected region (face or helmet area)
    cropped_img = img_rgb[y:y+h, x:x+w]

    # Preprocess the cropped image for ViT model
    inputs = processor(images=cropped_img, return_tensors="pt")

    # Make prediction using ViT model
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class label
    predicted_class = logits.argmax(-1).item()

    # Assuming '0' = 'No Helmet' and '1' = 'Helmet' in your model's output
    if predicted_class == 1:
        label = "Helmet"
    else:
        label = "No Helmet"

    # Draw bounding box and label for each detection
    cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=(0, 255, 0))
    label_text = f"{label} {i + 1}"
    cv2.putText(img, label_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Display the image with helmet detection
cv2.imshow("Helmet Detection using ViT", img)

# Wait for a key press to close the window
cv2.waitKey(0)
cv2.destroyAllWindows()


In [None]:
#D10

import cv2
import torch
from transformers import ViTImageProcessor, ViTForImageClassification
import cvzone  # For stylish bounding boxes
from mtcnn import MTCNN  # Ensure you have MTCNN installed

# Initialize the MTCNN for face detection
mtcnn = MTCNN()

# Initialize the Vision Transformer (ViT) model and image processor
model_name = "google/vit-base-patch16-224-in21k"  # Replace with a custom-trained model for helmet detection
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name)

# Load the image
image_path = "Media/riders_3.jpg"  # Replace with the path to your image
img = cv2.imread(image_path)

# Convert the image to RGB (ViT expects RGB)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Detect faces using MTCNN (detect regions that may contain a person)
mtcnn_detections = mtcnn.detect_faces(img)

# Initialize a blank list to collect the face/helmet boxes
all_detections = []

# Process each detected face from MTCNN
if len(mtcnn_detections) == 0:
    print("No faces detected.")
else:
    for i, detection in enumerate(mtcnn_detections):
        # Extract bounding box coordinates for the detected face
        box = detection['box']  # [x, y, width, height]
        x, y, w, h = box
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers

        # Apply slight padding around the face for better visibility
        padding = 10
        x = max(x - padding, 0)
        y = max(y - padding, 0)
        w += padding * 2
        h += padding * 2

        # Add face detection to the list
        all_detections.append((x, y, w, h))

# Now, process detected faces/helmet regions with Vision Transformer
for i, (x, y, w, h) in enumerate(all_detections):
    # Crop the detected region (face or helmet area)
    cropped_img = img_rgb[y:y+h, x:x+w]

    # Preprocess the cropped image for ViT model
    inputs = processor(images=cropped_img, return_tensors="pt")

    # Make prediction using ViT model
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class label
    predicted_class = logits.argmax(-1).item()

    # Assuming '0' = 'No Helmet' and '1' = 'Helmet' in your model's output
    if predicted_class == 1:
        label = "Helmet"
    else:
        label = "No Helmet"

    # Draw bounding box and label for each detection
    cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=(0, 255, 0))
    label_text = f"Person {i + 1}: {label}"
    cv2.putText(img, label_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

# Display the image with helmet detection
cv2.imshow("Helmet Detection for Each Individual", img)

# Wait for a key press to close the window
cv2.waitKey(0)
cv2.destroyAllWindows()


In [None]:
#D11
import cv2
import torch
from transformers import ViTImageProcessor, ViTForImageClassification
import cvzone  # For stylish bounding boxes
from mtcnn import MTCNN  # Ensure you have MTCNN installed

# Initialize the MTCNN for face detection
mtcnn = MTCNN()

# Initialize the Vision Transformer (ViT) model and image processor
model_name = "./models/helmet_vit"  # Replace with a custom-trained model for helmet detection
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name)

# Load the image
image_path = "Media/riders_6.jpg"  # Replace with the path to your image
img = cv2.imread(image_path)

# Convert the image to RGB (ViT expects RGB)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Detect faces using MTCNN (detect regions that may contain a person)
mtcnn_detections = mtcnn.detect_faces(img)

# Initialize a blank list to collect the face/helmet boxes
all_detections = []

# Process each detected face from MTCNN
if len(mtcnn_detections) == 0:
    print("No faces detected.")
else:
    for detection in mtcnn_detections:
        # Extract bounding box coordinates for the detected face
        box = detection['box']  # [x, y, width, height]
        x, y, w, h = box
        x, y, w, h = int(x), int(y), int(w), int(h)  # Convert to integers

        # Apply slight padding around the face for better visibility
        padding = 10
        x = max(x - padding, 0)
        y = max(y - padding, 0)
        w += padding * 2
        h += padding * 2

        # Add face detection to the list
        all_detections.append((x, y, w, h))

# Now, process detected faces/helmet regions with Vision Transformer
for x, y, w, h in all_detections:
    # Crop the detected region (face or helmet area)
    cropped_img = img_rgb[y:y+h, x:x+w]

    # Preprocess the cropped image for ViT model
    inputs = processor(images=cropped_img, return_tensors="pt")

    # Make prediction using ViT model
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class label
    predicted_class = logits.argmax(-1).item()

    # Assuming '0' = 'No Helmet' and '1' = 'Helmet' in your model's output
    if predicted_class == 1:
        label = "Helmet"
        color = (0, 255, 0)  # Green for Helmet
    else:
        label = "No Helmet"
        color = (0, 0, 255)  # Red for No Helmet

    # Draw bounding box and label for each detection
    cvzone.cornerRect(img, (x, y, w, h), l=10, rt=2, colorR=color)
    cv2.putText(img, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

# Display the image with helmet detection
cv2.imshow("Helmet Detection", img)

# Wait for a key press to close the window
cv2.waitKey(0)
cv2.destroyAllWindows()
