In [2]:
import torch

# Load the model architecture
from transformers import AutoModelForImageClassification

model_checkpoint = "motheecreator/vit-Facial-Expression-Recognition"
model = AutoModelForImageClassification.from_pretrained(model_checkpoint)

# Load the weights
model.load_state_dict(torch.load("model_weightsv2.pth"))
model.eval()


  model.load_state_dict(torch.load("model_weightsv2.pth"))


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [3]:
dummy_input = torch.randn(1, 3, 224, 224)  # Batch size 1, RGB image


In [6]:

torch.onnx.export(
    model,                     # your model
    dummy_input,               # dummy input
    "model_v2.onnx",           # where to save the ONNX file
    export_params=True,        # store trained weights
    opset_version=14,          # ONNX version
    do_constant_folding=True,  # fold constants for optimization
    input_names=['input'],     # input name
    output_names=['output'],   # output name
    dynamic_axes={             # allow variable batch size
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'}
    }
)


In [7]:
import onnxruntime as ort
import numpy as np

# Load the ONNX model
ort_session = ort.InferenceSession("model_v2.onnx")

# Convert the dummy_input to numpy
onnx_input = dummy_input.numpy()

# Run inference
outputs = ort_session.run(None, {'input': onnx_input})
print("ONNX output:", outputs)



ONNX output: [array([[-0.35294837, -1.1266174 , -1.8066499 ,  1.0692861 ,  1.363068  ,
         0.20152882, -0.57816905]], dtype=float32)]


In [2]:
import cv2
import numpy as np
import time
import onnxruntime as ort
from ultralytics import YOLO
from PIL import Image
from torchvision import transforms

class EmotionDetectionSystem:
    def __init__(self, yolo_model_path, vit_onnx_path, emotion_labels):
        """
        Initialize the system with YOLO and ViT ONNX models
        """
        # Load YOLOv10 model
        self.yolo_model = YOLO(yolo_model_path)
        
        # Load ONNX model
        self.ort_session = ort.InferenceSession(vit_onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
        self.input_name = self.ort_session.get_inputs()[0].name
        
        # Emotion labels
        self.emotion_labels = emotion_labels

        # Define image transformations for ONNX input (same as ViT expected format)
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
        self.frame_count = 0
        self.start_time = time.time()
        self.fps = 0
    
    def preprocess_face(self, face_img):
        """
        Preprocess face image for ONNX model
        """
        pil_img = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB))
        img_tensor = self.transform(pil_img).unsqueeze(0).numpy().astype(np.float32)
        return img_tensor
    
    def predict_emotion(self, face_tensor):
        """
        Predict emotion using ONNX ViT model
        """
        outputs = self.ort_session.run(None, {self.input_name: face_tensor})
        logits = outputs[0]
        probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
        pred_idx = np.argmax(probabilities)
        emotion = self.emotion_labels[pred_idx]
        confidence = probabilities[0][pred_idx]
        return emotion, confidence
    
    def process_frame(self, frame):
        results = self.yolo_model(frame)
        detections = results[0].boxes
        
        self.frame_count += 1
        elapsed_time = time.time() - self.start_time
        if elapsed_time > 1.0:
            self.fps = self.frame_count / elapsed_time
            self.frame_count = 0
            self.start_time = time.time()
        
        cv2.putText(frame, f"FPS: {self.fps:.1f}", (10, 30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        
        for det in detections:
            x1, y1, x2, y2 = map(int, det.xyxy[0])
            face_img = frame[y1:y2, x1:x2]
            if face_img.size == 0 or face_img.shape[0] < 20 or face_img.shape[1] < 20:
                continue
            try:
                face_tensor = self.preprocess_face(face_img)
                emotion, confidence = self.predict_emotion(face_tensor)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 2)
                label = f"{emotion} ({confidence*100:.1f}%)"
                (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
                text_x = x1 + (x2 - x1 - w) // 2
                cv2.putText(frame, label, (text_x, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
            except Exception as e:
                print(f"Error processing face: {e}")
                continue
        
        return frame
    
    def run_webcam(self, camera_id=0):
        cap = cv2.VideoCapture(camera_id)
        if not cap.isOpened():
            print("Error: Could not open webcam")
            return
        print("Press 'q' to quit")
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            processed_frame = self.process_frame(frame)
            cv2.imshow('Emotion Detection', processed_frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        cap.release()
        cv2.destroyAllWindows()
if __name__ == "__main__":
    yolo_model_path = r"C:\Users\hari\runs\detect\train\weights\best.pt"
    vit_onnx_path = "model_v2.onnx"  # <-- your ONNX ViT model here

    emotion_labels = ["angry", "disgust", "fear", "happy", "neutral", "sad", "surprised"]

    emotion_system = EmotionDetectionSystem(
        yolo_model_path=yolo_model_path,
        vit_onnx_path=vit_onnx_path,
        emotion_labels=emotion_labels
    )

    emotion_system.run_webcam()


Press 'q' to quit

0: 480x640 1 face, 71.4ms
Speed: 2.4ms preprocess, 71.4ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 74.6ms
Speed: 2.5ms preprocess, 74.6ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 81.2ms
Speed: 2.6ms preprocess, 81.2ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 76.5ms
Speed: 1.9ms preprocess, 76.5ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 72.6ms
Speed: 1.9ms preprocess, 72.6ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 67.7ms
Speed: 1.7ms preprocess, 67.7ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 65.3ms
Speed: 1.8ms preprocess, 65.3ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 face, 64.4ms
Speed: 1.7ms preprocess, 64.4ms inference, 0.3ms postprocess per image at shape (1, 3, 