In [None]:
#To generate live captions from a webcam feed using PyTorch, you would need to perform real-time image processing and inference. Here's a basic outline of the steps involved in implementing this functionality:

# 1. Import the necessary libraries:

import cv2
import torch
from torchvision import transforms
from model import ImageCaptioningModel  # Your image captioning model


# 2. Load the pre-trained image captioning model:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImageCaptioningModel().to(device)
# map_location remaps tensors that were saved on a GPU onto the device chosen
# above, so the checkpoint also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load("path_to_model_weights", map_location=device))
model.eval()  # switch the model to evaluation (inference) mode


# 3. Set up the webcam capture:

cap = cv2.VideoCapture(0)  # Use the appropriate index if you have multiple cameras
if not cap.isOpened():
    # Fail fast with a clear message instead of silently skipping the capture loop.
    raise RuntimeError("Could not open webcam (index 0)")
# Request a 640x480 stream; the camera backend may not honor the request.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)


# 4. Define image transformations to preprocess the webcam frames:

# ndarray -> PIL image -> 224x224 -> float tensor -> per-channel normalization.
# The mean/std values are the commonly used ImageNet statistics, listed in RGB
# order, so the array handed to this pipeline is expected to be RGB.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


# 5. Create a function to process the webcam frames and generate captions:

def generate_caption(frame):
    """Generate a caption string for a single webcam frame.

    Args:
        frame: Image as returned by ``cv2.VideoCapture.read`` -- an
            H x W x 3 array in OpenCV's BGR channel order.

    Returns:
        The model outputs joined with single spaces.
    """
    # OpenCV delivers BGR, while ToPILImage interprets the array as RGB and the
    # Normalize statistics are listed in RGB order. Convert first so the model
    # does not see swapped red/blue channels.
    # NOTE(review): assumes the model was trained on RGB input -- confirm.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = transform(rgb_frame).unsqueeze(0).to(device)  # add a batch dimension
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(image)
    caption = " ".join(outputs)  # Modify based on the output format of your model
    return caption


# 6. Start the main loop to capture frames, process them, and display the captions:

# The try/finally guarantees the camera and the preview window are released
# even when the loop is left through an exception or a kernel interrupt.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame delivered (camera unplugged or stream ended).
            break

        # Preprocess the frame and generate a caption
        caption = generate_caption(frame)

        # Display the caption on the frame
        cv2.putText(frame, caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Show the frame
        cv2.imshow("Webcam", frame)

        # Exit the loop on 'q' key press. Mask to the low byte because some
        # OpenCV builds return extra modifier bits in the key code.
        if (cv2.waitKey(1) & 0xFF) == ord('q'):
            break
finally:
    # Release the capture and close the window
    cap.release()
    cv2.destroyAllWindows()


#Remember to replace `'path_to_model_weights'` in step 2 with the actual path to your pre-trained model weights. 
#Also, modify the `generate_caption()` function in step 5 to process the model outputs and create the desired caption format based on the specific implementation of your image captioning model.

In [None]:
# To stream the video with captions generated by the previous code to OBS Studio, you can make use of the OBS WebSocket API. Here's an example Python script that establishes a WebSocket connection with OBS Studio and sends the captioned frames for streaming:


import cv2
import time
import websocket
#from obswebsocket import obsws, events
import base64
import json
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import torch
from torchvision import transforms
from model import ImageCaptioningModel  # Your image captioning model

# OBS WebSocket configuration
OBS_HOST = "localhost"
OBS_PORT = 4444
OBS_PASSWORD = "your_password"  # Set your OBS password here (do not commit a real password)

# Image captioning model configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_WEIGHTS_PATH = "path_to_model_weights"  # Set the path to your model weights
FONT_PATH = "path_to_your_font_file"  # Set the path to a font file for caption display

# Load the pre-trained image captioning model
model = ImageCaptioningModel().to(DEVICE)
# map_location remaps tensors that were saved on a GPU onto DEVICE, so the
# checkpoint also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load(MODEL_WEIGHTS_PATH, map_location=DEVICE))
model.eval()  # switch the model to evaluation (inference) mode

# Set up image transformations
# ndarray -> PIL image -> 224x224 -> float tensor -> per-channel normalization
# (the mean/std values are the commonly used ImageNet statistics, RGB order).
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load font for caption display
font = ImageFont.truetype(FONT_PATH, size=20)

# Initialize OBS WebSocket connection
# Done last so that a failure while loading the weights or the font above does
# not leave an open socket behind.
# NOTE(review): `password` is not a documented option of websocket-client's
# WebSocket.connect(); it looks like no authentication is performed here.
# Confirm against the obs-websocket protocol version you are running.
ws = websocket.WebSocket()
ws.connect(f"ws://{OBS_HOST}:{OBS_PORT}", password=OBS_PASSWORD)

# Define a function to process the webcam frames and generate captions
def generate_caption(frame):
    """Generate a caption string for a single webcam frame.

    Args:
        frame: Image as returned by ``cv2.VideoCapture.read`` -- an
            H x W x 3 array in OpenCV's BGR channel order.

    Returns:
        The model outputs joined with single spaces.
    """
    # OpenCV delivers BGR, while ToPILImage interprets the array as RGB and the
    # Normalize statistics are listed in RGB order. Convert first so the model
    # does not see swapped red/blue channels.
    # NOTE(review): assumes the model was trained on RGB input -- confirm.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = transform(rgb_frame).unsqueeze(0).to(DEVICE)  # add a batch dimension
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(image)
    caption = " ".join(outputs)  # Modify based on the output format of your model
    return caption

# Start capturing video from webcam
cap = cv2.VideoCapture(0)  # Use the appropriate index if you have multiple cameras
if not cap.isOpened():
    # Fail fast with a clear message; close the already-open OBS socket first
    # so it is not leaked when this cell aborts.
    ws.close()
    raise RuntimeError("Could not open webcam (index 0)")
# Request a 640x480 stream; the camera backend may not honor the request.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

'''
# 6. Connect to the OBS WebSocket server:

ws = obsws("localhost", 4444)  # Replace with your OBS WebSocket server address
ws.connect()
'''

# Start the main loop
# This loop only ends by itself when the camera stops delivering frames, so it
# is normally stopped by interrupting the kernel. The try/finally makes sure
# the camera and the WebSocket are released in that case as well.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Generate caption for the frame
        caption = generate_caption(frame)

        # Draw the caption on the frame (PIL works in RGB, OpenCV frames are BGR)
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        draw = ImageDraw.Draw(img)
        draw.text((10, 10), caption, fill=(255, 255, 255), font=font)

        # Convert the image to base64 for sending via WebSocket
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

        # Send the frame with caption to OBS Studio for streaming
        # NOTE(review): "SetImageSettings" with these fields could not be matched
        # to a documented obs-websocket request -- verify the request type and
        # payload against the protocol version of your OBS installation.
        data = {
            "request-type": "SetImageSettings",
            "source": "Webcam",  # Set the name of your OBS webcam source
            "width": img.width,
            "height": img.height,
            "mimeType": "image/jpeg",
            "base64ImageData": img_data
        }
        ws.send(json.dumps(data))

        # Wait for a short interval
        time.sleep(0.1)
finally:
    # Release the capture and close the WebSocket connection
    cap.release()
    ws.close()

'''
Before running the code, make sure to replace the following placeholders:

- `OBS_HOST`: Set the host where OBS Studio is running (usually "localhost" if running on the same machine).
- `OBS_PORT`: Set the port on which OBS WebSocket is configured to run (default is 4444).
- `OBS_PASSWORD`: Set the password you configured in OBS WebSocket settings.
- `MODEL_WEIGHTS_PATH`: Set the path to your pre-trained model weights file.
- `FONT_PATH`: Set the path to a font file that will be used for displaying captions on the frames.
- `source`: Set the name of your OBS webcam source (the name you gave to the source in OBS Studio).

After setting the appropriate values, you can run the script, and it will continuously stream the webcam feed with generated captions to OBS Studio.
'''


In [None]:
# To send the output of the previous code to OBS Studio, you can use the OBS WebSockets plugin, which provides a WebSocket server that allows you to control OBS Studio programmatically. Here's an example of how you can modify the previous code to send the generated captions to OBS Studio:

# 1. Install the `obs-websocket-py` library:

# pip install obs-websocket-py


# 2. Import the necessary libraries:

import cv2
import torch
from torchvision import transforms
from obswebsocket import obsws, events, requests
from model import ImageCaptioningModel  # Your image captioning model


# 3. Load the pre-trained image captioning model (same as before):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImageCaptioningModel().to(device)
# map_location remaps tensors that were saved on a GPU onto the device chosen
# above, so the checkpoint also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load("path_to_model_weights", map_location=device))
model.eval()  # switch the model to evaluation (inference) mode


# 4. Set up the webcam capture (same as before):

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Fail fast with a clear message instead of silently skipping the capture loop.
    raise RuntimeError("Could not open webcam (index 0)")
# Request a 640x480 stream; the camera backend may not honor the request.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)


# 5. Define image transformations (same as before):

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


# 5b. Create a function to process the webcam frames and generate captions.
# The main loop of this cell calls generate_caption(); defining it here keeps
# the cell runnable without relying on an earlier cell having been executed.

def generate_caption(frame):
    """Generate a caption string for a single webcam frame.

    Args:
        frame: Image as returned by ``cv2.VideoCapture.read`` -- an
            H x W x 3 array in OpenCV's BGR channel order.

    Returns:
        The model outputs joined with single spaces.
    """
    # OpenCV delivers BGR, while ToPILImage interprets the array as RGB.
    # NOTE(review): assumes the model was trained on RGB input -- confirm.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = transform(rgb_frame).unsqueeze(0).to(device)  # add a batch dimension
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(image)
    caption = " ".join(outputs)  # Modify based on the output format of your model
    return caption


# 6. Connect to the OBS WebSocket server:

ws = obsws("localhost", 4444)  # Replace with your OBS WebSocket server address
try:
    ws.connect()
except Exception:
    # Do not keep the camera locked when OBS cannot be reached.
    cap.release()
    raise


# 7. Create a function to send the captions to OBS Studio:

def send_to_obs(caption, source_name="caption_source_name"):
    """Update an OBS text source so that it displays ``caption``.

    Args:
        caption: Text to show in OBS.
        source_name: Name of the OBS text source to update. Defaults to the
            placeholder ``"caption_source_name"``; pass the real source name
            instead of editing this function.

    Note:
        ``requests`` here is the request module of ``obswebsocket`` (not the
        HTTP library) and ``ws`` is the connected ``obsws`` client.
        NOTE(review): ``SetTextGDIPlusProperties`` presumably targets the
        "Text (GDI+)" source type -- confirm it matches the source used in OBS.
    """
    ws.call(requests.SetTextGDIPlusProperties(source_name, text=caption))

# Replace `"caption_source_name"` with the actual name of the text source in OBS Studio.

# 8. Start the main loop:

# The try/finally guarantees that the camera, the preview window and the OBS
# connection are released even when the loop is left through an exception or a
# kernel interrupt.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame delivered (camera unplugged or stream ended).
            break

        # Preprocess the frame and generate a caption
        caption = generate_caption(frame)

        # Send the caption to OBS Studio
        send_to_obs(caption)

        # Display the caption on the frame (optional)
        cv2.putText(frame, caption, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Show the frame
        cv2.imshow("Webcam", frame)

        # Exit the loop on 'q' key press. Mask to the low byte because some
        # OpenCV builds return extra modifier bits in the key code.
        if (cv2.waitKey(1) & 0xFF) == ord('q'):
            break
finally:
    # Release the capture, close the window, and disconnect from OBS
    cap.release()
    cv2.destroyAllWindows()
    ws.disconnect()


# Remember to replace `"localhost"` in step 6 with the actual address of your OBS WebSocket server. Also, modify `"caption_source_name"` in step 7 with the name of the text source you want to update in OBS Studio.