In [1]:
!pip install git+https://github.com/Breakthrough/PySceneDetect.git
!pip install scenedetect
!pip install scenedetect
!pip install deepface
!pip install torch torchvision torchaudio
!pip install transformers
!pip install opencv-python-headless
!pip install ultralytics
!pip install matplotlib
!pip install fastapi uvicorn python-multipart pyngrok nest-asyncio
!pip install torch transformers ultralytics deepface
!apt-get install -y libgl1-mesa-glx  # For OpenCV

Collecting git+https://github.com/Breakthrough/PySceneDetect.git
  Cloning https://github.com/Breakthrough/PySceneDetect.git to /tmp/pip-req-build-6s0sr2va
  Running command git clone --filter=blob:none --quiet https://github.com/Breakthrough/PySceneDetect.git /tmp/pip-req-build-6s0sr2va
  Resolved https://github.com/Breakthrough/PySceneDetect.git to commit 34dffabd8666bf4cbb94ff1995f74fcf593eb368
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scenedetect
  Building wheel for scenedetect (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scenedetect: filename=scenedetect-0.7.dev0-py3-none-any.whl size=121473 sha256=54b552207c1f7d9cee52ad82454d0d666a87566a72f82edcb85a11208da872ea
  Stored in directory: /tmp/pip-ephem-wheel-cache-53z6gurn/wheels/16/fe/e4/1d9a15143d8c0e6e7262c7167a904db8368d292e2b6ef6b91f
Successfully bu

In [2]:
# Required packages are installed by the previous cell.


# Import libraries
from fastapi import FastAPI, UploadFile, File, Request, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
import torch
import requests
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from ultralytics import YOLO
from deepface import DeepFace
import numpy as np
import time
import collections
import os
import uvicorn
from pyngrok import ngrok
import nest_asyncio
import shutil
from pathlib import Path
from fastapi.responses import StreamingResponse
# Initialize FastAPI app
app = FastAPI()

# Allow CORS for frontend development.
# NOTE(review): wildcard origins combined with credentials is very permissive;
# restrict `allow_origins` before exposing this beyond a development tunnel.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Project root on Google Drive; holds index.html and the static/ tree.
drive_path = "/content/drive/MyDrive/Project"

# Create the uploads directory (and therefore its parent static/ directory)
# BEFORE mounting: StaticFiles checks that its directory exists when it is
# constructed, so mounting first fails on a fresh project folder.
os.makedirs(f"{drive_path}/static/uploads", exist_ok=True)

# Mount static files from Google Drive
app.mount("/static", StaticFiles(directory=f"{drive_path}/static"), name="static")
templates = Jinja2Templates(directory=drive_path)

# 🔑 OpenRouter API key, read from the environment so that no credential is
# stored in the notebook. Set OPENROUTER_API_KEY before running this cell.
# Any key that was previously committed in this file should be revoked.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
if not API_KEY:
    print("⚠️ OPENROUTER_API_KEY is not set; OpenRouter requests will fail.")

# ✅ Global Model Initialization
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16)
blip_device = "cuda" if torch.cuda.is_available() else "cpu"
blip_model.to(blip_device)

# Two separate YOLO instances of the same weights: one for per-image/per-frame
# detection and one used with `track(..., persist=True)`.
yolo_model_large = YOLO("yolov8n.pt")
yolo_model_small = YOLO("yolov8n.pt")

def get_task_from_prompt(prompt):
    """Classify a user prompt into one of the supported task labels.

    Sends the prompt to the OpenRouter chat-completions endpoint with a system
    message instructing the model to answer with exactly one task name.

    Args:
        prompt: The user's request text.

    Returns:
        The model's reply, stripped and lower-cased, or the sentinel string
        ``"error"`` when the request fails or the reply cannot be parsed.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    system_msg = (
        "You're a strict classifier. ONLY respond with one of the following exact words:\n"
        "image_captioning, scene_description, object_detection, object_counting, "
        "question_answering, emotion_detection, video_summarization, real_time_tracking.\n"
        "No explanation. No extra text."
    )
    data = {
        "model": "deepseek/deepseek-chat",
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0
    }
    try:
        # A timeout keeps a stalled upstream call from hanging the request forever.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30,
        )
        response.raise_for_status()
        result = response.json()
        return result['choices'][0]['message']['content'].strip().lower()
    except (requests.RequestException, KeyError, IndexError, TypeError,
            ValueError, AttributeError):
        # Network/HTTP failure, non-JSON body, or an unexpected payload shape.
        return "error"

# 🖊️ Generic Chat

def get_small_talk_reply(prompt):
    """Get a free-form assistant reply for ``prompt`` from OpenRouter.

    Args:
        prompt: The text to send as the user message.

    Returns:
        The reply text with surrounding whitespace removed, or the fixed
        string ``"❌ Error getting reply."`` when the request fails or the
        response cannot be parsed.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "deepseek/deepseek-chat",
        "messages": [
            {"role": "system", "content": "You're a helpful and friendly assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }
    try:
        # A timeout keeps a stalled upstream call from hanging the request forever.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=60,
        )
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content'].strip()
    except (requests.RequestException, KeyError, IndexError, TypeError,
            ValueError, AttributeError):
        # Network/HTTP failure, non-JSON body, or an unexpected payload shape.
        return "❌ Error getting reply."

# 🏷️ Image Captioning
def caption_image(image_path):
    """Generate a caption for the image at ``image_path`` with the BLIP model.

    The image is loaded as RGB, preprocessed by ``blip_processor``, moved to
    ``blip_device`` as float16, and decoded from at most 50 newly generated
    tokens. Returns the first decoded caption with whitespace stripped.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = blip_processor(images=rgb_image, return_tensors="pt")
    model_inputs = model_inputs.to(blip_device, torch.float16)
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=50)
    captions = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
    return captions[0].strip()

# 📦 Object Detection
def detect_objects(image_path):
    """Run YOLO on an image and save an annotated copy to the uploads folder.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with:
          - ``"objects"``: sorted list of the distinct class names detected.
          - ``"annotated_image"``: URL path (under ``/static/uploads``) of the
            saved image with detections drawn on it.
    """
    detection = yolo_model_large(image_path)[0]
    class_names = detection.names
    class_ids = detection.boxes.cls.tolist()
    # Sorted so the response is deterministic (set iteration order is not).
    detected_objects = sorted({class_names[int(cls_id)] for cls_id in class_ids})

    # `plot()` returns a BGR array, while PIL interprets arrays as RGB;
    # convert first so the saved image does not have red and blue swapped.
    annotated_bgr = detection.plot()
    annotated_rgb = cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB)

    annotated_name = f"annotated_{Path(image_path).name}"
    annotated_path = f"{drive_path}/static/uploads/{annotated_name}"
    Image.fromarray(annotated_rgb).save(annotated_path)

    return {
        "objects": detected_objects,
        "annotated_image": f"/static/uploads/{annotated_name}"
    }

# 🔢 Object Counting
def count_objects(image_path):
    """Count YOLO detections per class name for the image at ``image_path``.

    Returns a plain dict mapping each detected class name to the number of
    boxes with that class, in order of first appearance.
    """
    detection = yolo_model_large(image_path)[0]
    id_to_label = detection.names
    tally = collections.Counter(
        id_to_label[int(raw_id)] for raw_id in detection.boxes.cls.tolist()
    )
    return dict(tally)

# 😊 Emotion Detection
def detect_emotion_np(image_np):
    """Return the dominant emotion of each face DeepFace finds in an image.

    Args:
        image_np: Image passed to ``DeepFace.analyze`` as ``img_path``
            (callers in this file pass arrays read with OpenCV).

    Returns:
        A list of dominant-emotion labels, one per analysed face. Returns an
        empty list when analysis fails, so callers can treat emotion
        detection as best-effort.
    """
    try:
        result = DeepFace.analyze(img_path=image_np, actions=["emotion"], enforce_detection=False)
        # DeepFace returns either a single dict or a list of dicts.
        if isinstance(result, list):
            return [face["dominant_emotion"] for face in result]
        return [result['dominant_emotion']]
    except Exception:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt /
        # SystemExit the way a bare `except:` would.
        return []

# 📝 Scene Description
def describe_caption_with_deepseek(caption, objects):
    """Ask the LLM for a scene description built from a caption and object labels.

    Args:
        caption: Caption text for the image.
        objects: Detected object labels. Accepts a list of labels, the dict
            returned by ``detect_objects`` (its ``"objects"`` entry is used),
            a single string, or ``None``.

    Returns:
        The generated description, or the fixed string
        ``"❌ Error generating scene description."`` on failure.
    """
    # Normalise `objects`: joining the detect_objects() dict directly would
    # join its keys, and joining None would raise.
    if isinstance(objects, dict):
        labels = objects.get("objects") or []
    elif objects is None:
        labels = []
    elif isinstance(objects, str):
        labels = [objects]
    else:
        labels = list(objects)
    object_text = ', '.join(str(label) for label in labels)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    prompt = f"Describe the scene based on the following details:\nCaption: {caption}\nDetected Objects: {object_text}"
    data = {
        "model": "deepseek/deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7
    }
    try:
        # A timeout keeps a stalled upstream call from hanging the request forever.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=60,
        )
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content'].strip()
    except (requests.RequestException, KeyError, IndexError, TypeError,
            ValueError, AttributeError):
        # Network/HTTP failure, non-JSON body, or an unexpected payload shape.
        return "❌ Error generating scene description."

# 🎩 Video Summarization
def summarize_video(video_path):
    """Yield progress messages and a final LLM summary for a video.

    Every 10th frame is run through YOLO and emotion detection; the
    aggregated object and emotion counts are then sent to
    ``get_small_talk_reply`` for a textual summary.

    Args:
        video_path: Path of the video file to read with OpenCV.

    Yields:
        Strings of the form ``data: <message>`` followed by a blank line:
        periodic progress updates, then a completion message, then the
        summary (prefixed with 📄). If the video cannot be opened, a single
        error message is yielded instead.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        yield f"data: ❌ Could not open video.\n\n"
        return

    detected = collections.Counter()
    emotions = []

    try:
        frame_id = 0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Report progress roughly 20 times; clamp to 1 so short videos (or an
        # unknown frame count of 0) do not cause a modulo-by-zero.
        progress_step = max(1, total_frames // 20)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_id += 1

            if frame_id % progress_step == 0:
                yield f"data: 🎥 Summarizing... {frame_id}/{total_frames} frames...\n\n"

            # Sample every 10th frame to keep inference cost manageable.
            if frame_id % 10 == 0:
                results = yolo_model_large.predict(frame)
                boxes = results[0].boxes
                if boxes:
                    class_ids = boxes.cls.tolist()
                    class_names = [results[0].names[int(c)] for c in class_ids]
                    detected.update(class_names)

                emotions.extend(detect_emotion_np(frame))
    finally:
        # Release the capture even if inference raises or the consumer stops
        # iterating early.
        cap.release()

    summary_prompt = f"Summarize this video:\nDetected objects: {dict(detected)}\nEmotions: {collections.Counter(emotions)}"
    summary = get_small_talk_reply(summary_prompt)

    yield f"data: ✅ Summarization Complete!\n\n"
    yield f"data: 📄 {summary}\n\n"
# 🔄 Real-Time Tracking
def track_objects_from_video(video_path):
    """Track objects through a video, writing an annotated clip and snapshots.

    Every 10th frame is resized to 640x360, run through the YOLO tracker and
    written, with the tracking annotations drawn on it, to
    ``static/uploads/output_tracked.avi``. About six raw frames spread over
    the video are also saved as JPEG snapshots.

    Args:
        video_path: Path of the video file to read with OpenCV.

    Yields:
        Strings of the form ``data: <message>`` followed by a blank line:
        periodic progress updates, a completion message containing the
        download link, and finally ``SNAPSHOTS <url>|<url>|...`` when any
        snapshots were saved. If the video cannot be opened, a single error
        message is yielded instead.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        yield f"data: ❌ Could not open video.\n\n"
        return

    output_path = f"{drive_path}/static/uploads/output_tracked.avi"
    # NOTE(review): the fixed 2.5 fps presumably assumes a ~25 fps source
    # sampled every 10th frame — confirm for other frame rates.
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'XVID'), 2.5, (640, 360))

    snapshots = []
    try:
        frame_id = 0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        snapshot_interval = total_frames // 6 if total_frames >= 6 else 1
        # Clamp to 1 so short videos (or an unknown frame count of 0) do not
        # cause a modulo-by-zero.
        progress_step = max(1, total_frames // 20)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_id += 1

            if frame_id % snapshot_interval == 0:
                frame_path = f"{drive_path}/static/uploads/frame_{frame_id}.jpg"
                cv2.imwrite(frame_path, frame)
                snapshots.append(f"/static/uploads/frame_{frame_id}.jpg")

            if frame_id % 10 == 0:
                frame = cv2.resize(frame, (640, 360))
                results = yolo_model_small.track(frame, persist=True)
                # Write the frame with boxes/track IDs drawn; writing the raw
                # frame would discard the tracking result entirely.
                out.write(results[0].plot())

            if frame_id % progress_step == 0:
                yield f"data: 📸 {frame_id}/{total_frames} frames processed...\n\n"
    finally:
        # Always finalise the capture and the output file, even on errors or
        # when the consumer stops iterating early.
        cap.release()
        out.release()

    yield f"data: ✅ Tracking Complete! 🎯 Download [here](/static/uploads/output_tracked.avi)\n\n"
    if snapshots:
        yield f"data: SNAPSHOTS {'|'.join(snapshots)}\n\n"
def upload_file():
    """Call ``files.upload()`` and return the first uploaded filename.

    Returns ``None`` when the upload mapping is empty.

    NOTE(review): ``files`` is not imported in the visible source (presumably
    ``google.colab.files`` — confirm), and this name is rebound by the
    ``/upload`` route handler defined right after it, so this helper cannot
    be reached by name once the cell has run.
    """
    uploaded = files.upload()
    return next(iter(uploaded.keys()), None)


# API Endpoints
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Store an uploaded file and, for images, pre-compute caption and objects.

    Args:
        file: The multipart upload.

    Returns:
        A dict with the stored ``filename``, its public ``filepath`` under
        ``/static/uploads``, an ``is_video`` flag (decided by file extension),
        and — for non-video uploads — the ``caption`` and ``objects`` results
        (both ``None`` for videos).

    Raises:
        HTTPException: 400 when the upload has no usable filename; 500 when
            saving or processing the file fails.
    """
    try:
        # The filename is client-controlled: keep only its final component so
        # values like "../../x" cannot escape the uploads directory.
        raw_name = (file.filename or "").replace("\\", "/")
        safe_name = Path(raw_name).name
        if safe_name in ("", ".", ".."):
            raise HTTPException(status_code=400, detail="Uploaded file has no usable filename.")

        file_location = f"{drive_path}/static/uploads/{safe_name}"
        with open(file_location, "wb") as file_object:
            shutil.copyfileobj(file.file, file_object)

        # Detect if uploaded file is a video
        is_video = safe_name.lower().endswith(('.mp4', '.avi', '.mov', '.mkv'))

        # If it's an image, immediately process caption and detection
        if not is_video:
            caption = caption_image(file_location)
            objects = detect_objects(file_location)
        else:
            caption = None
            objects = None

        return {
            "filename": safe_name,
            "filepath": f"/static/uploads/{safe_name}",
            "is_video": is_video,
            "caption": caption,
            "objects": objects
        }
    except HTTPException:
        # Preserve deliberate HTTP errors (e.g. the 400 above) instead of
        # re-wrapping them as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

def _sse_messages(events):
    """Drain an iterable of 'data: ...' event strings and return the bare payloads."""
    messages = []
    for event in events:
        text = event.strip()
        if text.startswith("data:"):
            text = text[len("data:"):].strip()
        messages.append(text)
    return messages


def _resolve_upload_path(filepath):
    """Map a client-supplied '/static/uploads/...' path to an existing file in the uploads folder, else None."""
    if not filepath or not isinstance(filepath, str):
        return None
    uploads_dir = Path(drive_path, "static", "uploads").resolve()
    candidate = Path(f"{drive_path}{filepath}").resolve()
    # Reject anything that resolves outside the uploads directory ("../" etc.).
    if uploads_dir not in candidate.parents or not candidate.is_file():
        return None
    return str(candidate)


@app.post("/process")
async def process_request(request_data: dict):
    """Classify the user's prompt and run the matching image/video task.

    Args:
        request_data: JSON body with ``prompt`` and, as returned by
            ``/upload``, ``filepath``, ``is_video``, ``caption`` and
            ``objects``.

    Returns:
        A dict with at least ``response`` (text for the user) and ``type``
        (``small_talk``, ``info``, ``error`` or ``task_response``). Object
        detection adds ``annotated_image``; tracking adds ``video_path`` and,
        when available, ``snapshots``.

    Raises:
        HTTPException: 500 when processing fails unexpectedly.
    """
    try:
        prompt = (request_data.get("prompt") or "").strip().lower()
        filepath = request_data.get("filepath", "")
        is_video = request_data.get("is_video", False)
        caption = request_data.get("caption", "")
        objects = request_data.get("objects", [])

        small_talk_phrases = ["hi", "hello", "hey", "yo", "howdy", "sup",
                             "good morning", "good evening", "what's up"]
        if prompt in small_talk_phrases:
            return {
                "response": "🤖 Hey there! 👋\n✨ By the way, I can help you with tasks like:\n- 🖼️ Captioning\n- 📷 Descriptions\n- 🔍 Detection\n- 🔢 Counting\n- 😊 Emotions\n- ❓ Q&A\n- 🎬 Summarization\n- 🔄 Tracking",
                "type": "small_talk"
            }

        video_tasks = ["real_time_tracking", "video_summarization"]
        image_tasks = ["image_captioning", "scene_description", "object_detection",
                       "object_counting", "emotion_detection", "question_answering"]

        # Get task from prompt; tolerate stray quotes/punctuation in the reply.
        task = (get_task_from_prompt(prompt) or "").strip().strip(".`'\"")

        if task == "error":
            # The classifier call itself failed; say so instead of returning a
            # payload without a "response" key.
            return {
                "response": "❌ Could not determine the task. Please try again.",
                "type": "error"
            }

        if task not in video_tasks and task not in image_tasks:
            return {
                "response": "🤖 I'm here for anything you need! Feel free to ask me about an image or video task anytime.\n🎯 I'm capable of:\n- 🖼️ Captioning\n- 📷 Descriptions\n- 🔍 Detection\n- 🔢 Counting\n- 😊 Emotions\n- ❓ Q&A\n- 🎬 Summarization\n- 🔄 Tracking",
                "type": "info"
            }

        if task in video_tasks and not is_video:
            return {
                "response": "❌ This task requires a video file.",
                "type": "error"
            }
        elif task in image_tasks and is_video:
            return {
                "response": "❌ This task requires an image file.",
                "type": "error"
            }

        # Resolve the uploaded file; None when missing or outside uploads/.
        full_path = _resolve_upload_path(filepath)
        missing_file = {
            "response": "❌ Uploaded file not found. Please upload a file first.",
            "type": "error"
        }

        # Caption-based tasks normally reuse the caption computed at upload
        # time; recompute it only if the client did not send one.
        if task in ("image_captioning", "scene_description", "question_answering") and not caption:
            if full_path is None:
                return missing_file
            caption = caption_image(full_path)

        # Process the task
        result = {"type": "task_response"}
        if task == "image_captioning":
            result["response"] = f"🖼️ Caption: {caption}"
        elif task == "scene_description":
            description = describe_caption_with_deepseek(caption, objects)
            result["response"] = f"📝 Scene Description: {description}"
        elif task == "object_detection":
            # Expect the dict produced by detect_objects(); recompute when the
            # client sent nothing usable.
            if not (isinstance(objects, dict) and "objects" in objects
                    and "annotated_image" in objects):
                if full_path is None:
                    return missing_file
                objects = detect_objects(full_path)
            result["response"] = f"📦 Detected Objects: {objects['objects']}"
            result["annotated_image"] = objects['annotated_image']
        elif task == "object_counting":
            if full_path is None:
                return missing_file
            count = count_objects(full_path)
            result["response"] = f"🔢 Object Count: {count}"
        elif task == "emotion_detection":
            if full_path is None:
                return missing_file
            img = cv2.imread(full_path)
            if img is None:
                return {
                    "response": "❌ Could not read the image file.",
                    "type": "error"
                }
            emotions = detect_emotion_np(img)
            result["response"] = f"😊 Detected Emotions: {collections.Counter(emotions)}"
        elif task == "question_answering":
            answer = get_small_talk_reply(f"Based on the image caption: '{caption}', answer: {prompt}")
            result["response"] = f"💡 Answer: {answer}"
        elif task == "real_time_tracking":
            if full_path is None:
                return missing_file
            # track_objects_from_video is a generator of event strings; drain
            # it and report the final status plus any snapshot URLs.
            messages = _sse_messages(track_objects_from_video(full_path))
            snapshot_prefix = "SNAPSHOTS "
            status = [m for m in messages if not m.startswith(snapshot_prefix)]
            snapshots = []
            for message in messages:
                if message.startswith(snapshot_prefix):
                    snapshots = message[len(snapshot_prefix):].split("|")
            result["response"] = status[-1] if status else "❌ Tracking produced no output."
            if result["response"].startswith("✅"):
                result["video_path"] = "/static/uploads/output_tracked.avi"
                if snapshots:
                    result["snapshots"] = snapshots
            else:
                result["type"] = "error"
        elif task == "video_summarization":
            if full_path is None:
                return missing_file
            # summarize_video is a generator; its last event carries the
            # summary text, prefixed with 📄.
            messages = _sse_messages(summarize_video(full_path))
            final_message = messages[-1] if messages else ""
            if final_message.startswith("📄"):
                summary = final_message[len("📄"):].strip()
                result["response"] = f"🎩 Video Summary:\n{summary}"
            else:
                result["response"] = final_message or "❌ Summarization produced no output."
                result["type"] = "error"

        return result

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/", response_class=HTMLResponse)
async def serve_frontend(request: Request):
    """Render ``index.html`` from the templates directory for the site root."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)

# Run the FastAPI server with ngrok
def run_server():
    """Open an ngrok tunnel to port 8000 and serve the FastAPI app with uvicorn.

    The ngrok auth token is read from the ``NGROK_AUTHTOKEN`` environment
    variable so that no credential is stored in the notebook; when it is not
    set, ngrok's existing configuration is used as-is. Any token that was
    previously committed in this file should be revoked.

    Blocks until the uvicorn server shuts down.
    """
    auth_token = os.environ.get("NGROK_AUTHTOKEN")
    if auth_token:
        ngrok.set_auth_token(auth_token)
    public_url = ngrok.connect(8000).public_url
    print(f" * Public URL: {public_url}")

    # The notebook kernel already runs an event loop; nest_asyncio lets
    # uvicorn start its own loop inside it.
    nest_asyncio.apply()
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Run this to start the server (true when executed as a notebook cell/script,
# false when this code is merely imported).
if __name__ == "__main__":
    run_server()

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
25-06-26 05:37:16 - Directory /root/.deepface has been created
25-06-26 05:37:16 - Directory /root/.deepface/weights has been created


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...



100%|██████████| 6.25M/6.25M [00:00<00:00, 129MB/s]


 * Public URL: https://6839-35-204-160-197.ngrok-free.app


INFO:     Started server process [1211]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     139.135.41.199:0 - "GET / HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET /static/css/style.css HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET /static/img/ai-avatar.png HTTP/1.1" 404 Not Found
INFO:     139.135.41.199:0 - "GET /static/js/script.js HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET /static/img/robot.png HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found

image 1/1 /content/drive/MyDrive/Project/static/uploads/Screenshot 2025-04-21 221745.png: 448x640 3 persons, 1 fire hydrant, 1 backpack, 64.3ms
Speed: 17.7ms preprocess, 64.3ms inference, 220.6ms postprocess per image at shape (1, 3, 448, 640)
INFO:     139.135.41.199:0 - "POST /upload HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "POST /process HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET / HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET /static/img/ai-avatar.png HTTP/1.1" 404 Not Found

image 1/1 /content/drive/MyDrive/Project/static/uploads/zr4kx3iw.png: 

Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: /root/.deepface/weights/facial_expression_model_weights.h5
100%|██████████| 5.98M/5.98M [00:00<00:00, 68.0MB/s]


INFO:     139.135.41.199:0 - "POST /process HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET / HTTP/1.1" 200 OK
INFO:     139.135.41.199:0 - "GET /static/img/ai-avatar.png HTTP/1.1" 404 Not Found

INFO:     139.135.41.199:0 - "POST /upload HTTP/1.1" 500 Internal Server Error
INFO:     139.135.41.237:0 - "POST /process HTTP/1.1" 200 OK
INFO:     139.135.41.237:0 - "GET / HTTP/1.1" 200 OK
INFO:     139.135.41.237:0 - "GET /static/img/ai-avatar.png HTTP/1.1" 404 Not Found


INFO:     Shutting down
INFO:     Finished server process [1211]
