# 🧪 CookMate AI: Qwen-VL Backend (Memory Optimized)
This version uses **4-bit quantization** to fit comfortably in Colab's T4 GPU memory.

### Instructions:
1. Run the cells below in order.
2. Copy the **Ngrok URL** printed at the end of the last cell.
3. Paste it into the website's settings.

**Fixes implemented:**
- 4-bit Quantization (saves ~75% GPU memory)
- Automatic Cache Clearing after every detection
- Image resolution capping (prevents OOM on large images)
- Explicit CORS handling for website connection

In [None]:
# 1. Install Optimized Dependencies
# Quiet (-q) install of the model stack (transformers, torch, torchvision,
# qwen-vl-utils, accelerate, bitsandbytes, flash-attn) and the serving stack
# (flask, flask-cors, pyngrok).
# NOTE(review): --no-build-isolation is presumably here for the flash-attn
# source build; flash-attn itself is not referenced by the visible model-loading
# cell (no attn_implementation argument is passed) -- confirm it is still needed.
!pip install -q transformers torch torchvision qwen-vl-utils accelerate flask flask-cors pyngrok flash-attn bitsandbytes --no-build-isolation
# Installs transformers again, this time from the GitHub repository.
# NOTE(review): this follows the PyPI install of transformers on the line above;
# presumably it is meant to pick up a newer build -- confirm both are required.
!pip install -q git+https://github.com/huggingface/transformers.git

In [None]:
# 2. Load Model with 4-bit Quantization
import gc
import os

# Hub timeouts must be in the environment BEFORE transformers (and therefore
# huggingface_hub) is imported, because huggingface_hub reads its timeout
# settings from the environment when it is first imported.
# HF_HUB_DOWNLOAD_TIMEOUT / HF_HUB_ETAG_TIMEOUT are the documented variables;
# setdefault() lets a value already exported by the user win.
# NOTE(review): HF_HUB_READ_TIMEOUT is kept from the original cell, but it is
# not listed among huggingface_hub's documented environment variables --
# confirm whether anything actually reads it.
os.environ["HF_HUB_READ_TIMEOUT"] = "300"
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
os.environ.setdefault("HF_HUB_ETAG_TIMEOUT", "300")

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

# Device that request tensors are moved to by the /detect handler.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

print("Loading Model in 4-bit mode...")

# 4-bit NF4 weights with double quantization; matmuls are computed in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# device_map="auto" lets accelerate decide where the weights are placed.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
processor = AutoProcessor.from_pretrained(model_id)
print("Model Ready!")

In [None]:
# 3. API Server Setup
from flask import Flask, request, jsonify, make_response
from flask_cors import CORS
import base64
from PIL import Image
import io
from pyngrok import ngrok

# Flask application object; the route and after_request decorators below attach to it.
app = Flask(import_name=__name__)
# Flask-CORS: accept cross-origin requests from any origin on every URL path.
CORS(
    app=app,
    resources={
        r"/*": {"origins": "*"},
    },
)

@app.after_request
def add_header(response):
    """Stamp permissive CORS headers onto every outgoing response.

    Runs as a Flask ``after_request`` hook: it overwrites the three
    ``Access-Control-Allow-*`` headers on ``response`` and returns the same
    response object.
    """
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        # ngrok-skip-browser-warning is listed so browsers may send that header.
        ('Access-Control-Allow-Headers', 'Content-Type,Authorization,ngrok-skip-browser-warning'),
        ('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE,OPTIONS'),
    )
    for header_name, header_value in cors_headers:
        response.headers[header_name] = header_value
    return response

@app.route('/ping', methods=['GET', 'OPTIONS'])
def ping():
    """Liveness probe.

    OPTIONS requests get an empty 200 response; any other accepted request
    gets the JSON body ``{"status": "alive"}``.
    """
    is_preflight = request.method == 'OPTIONS'
    if not is_preflight:
        return jsonify({"status": "alive"})
    return make_response('', 200)

def _decode_image(encoded):
    """Decode a base64 image payload into a PIL image capped at 1280x1280.

    Accepts either raw base64 or a ``data:<mime>;base64,<payload>`` URL.
    Raises whatever ``base64.b64decode`` / ``Image.open`` raise for a payload
    that is not a decodable image.
    """
    if isinstance(encoded, str) and encoded.startswith("data:"):
        # Raw base64 never contains a comma, so dropping everything up to the
        # first comma is safe and removes the data-URL header.
        encoded = encoded.partition(",")[2]
    image = Image.open(io.BytesIO(base64.b64decode(encoded)))
    image.thumbnail((1280, 1280))  # Scale down large images to prevent OOM
    return image


def _parse_ingredients(output_text):
    """Turn the model's free-text answer into a list of ingredient names.

    Splits on commas and newlines, drops the "Detected:" label and "*"
    markup, and strips leading "-" bullets. Hyphens inside a name
    (e.g. "Bell-pepper") are preserved.
    """
    cleaned = output_text.replace("Detected:", "").replace("*", "")
    candidates = (
        piece.strip().lstrip("-").strip()
        for piece in cleaned.replace("\n", ",").split(",")
    )
    return [name for name in candidates if name]


@app.route('/detect', methods=['POST', 'OPTIONS'])
def detect():
    """Detect food ingredients in a base64-encoded image.

    Request body: JSON object with an ``image`` key holding base64 image data
    (optionally as a data URL).

    Responses:
        200 ``{"ingredients": [...]}`` on success (empty 200 for OPTIONS).
        400 ``{"error": ...}`` when the body is not a JSON object with an
            ``image`` key, or the image cannot be decoded.
        500 ``{"error": ...}`` when inference fails.
    """
    if request.method == 'OPTIONS':
        return make_response('', 200)

    # silent=True yields None for a missing/malformed JSON body, so the client
    # gets the JSON error below instead of Flask's default HTML error page.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'image' not in data:
        return jsonify({"error": "No image provided"}), 400

    try:
        image = _decode_image(data['image'])
    except Exception as e:
        # Request boundary: any decode failure is a problem with the client's
        # payload, so report it as a 400 rather than a server error.
        print(f"Error: {e}")
        return jsonify({"error": f"Invalid image: {e}"}), 400

    inputs = generated_ids = None
    try:
        # Inference
        messages = [
            {
                "role": "system",
                "content": "Identify raw food ingredients, vegetables, or fruits. Output ONLY the names as a plain comma-separated list. No intro. Example: Tomato, Onion"
            },
            {
                "role": "user",
                "content": [{"type": "image", "image": image}, {"type": "text", "text": "List the ingredients."}]
            }
        ]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, _ = process_vision_info(messages)
        inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(device)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=50)

        # Slice off the prompt tokens so only the newly generated text is decoded.
        output_text = processor.batch_decode(
            [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)],
            skip_special_tokens=True
        )[0]
    except Exception as e:
        print(f"Error: {e}")
        return jsonify({"error": str(e)}), 500
    finally:
        # Drop the tensor references first so the cache clear can actually
        # release their memory -- on the error path as well as on success.
        inputs = generated_ids = None
        gc.collect()
        torch.cuda.empty_cache()

    ingredients = _parse_ingredients(output_text)
    print(f"Detected: {ingredients}")

    return jsonify({"ingredients": ingredients})

# 4. Start Tunnel and Run
# Token lookup order: Colab Secrets first, interactive prompt as the fallback.
print("Enter Ngrok Auth Token (or set in Colab Secrets as 'NGROK_AUTH_TOKEN'):")
auth_token = None
try:
    from google.colab import userdata
    auth_token = userdata.get('NGROK_AUTH_TOKEN')
except Exception:
    # Any failure here (for example google.colab not being importable) simply
    # means no secret is available; Exception rather than a bare except so
    # that KeyboardInterrupt / SystemExit still propagate.
    auth_token = None

if auth_token:
    print("Loaded token from Colab Secrets.")
else:
    # Also reached when the secret lookup succeeds but yields an empty value.
    auth_token = input("Paste your Ngrok Auth Token: ").strip()

if auth_token:
    ngrok.set_auth_token(auth_token)

# Best effort: if the tunnel cannot be opened, the error is printed and the
# Flask server below still starts on port 5000.
try:
    public_url = ngrok.connect(5000, bind_tls=True).public_url
    print(f"\n🚀 SERVER LIVE AT: {public_url}")
except Exception as e:
    print(f"Ngrok error: {e}")

# Blocks until the server is stopped.
app.run(host='0.0.0.0', port=5000)