In [None]:
# all installations
!pip install -q mediapipe==0.10.20 opencv-python-headless==4.9.0.80 numpy==1.26.4 pillow rembg

import cv2
import mediapipe as mp
import numpy as np
from PIL import Image
from rembg import remove
from google.colab import files
from IPython.display import HTML
from base64 import b64encode
import io, os, warnings

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Upload the face video and the image to overlay on it.
# files.upload() is used as a mapping keyed by file name; only the first
# file of each prompt is used.
print("Upload your face video (MP4, <10s recommended):")
uploaded = files.upload()
if not uploaded:
    # NOTE(review): assumes a cancelled upload yields an empty mapping.
    raise ValueError("No video file was uploaded.")
video_path = next(iter(uploaded))

print("\nUpload your overlay image (PNG/JPG - e.g. crown, halo, beard, etc.):")
uploaded_img = files.upload()
if not uploaded_img:
    raise ValueError("No overlay image was uploaded.")
overlay_path = next(iter(uploaded_img))

# Clean the file name to prevent errors: parentheses and spaces become
# underscores, and the file on disk is renamed to match.
safe_name = overlay_path.translate(str.maketrans("() ", "___"))
if overlay_path != safe_name:
    os.rename(overlay_path, safe_name)
    overlay_path = safe_name

# load image
print("\nProcessing overlay image...")

try:
    image = Image.open(overlay_path).convert("RGBA")
except Exception as err:
    # Pillow could not read the file, so fall back to OpenCV's decoder.
    # Catching Exception (instead of a bare except) lets KeyboardInterrupt
    # and SystemExit propagate.
    img_cv = cv2.imread(overlay_path, cv2.IMREAD_UNCHANGED)
    if img_cv is None:
        raise ValueError("Could not load image.") from err
    # IMREAD_UNCHANGED keeps the file's own channel layout, so choose the
    # conversion that matches the decoded array instead of assuming BGRA.
    if img_cv.ndim == 2:
        to_rgba = cv2.COLOR_GRAY2RGBA
    elif img_cv.shape[2] == 3:
        to_rgba = cv2.COLOR_BGR2RGBA
    else:
        to_rgba = cv2.COLOR_BGRA2RGBA
    image = Image.fromarray(cv2.cvtColor(img_cv, to_rgba))

# bg removal
print("\nRemoving background and cleaning transparency...")

def clean_white_regions(img_bgra):
    """Return a BGRA image in which near-white areas are fully transparent.

    A pixel counts as near-white when its blue, green and red values are
    all above 240. The resulting mask is eroded once with a 3x3 kernel
    before being applied, which is how the edges are preserved. An input
    without an alpha channel (three channels) gets an opaque one first.
    """
    has_alpha = img_bgra.shape[2] == 4
    planes = list(cv2.split(img_bgra))
    if not has_alpha:
        # No alpha plane supplied: start from a fully opaque one.
        planes.append(np.full(planes[0].shape, 255, dtype=np.uint8))
    blue, green, red, alpha = planes

    # All three colour channels exceed 240 exactly when their minimum does.
    near_white = np.minimum(np.minimum(blue, green), red) > 240

    # Shrink the mask by one erosion pass with a 3x3 structuring element.
    structuring = np.ones((3, 3), np.uint8)
    shrunk = cv2.erode(near_white.astype(np.uint8), structuring, iterations=1)

    alpha[shrunk.astype(bool)] = 0
    return cv2.merge([blue, green, red, alpha])

# Run rembg on the overlay image. Only the removal step is guarded, so a
# failure in the later colour conversion or white-region cleanup is not
# misreported as a background-removal failure and retried.
try:
    png_buffer = io.BytesIO()
    image.save(png_buffer, format='PNG')
    # remove() is given the PNG bytes; its result is reopened as an image.
    removed_bytes = remove(png_buffer.getvalue())
    image = Image.open(io.BytesIO(removed_bytes)).convert("RGBA")
    bg_removed = True
except Exception as e:
    # Best effort: keep the original image when background removal fails.
    print(f"bg_removal_fail: {e}")
    bg_removed = False

# Shared post-processing for both outcomes: RGBA (Pillow) -> BGRA (OpenCV),
# then make the near-white regions transparent.
overlay_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGBA2BGRA)
overlay_img = clean_white_regions(overlay_img)
print("bg_removal_done" if bg_removed else "doing normally")

# mediapipe
# FaceMesh settings: non-static (video) mode, at most one face, refined
# landmarks, and 0.5 for both the detection and tracking confidence.
mp_face_mesh = mp.solutions.face_mesh
face_mesh_options = dict(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
face_mesh = mp_face_mesh.FaceMesh(**face_mesh_options)

# placement options
# Maps each supported placement name to the description printed for it.
# This must be a dict (not a set) because the listing below iterates
# over .items().
placements = {
    "head": "on top of the head (e.g. crown, halo)",
    "forehead": "across the forehead",
    "eyes": "centered between the eyes",
    "nose": "on the nose tip",
    "mouth": "centered on the mouth",
    "chin": "at the chin (e.g. beard)",
    "both_cheeks": "one copy at each side of the face",
}

print("\nlocations ")
for key, desc in placements.items():
    # \u2022 is the bullet character, escaped so it survives re-encoding.
    print(f"  \u2022 {key}: {desc}")

placement = input("\nenter choice ").strip().lower()
if placement not in placements:
    # Unknown or empty input falls back to the nose placement.
    placement = "nose"

# overlaying + landmarks
def get_point(lms, idx, w, h):
    """Return landmark ``idx`` of ``lms`` as integer ``(x, y)`` coordinates.

    The landmark's ``x`` and ``y`` attributes are multiplied by ``w`` and
    ``h`` respectively and truncated with ``int``.
    """
    landmark = lms[idx]
    px = int(landmark.x * w)
    py = int(landmark.y * h)
    return px, py

def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Only the first two components (index 0 and 1) of each point are used.
    """
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return np.sqrt(dx ** 2 + dy ** 2)

def overlay_transparent(frame, overlay, x, y):
    """Alpha-blend ``overlay`` onto ``frame`` with its top-left at ``(x, y)``.

    ``overlay`` must carry an alpha value at channel index 3; the first
    three channels of ``frame`` are blended in place. Parts of the overlay
    that fall outside the frame are clipped, and an overlay that lies
    entirely outside leaves the frame untouched.

    Returns the same ``frame`` object that was passed in.
    """
    frame_h, frame_w = frame.shape[:2]
    ov_h, ov_w = overlay.shape[:2]

    # Nothing to draw when the overlay does not intersect the frame.
    lies_outside = (
        x >= frame_w or y >= frame_h or x + ov_w <= 0 or y + ov_h <= 0
    )
    if lies_outside:
        return frame

    # Destination rectangle inside the frame, clipped to its bounds.
    top, bottom = max(0, y), min(frame_h, y + ov_h)
    left, right = max(0, x), min(frame_w, x + ov_w)

    # Matching source rectangle inside the overlay.
    patch = overlay[
        max(0, -y):min(ov_h, frame_h - y),
        max(0, -x):min(ov_w, frame_w - x),
    ]

    # Per-pixel opacity in [0, 1], broadcast over the three colour channels.
    opacity = (patch[:, :, 3] / 255.0)[:, :, np.newaxis]
    background = frame[top:bottom, left:right, :3]
    frame[top:bottom, left:right, :3] = (
        opacity * patch[:, :, :3] + (1.0 - opacity) * background
    )
    return frame

# auto-fit logic
def get_config(lms, w, h, placement):
    """Work out where and how wide to draw the overlay for ``placement``.

    Landmarks are converted to pixel coordinates with ``get_point`` and all
    widths are expressed as a multiple of the ear-to-ear distance.

    Returns a tuple ``(anchor, fit_width, left_ear, right_ear)``. ``anchor``
    is the ``(x, y)`` centre for the overlay, or ``None`` for
    ``"both_cheeks"``, where the caller is handed the two ear points
    instead. An unrecognised placement uses the nose tip with a 0.6 scale.

    NOTE(review): the landmark indices are assumed to follow the MediaPipe
    Face Mesh topology; confirm the anatomical names against its reference.
    """
    def pixel(idx):
        return get_point(lms, idx, w, h)

    def midpoint(a, b):
        return (a[0] + b[0]) // 2, (a[1] + b[1]) // 2

    left_ear, right_ear = pixel(234), pixel(454)
    nose_tip = pixel(1)
    top_head = pixel(10)
    chin = pixel(152)
    eye_center = midpoint(pixel(133), pixel(362))
    mouth_center = midpoint(pixel(61), pixel(291))

    ear_to_ear = distance(left_ear, right_ear)
    top_to_chin = distance(top_head, chin)

    # placement -> (anchor point, width as a fraction of ear-to-ear distance)
    layouts = {
        "head": ((eye_center[0], top_head[1] - int(0.1 * top_to_chin)), 1.1),
        "forehead": ((eye_center[0], top_head[1] + int(0.25 * top_to_chin)), 0.9),
        "eyes": (eye_center, 0.8),
        "nose": (nose_tip, 0.4),
        "mouth": (mouth_center, 0.6),
        "chin": ((chin[0], chin[1] - int(0.05 * top_to_chin)), 1.2),
        "both_cheeks": (None, 0.4),
    }
    anchor, scale = layouts.get(placement, (nose_tip, 0.6))

    return anchor, ear_to_ear * scale, left_ear, right_ear

# process video
# Read the uploaded video frame by frame, draw the overlay on every frame
# in which a face is found, and write the result to output.mp4.
cap = cv2.VideoCapture(video_path)
out = None
try:
    if not cap.isOpened():
        raise ValueError(f"Could not open video: {video_path}")

    frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # The reported frame rate is unusable (0 or NaN); use a common
        # default so the writer still gets a valid rate.
        fps = 30.0

    out = cv2.VideoWriter(
        'output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_w, frame_h)
    )
    if not out.isOpened():
        raise RuntimeError("Could not open output.mp4 for writing.")

    # Height/width ratio of the overlay; it is the same for every frame.
    aspect_ratio = overlay_img.shape[0] / overlay_img.shape[1]

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # The frame is converted from BGR to RGB before face-mesh processing.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb)

        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                lms = face_landmarks.landmark
                anchor, fit_width, left_ear, right_ear = get_config(
                    lms, frame_w, frame_h, placement
                )
                new_w = int(fit_width)
                new_h = int(new_w * aspect_ratio)
                if new_w < 1 or new_h < 1:
                    # A very small face truncates the target size to zero,
                    # which cv2.resize rejects; skip drawing for this face.
                    continue
                resized_overlay = cv2.resize(
                    overlay_img, (new_w, new_h), interpolation=cv2.INTER_AREA
                )

                if placement == "both_cheeks":
                    # One copy centred on each ear point.
                    for pos in [left_ear, right_ear]:
                        x = int(pos[0] - new_w / 2)
                        y = int(pos[1] - new_h / 2)
                        frame = overlay_transparent(frame, resized_overlay, x, y)
                else:
                    # Centre the overlay on the anchor point.
                    x = int(anchor[0] - new_w / 2)
                    y = int(anchor[1] - new_h / 2)
                    frame = overlay_transparent(frame, resized_overlay, x, y)

        # Frames without a detected face are written unchanged.
        out.write(frame)
finally:
    # Release the capture, writer and face mesh even if processing failed.
    cap.release()
    if out is not None:
        out.release()
    face_mesh.close()

print("output video saved as output.mp4")

# display inline - not necessary
# Embed output.mp4 in the notebook as a base64 data URL. The file is read
# inside a context manager so its handle is closed right away.
# NOTE(review): browsers may be unable to decode video written with the
# 'mp4v' fourcc; confirm playback, or re-encode to H.264 if it stays blank.
with open('output.mp4', 'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""
<video width=640 controls autoplay loop>
    <source src="{data_url}" type="video/mp4">
</video>
""")


Upload your face video (MP4, <10s recommended):
