<a href="https://colab.research.google.com/github/keerthi612004/text-to-hand-gesture/blob/main/text_to_hand_gesture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write under
# /content/drive/MyDrive/BDA.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ===== Extract features ONLY for label 'O' and print ready-to-paste rows =====
!pip install mediapipe
import os, cv2, numpy as np, pandas as pd
from tqdm import tqdm
from google.colab import drive
import mediapipe as mp

# --- Mount drive ---
# (left disabled: the first cell of the notebook already mounts Drive)
# drive.mount('/content/drive')

# --- CONFIG: adjust if your folder name differs (e.g., 'o' vs 'O') ---
# LABEL is both the sub-folder name that is scanned and the value written to
# the 'label' column of every extracted row.
LABEL = 'O'
LETTER_DIR = f'/content/drive/MyDrive/BDA/Preprocessed_data/{LABEL}'  # your enhanced images by letter
# NOTE: SAVE_CSV is a fixed path and does not follow LABEL; change both together.
SAVE_CSV = '/content/drive/MyDrive/BDA/features_O_only.csv'          # mini CSV for just 'O'

# Make sure the folder that will hold the CSV exists before anything is written.
os.makedirs(os.path.dirname(SAVE_CSV), exist_ok=True)

# --- MediaPipe setups (Hands + Holistic fallback) ---
# Both detectors are created once at module level and reused for every image
# by extract_landmarks_any(); Holistic is only consulted when Hands finds nothing.
mp_hands = mp.solutions.hands
mp_holistic = mp.solutions.holistic
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.30)
holistic = mp_holistic.Holistic(static_image_mode=True)

# --- Enhancement: gamma + CLAHE(L channel) + bilateral denoise ---
def enhance_for_features(img, size=(256,256)):
    """Resize a BGR image, then brighten, contrast-equalise and denoise it.

    The steps run in this order: bicubic resize to ``size``, gamma correction
    (gamma 1.8) through a 256-entry lookup table, CLAHE on the L channel of
    the LAB representation, and a bilateral filter.

    Args:
        img: BGR image array (e.g. the result of ``cv2.imread``).
        size: Target size handed to ``cv2.resize``.

    Returns:
        The processed BGR image.
    """
    resized = cv2.resize(img, size, interpolation=cv2.INTER_CUBIC)

    # Gamma brighten: out = 255 * (in / 255) ** (1 / 1.8), applied via a LUT.
    exponent = 1.0 / 1.8
    lut = np.array([(level / 255.0) ** exponent * 255 for level in np.arange(256)]).astype("uint8")
    brightened = cv2.LUT(resized, lut)

    # Equalise only the lightness channel so the colour channels stay untouched.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(brightened, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    recombined = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))
    contrasted = cv2.cvtColor(recombined, cv2.COLOR_LAB2BGR)

    # Edge-preserving denoise.
    return cv2.bilateralFilter(contrasted, 7, 75, 75)

# --- Simple skin-based crop as a retry path ---
def crop_hand_region(img):
    """Crop ``img`` to the padded bounding box of its largest skin-coloured blob.

    Pixels are selected with two HSV ranges (hue 0-20 and hue 160-179, both
    with S >= 20 and V >= 70). The bounding box of the largest external
    contour of that mask is grown by 5 % of its longer side and clipped to
    the image.

    Args:
        img: BGR image array.

    Returns:
        The cropped view of ``img``, or ``img`` itself when the mask contains
        no contour.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # OpenCV hue runs 0-179 and wraps around, so reddish tones need two ranges.
    mask_low = cv2.inRange(
        hsv,
        np.array([0, 20, 70], dtype=np.uint8),
        np.array([20, 255, 255], dtype=np.uint8),
    )
    mask_high = cv2.inRange(
        hsv,
        np.array([160, 20, 70], dtype=np.uint8),
        np.array([179, 255, 255], dtype=np.uint8),
    )
    mask = cv2.bitwise_or(mask_low, mask_high)

    cnts, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not cnts:
        return img

    x, y, w, h = cv2.boundingRect(max(cnts, key=cv2.contourArea))
    pad = int(0.05 * max(w, h))

    # Fix: the original wrote `max(y-pad)`, which raises TypeError because
    # max() with a single argument expects an iterable. Clamp at 0 like x.
    x = max(0, x - pad)
    y = max(0, y - pad)

    # Grow by the padding on both sides, but never past the image border.
    img_h, img_w = img.shape[:2]
    w = min(img_w - x, w + 2 * pad)
    h = min(img_h - y, h + 2 * pad)
    return img[y:y + h, x:x + w]

def extract_landmarks_any(image_bgr):
    """Return hand landmarks for ``image_bgr`` as a list of ``(x, y, z)`` tuples.

    The module-level ``hands`` detector is tried first (its first detected
    hand is used). If it finds nothing, the module-level ``holistic`` model is
    run and its left hand, then its right hand, is used.

    Args:
        image_bgr: BGR image array; it is converted to RGB before each model call.

    Returns:
        A list of ``(x, y, z)`` tuples, or ``None`` when neither model reports a hand.
    """
    def as_xyz(landmark_list):
        # Flatten a MediaPipe landmark container into plain tuples.
        return [(pt.x, pt.y, pt.z) for pt in landmark_list.landmark]

    # 1) dedicated hand detector
    hands_result = hands.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    if hands_result.multi_hand_landmarks:
        return as_xyz(hands_result.multi_hand_landmarks[0])

    # 2) holistic fallback, left hand preferred over right
    holistic_result = holistic.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    for hand in (holistic_result.left_hand_landmarks, holistic_result.right_hand_landmarks):
        if hand:
            return as_xyz(hand)
    return None

# Column layout shared by the saved CSV and the printed copy/paste rows:
# label, file, then x/y/z for each of the 21 landmarks.
header = ['label','file'] + [f'{ax}{i}' for i in range(21) for ax in ('x','y','z')]

rows, failed = [], []

if not os.path.isdir(LETTER_DIR):
    raise FileNotFoundError(f"Folder not found: {LETTER_DIR}")

files = sorted(f for f in os.listdir(LETTER_DIR) if f.lower().endswith(('.jpg','.jpeg','.png')))

for filename in tqdm(files, desc=f"Extracting '{LABEL}'"):
    path = os.path.join(LETTER_DIR, filename)
    img = cv2.imread(path)
    if img is None:
        # unreadable / corrupt file
        failed.append(path)
        continue

    # enhanced
    enh = enhance_for_features(img)

    # try direct
    lms = extract_landmarks_any(enh)

    # retry with crop if needed
    # NOTE(review): landmarks found on the crop are presumably normalised to
    # the crop rather than to the full frame, so they may not be comparable
    # with rows from the direct path -- confirm before mixing them in training.
    if lms is None:
        cropped = crop_hand_region(enh)
        if cropped.size != 0:
            lms = extract_landmarks_any(cropped)

    if not lms:
        failed.append(path)
        continue

    # ensure exactly 21 landmarks (mediapipe standard): truncate extras,
    # pad missing ones with NaNs to keep the schema consistent
    lms = lms[:21]
    lms = lms + [(np.nan, np.nan, np.nan)] * (21 - len(lms))

    row = {'label': LABEL, 'file': filename}
    for i, (x, y, z) in enumerate(lms):
        row[f'x{i}'] = x
        row[f'y{i}'] = y
        row[f'z{i}'] = z
    rows.append(row)

# save mini CSV; the explicit column list keeps the header and column order
# stable even when no image produced landmarks
df_o = pd.DataFrame(rows, columns=header)
df_o.to_csv(SAVE_CSV, index=False)
print(f"\n‚úÖ Saved {len(df_o)} extracted rows to: {SAVE_CSV}")
print(f"‚ö†Ô∏è Failed: {len(failed)}")

# ALSO: print ready-to-paste CSV lines (matching your main file schema)
if len(df_o) > 0:
    print("\n======== COPY BELOW (header) ========")
    print(','.join(header))
    print("======== COPY BELOW (rows) ==========")
    for _, r in df_o.iterrows():
        # NaN cells are printed as empty fields
        vals = [str(r[c]) if pd.notna(r[c]) else '' for c in header]
        print(','.join(vals))
else:
    # Report the configured label instead of a hard-coded 'O'.
    print(f"No successful extractions for '{LABEL}'. Try lowering confidence to 0.2 or check image quality.")

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy<2 (from mediapipe)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.2-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of opencv-contrib-python to determine which version is compatible with other requirements. This could take a while.
Collecting opencv-contrib-python (from mediapipe)
  Downloading opencv_contrib_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
!pip uninstall pandas -y
!pip install pandas mediapipe

# Re-import the libraries after re-installation
import os, cv2, numpy as np, pandas as pd
from tqdm import tqdm
from google.colab import drive
import mediapipe as mp

# --- Mount drive ---
# (left disabled: the first cell of the notebook already mounts Drive)
# drive.mount('/content/drive')

# --- CONFIG: adjust if your folder name differs (e.g., 'o' vs 'O') ---
# LABEL is both the sub-folder name that is scanned and the value written to
# the 'label' column of every extracted row.
LABEL = 'O'
LETTER_DIR = f'/content/drive/MyDrive/BDA/Preprocessed_data/{LABEL}'  # your enhanced images by letter
# NOTE: SAVE_CSV is a fixed path and does not follow LABEL; change both together.
SAVE_CSV = '/content/drive/MyDrive/BDA/features_O_only.csv'          # mini CSV for just 'O'

# Make sure the folder that will hold the CSV exists before anything is written.
os.makedirs(os.path.dirname(SAVE_CSV), exist_ok=True)

# --- MediaPipe setups (Hands + Holistic fallback) ---
# Both detectors are created once at module level and reused for every image
# by extract_landmarks_any(); Holistic is only consulted when Hands finds nothing.
mp_hands = mp.solutions.hands
mp_holistic = mp.solutions.holistic
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.30)
holistic = mp_holistic.Holistic(static_image_mode=True)

# --- Enhancement: gamma + CLAHE(L channel) + bilateral denoise ---
def enhance_for_features(img, size=(256,256)):
    """Resize a BGR image, then brighten, contrast-equalise and denoise it.

    The steps run in this order: bicubic resize to ``size``, gamma correction
    (gamma 1.8) through a 256-entry lookup table, CLAHE on the L channel of
    the LAB representation, and a bilateral filter.

    Args:
        img: BGR image array (e.g. the result of ``cv2.imread``).
        size: Target size handed to ``cv2.resize``.

    Returns:
        The processed BGR image.
    """
    resized = cv2.resize(img, size, interpolation=cv2.INTER_CUBIC)

    # Gamma brighten: out = 255 * (in / 255) ** (1 / 1.8), applied via a LUT.
    exponent = 1.0 / 1.8
    lut = np.array([(level / 255.0) ** exponent * 255 for level in np.arange(256)]).astype("uint8")
    brightened = cv2.LUT(resized, lut)

    # Equalise only the lightness channel so the colour channels stay untouched.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(brightened, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    recombined = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))
    contrasted = cv2.cvtColor(recombined, cv2.COLOR_LAB2BGR)

    # Edge-preserving denoise.
    return cv2.bilateralFilter(contrasted, 7, 75, 75)

# --- Simple skin-based crop as a retry path ---
def crop_hand_region(img):
    """Crop ``img`` to the padded bounding box of its largest skin-coloured blob.

    Two HSV bands (hue 0-20 and hue 160-179, both with S >= 20 and V >= 70)
    are OR-ed into one mask. The bounding box of the largest external contour
    is grown by 5 % of its longer side and clipped to the image.

    Args:
        img: BGR image array.

    Returns:
        The cropped view of ``img``, or ``img`` itself when the mask contains
        no contour.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # broad skin-ish range plus a secondary range at the top of the hue scale
    hue_bands = (
        ([0, 20, 70], [20, 255, 255]),
        ([160, 20, 70], [179, 255, 255]),
    )
    band_masks = [
        cv2.inRange(hsv_img, np.array(lo, dtype=np.uint8), np.array(hi, dtype=np.uint8))
        for lo, hi in hue_bands
    ]
    skin_mask = cv2.bitwise_or(band_masks[0], band_masks[1])

    contours, _ = cv2.findContours(skin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img

    bx, by, bw, bh = cv2.boundingRect(max(contours, key=cv2.contourArea))
    margin = int(0.05 * max(bw, bh))

    # top-left corner moves out by the margin but not past the image origin
    left = max(0, bx - margin)
    top = max(0, by - margin)

    # extent grows by the margin on both sides but stays inside the image
    frame_h, frame_w = img.shape[:2]
    crop_w = min(frame_w - left, bw + 2 * margin)
    crop_h = min(frame_h - top, bh + 2 * margin)
    return img[top:top + crop_h, left:left + crop_w]


def extract_landmarks_any(image_bgr):
    """Return hand landmarks for ``image_bgr`` as a list of ``(x, y, z)`` tuples.

    The module-level ``hands`` detector is tried first (its first detected
    hand is used). If it finds nothing, the module-level ``holistic`` model is
    run and its left hand, then its right hand, is used.

    Args:
        image_bgr: BGR image array; it is converted to RGB before each model call.

    Returns:
        A list of ``(x, y, z)`` tuples, or ``None`` when neither model reports a hand.
    """
    def as_xyz(landmark_list):
        # Flatten a MediaPipe landmark container into plain tuples.
        return [(pt.x, pt.y, pt.z) for pt in landmark_list.landmark]

    # 1) dedicated hand detector
    hands_result = hands.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    if hands_result.multi_hand_landmarks:
        return as_xyz(hands_result.multi_hand_landmarks[0])

    # 2) holistic fallback, left hand preferred over right
    holistic_result = holistic.process(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    for hand in (holistic_result.left_hand_landmarks, holistic_result.right_hand_landmarks):
        if hand:
            return as_xyz(hand)
    return None

# Column layout shared by the saved CSV and the printed copy/paste rows:
# label, file, then x/y/z for each of the 21 landmarks.
header = ['label','file'] + [f'{ax}{i}' for i in range(21) for ax in ('x','y','z')]

rows, failed = [], []

if not os.path.isdir(LETTER_DIR):
    raise FileNotFoundError(f"Folder not found: {LETTER_DIR}")

files = sorted(f for f in os.listdir(LETTER_DIR) if f.lower().endswith(('.jpg','.jpeg','.png')))

for filename in tqdm(files, desc=f"Extracting '{LABEL}'"):
    path = os.path.join(LETTER_DIR, filename)
    img = cv2.imread(path)
    if img is None:
        # unreadable / corrupt file
        failed.append(path)
        continue

    # enhanced
    enh = enhance_for_features(img)

    # try direct
    lms = extract_landmarks_any(enh)

    # retry with crop if needed
    # NOTE(review): landmarks found on the crop are presumably normalised to
    # the crop rather than to the full frame, so they may not be comparable
    # with rows from the direct path -- confirm before mixing them in training.
    if lms is None:
        cropped = crop_hand_region(enh)
        if cropped.size != 0:
            lms = extract_landmarks_any(cropped)

    if not lms:
        failed.append(path)
        continue

    # ensure exactly 21 landmarks (mediapipe standard): truncate extras,
    # pad missing ones with NaNs to keep the schema consistent
    lms = lms[:21]
    lms = lms + [(np.nan, np.nan, np.nan)] * (21 - len(lms))

    row = {'label': LABEL, 'file': filename}
    for i, (x, y, z) in enumerate(lms):
        row[f'x{i}'] = x
        row[f'y{i}'] = y
        row[f'z{i}'] = z
    rows.append(row)

# save mini CSV; the explicit column list keeps the header and column order
# stable even when no image produced landmarks
df_o = pd.DataFrame(rows, columns=header)
df_o.to_csv(SAVE_CSV, index=False)
print(f"\n‚úÖ Saved {len(df_o)} extracted rows to: {SAVE_CSV}")
print(f"‚ö†Ô∏è Failed: {len(failed)}")

# ALSO: print ready-to-paste CSV lines (matching your main file schema)
if len(df_o) > 0:
    print("\n======== COPY BELOW (header) ========")
    print(','.join(header))
    print("======== COPY BELOW (rows) ==========")
    for _, r in df_o.iterrows():
        # NaN cells are printed as empty fields
        vals = [str(r[c]) if pd.notna(r[c]) else '' for c in header]
        print(','.join(vals))
else:
    # Report the configured label instead of a hard-coded 'O'.
    print(f"No successful extractions for '{LABEL}'. Try lowering confidence to 0.2 or check image quality.")

Found existing installation: pandas 2.3.3
Uninstalling pandas-2.3.3:
  Successfully uninstalled pandas-2.3.3
Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
Installing collected packages: pandas
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.3.3


Extracting 'O': 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 300/300 [00:39<00:00,  7.65it/s]



‚úÖ Saved 4 extracted rows to: /content/drive/MyDrive/BDA/features_O_only.csv
‚ö†Ô∏è Failed: 296

label,file,x0,y0,z0,x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4,x5,y5,z5,x6,y6,z6,x7,y7,z7,x8,y8,z8,x9,y9,z9,x10,y10,z10,x11,y11,z11,x12,y12,z12,x13,y13,z13,x14,y14,z14,x15,y15,z15,x16,y16,z16,x17,y17,z17,x18,y18,z18,x19,y19,z19,x20,y20,z20
O,1200.jpg,0.5698581337928772,0.18805618584156036,2.620690793264657e-07,0.5920262336730957,0.18876826763153076,-0.004561762325465679,0.6234095692634583,0.18219773471355438,-0.008262060582637787,0.6462398171424866,0.1793142706155777,-0.012676806189119816,0.6644434332847595,0.1764194816350937,-0.016964832320809364,0.6281495094299316,0.15269194543361664,-0.004288057330995798,0.6645672917366028,0.16203689575195312,-0.014773876406252384,0.6894688010215759,0.17363449931144714,-0.023177409544587135,0.7060701847076416,0.18521180748939514,-0.028140660375356674,0.6258429288864136,0.15314032137393951,-0.007064793258905411,0.6629476547241211,0.16287508606910706,-0.0162719

In [None]:
import os

# ----------------------------
# Frame folders used by the metric cells
# ----------------------------
GEN_FRAMES_PATH = "/content/drive/MyDrive/BDA/generated_frames"
REAL_FRAMES_PATH = "/content/drive/MyDrive/BDA/real_signs"

# Make sure both folders exist (no error if they are already there).
for _frames_dir in (GEN_FRAMES_PATH, REAL_FRAMES_PATH):
    os.makedirs(_frames_dir, exist_ok=True)

print(f"Directories created or already exist: {GEN_FRAMES_PATH} and {REAL_FRAMES_PATH}")

Directories created or already exist: /content/drive/MyDrive/BDA/generated_frames and /content/drive/MyDrive/BDA/real_signs


UI-based text-to-sign generation


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import gradio as gr
import os, cv2, numpy as np, pandas as pd
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import ImageSequenceClip
import tempfile

# ---- CONFIG (adjust to your Drive paths) ----
# DATA_PATH: root folder of sign images; generate_animation() joins it with a
# letter sub-folder and a file name taken from the features CSV.
DATA_PATH = "/content/drive/MyDrive/BDA/Stratified_data"
# FEATURES_CSV: table whose 'label' and 'file' columns are used to pick an image per letter.
FEATURES_CSV = "/content/drive/MyDrive/BDA/feature_extraction/features_.csv"
# OUTPUT_DIR: where the named copy of each generated MP4 is written.
OUTPUT_DIR = "/content/drive/MyDrive/BDA"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load once at cell execution so every UI request reuses the same DataFrame.
df = pd.read_csv(FEATURES_CSV)

# Font fallback
def get_font():
    """Return the caption font: Liberation Sans Bold at size 50, else Pillow's default.

    Returns:
        The TrueType font when it can be loaded, otherwise the font returned by
        ``ImageFont.load_default()``.
    """
    try:
        return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", 50)
    except (OSError, ImportError):
        # OSError: font file missing/unreadable; ImportError: Pillow built
        # without FreeType. A bare `except:` would also have swallowed
        # KeyboardInterrupt and genuine programming errors.
        return ImageFont.load_default()

# Resolve the caption font once; generate_animation() passes it to every draw.text call.
FONT = get_font()

def generate_animation(word, seconds_per_letter):
    """Render ``word`` as a letter-by-letter sign-image video.

    For every character of the upper-cased text one image is picked from the
    module-level ``df`` (rows whose ``label`` equals the character; sampling
    uses a fixed ``random_state`` so the pick is repeatable). Characters with
    no rows reuse the previous letter's image, or are dropped if there is no
    previous letter. Each image becomes a 400x400 frame captioned with its
    folder name, followed by a 10-step cross-fade to the next image.

    Args:
        word: Free text from the UI textbox.
        seconds_per_letter: Target on-screen time per letter, in seconds.

    Returns:
        ``(video_path, message)``. ``video_path`` is a temporary ``.mp4`` file,
        or ``None`` when nothing could be generated; ``message`` describes the
        outcome for display.
    """
    import re
    import shutil

    if not word or not word.strip():
        return None, "Please type some text."
    if seconds_per_letter <= 0:
        # would otherwise divide by zero when deriving the fps
        return None, "Duration per letter must be positive."

    word_upper = word.strip().upper()
    unique_letters = list(set(word_upper))
    subset = df[df["label"].isin(unique_letters)]

    # Build sequence of image paths (one entry per renderable character)
    paths = []
    last_valid = None
    for ch in word_upper:
        candidates = subset[subset["label"] == ch]
        if not candidates.empty:
            sel = candidates.sample(1, random_state=42).iloc[0]
            last_valid = os.path.join(DATA_PATH, ch, sel["file"])
            paths.append(last_valid)
        elif last_valid:
            paths.append(last_valid)

    if not paths:
        return None, "No matching letters found in dataset."

    # Each distinct image is read and resized once, then shared between the
    # captioned frame and the cross-fades on either side of it.
    rgb_cache = {}

    def load_rgb(image_path):
        if image_path not in rgb_cache:
            bgr = cv2.imread(image_path)
            rgb_cache[image_path] = (
                None if bgr is None
                else cv2.cvtColor(cv2.resize(bgr, (400, 400)), cv2.COLOR_BGR2RGB)
            )
        return rgb_cache[image_path]

    # Create labeled frames + transitions
    frames = []
    for i, path in enumerate(paths):
        rgb = load_rgb(path)
        if rgb is None:
            continue
        pil = Image.fromarray(rgb)
        ImageDraw.Draw(pil).text(
            (20, 20), os.path.basename(os.path.dirname(path)), fill=(255, 255, 255), font=FONT
        )
        frames.append(np.array(pil))

        # Fade to next
        if i < len(paths) - 1:
            nxt_rgb = load_rgb(paths[i + 1])
            if nxt_rgb is not None:
                for a in np.linspace(0, 1, 10):
                    frames.append(cv2.addWeighted(rgb, 1 - a, nxt_rgb, a, 0))

    if not frames:
        return None, "Failed to build frames."

    # Pick the fps so the whole clip lasts about len(paths) * seconds_per_letter.
    fps = max(1, int(round(len(frames) / (len(paths) * seconds_per_letter))))

    # Encode once into a temp file (returned to Gradio for the preview) ...
    tmpfile = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmpfile.close()
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(tmpfile.name, codec="libx264", fps=fps, audio=False, verbose=False, logger=None)

    # ... then copy it to Drive instead of encoding a second time. The file
    # name is derived from user text, so anything other than word characters,
    # spaces and hyphens is replaced; this keeps path separators and ".." from
    # steering the write outside OUTPUT_DIR.
    safe_stem = re.sub(r"[^\w\- ]+", "_", word.strip().lower()).strip("_ ") or "text"
    final_path = os.path.join(OUTPUT_DIR, f"{safe_stem}_animation.mp4")
    shutil.copyfile(tmpfile.name, final_path)

    return tmpfile.name, f"Saved to Drive: {final_path}"

# Gradio layout: components appear in the order they are created inside the
# Blocks context, so the statement order below is the page layout.
with gr.Blocks(title="Text ‚Üí Sign Animation") as demo:
    gr.Markdown("## ü§ü Text-to-Sign Gesture Animator (UI-only)\nType text below to generate a labeled sign animation.")
    # Inputs share one row: the text to render and the per-letter duration.
    with gr.Row():
        inp = gr.Textbox(label="Enter text", value="HELLO")
        secs = gr.Slider(label="Duration per letter (seconds)", minimum=2, maximum=5, value=3, step=1)
    btn = gr.Button("üé¨ Generate Animation")
    # Outputs: the rendered video and the status message from generate_animation.
    out_video = gr.Video(label="Preview")
    out_msg = gr.Markdown()

    # generate_animation(word, seconds_per_letter) -> (video_path, message)
    btn.click(fn=generate_animation, inputs=[inp, secs], outputs=[out_video, out_msg])

demo.launch(debug=False)  # in Colab it opens right below this cell


  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  rotation_lines = [l for l in lines if 'rotate          :' in l and re.search('\d+$', l)]
  match = re.search('\d+$', rotation_line)
  if event.key is 'enter':



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b891d4169ee8029ebb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# ======================================================
# METRIC EVALUATION FOR SIGN ANIMATION GENERATION
# ======================================================
import os
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as T
import torchvision.models as models
from skimage.metrics import structural_similarity as ssim
import lpips
from scipy import linalg
from tqdm import tqdm
import clip

# ----------------------------
# CONFIG PATHS
# ----------------------------
# Folder of generated frames (scored by every metric) and folder of reference
# frames (used by LPIPS and FID).
GEN_FRAMES_PATH = "/content/drive/MyDrive/BDA/generated_frames"
REAL_FRAMES_PATH = "/content/drive/MyDrive/BDA/real_signs"

# Create the directories if they don't exist
# NOTE: this only prevents os.listdir from failing; freshly created folders are
# empty, and the metrics need actual frames in them to produce numbers.
os.makedirs(GEN_FRAMES_PATH, exist_ok=True)
os.makedirs(REAL_FRAMES_PATH, exist_ok=True)


# Run the models on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# 1Ô∏è‚É£ SSIM - Structural Similarity
# ----------------------------
def compute_ssim(gen_path):
    """Mean SSIM between consecutive frames found in ``gen_path``.

    Only files ending in ``.png`` or ``.jpg`` are used, ordered by name and
    converted to grayscale before comparison.

    Args:
        gen_path: Folder containing the generated frames.

    Returns:
        The mean of the pairwise SSIM values, or NaN when the folder holds
        fewer than two matching images (no pair exists to compare).
    """
    images = sorted(
        os.path.join(gen_path, f)
        for f in os.listdir(gen_path)
        if f.endswith((".png", ".jpg"))
    )
    if len(images) < 2:
        # Explicit NaN instead of np.mean([]), which emits RuntimeWarnings.
        return np.nan

    def load_gray(path):
        # Context manager closes the file handle as soon as pixels are copied.
        with Image.open(path) as im:
            return np.array(im.convert("L"))

    ssim_values = []
    previous = load_gray(images[0])
    for path in images[1:]:
        # Each frame is decoded once and reused as the next pair's first image.
        current = load_gray(path)
        ssim_values.append(ssim(previous, current))
        previous = current
    return np.mean(ssim_values)

# ----------------------------
# 2Ô∏è‚É£ LPIPS - Perceptual Similarity
# ----------------------------
def compute_lpips(real_path, gen_path):
    """Mean LPIPS (AlexNet) distance between name-sorted real and generated frames.

    The i-th file of ``real_path`` is compared with the i-th file of
    ``gen_path``; surplus files in the longer folder are ignored.

    Args:
        real_path: Folder with reference frames.
        gen_path: Folder with generated frames.

    Returns:
        The mean distance over all pairs, or NaN when no pair exists.
    """
    # NOTE(review): every directory entry is treated as an image (no extension
    # filter, unlike compute_ssim) -- stray files would fail in load_image.
    real_imgs = sorted(os.path.join(real_path, f) for f in os.listdir(real_path))
    gen_imgs = sorted(os.path.join(gen_path, f) for f in os.listdir(gen_path))

    # zip() already stops at the shorter list, so no manual truncation is needed.
    pairs = list(zip(real_imgs, gen_imgs))
    if not pairs:
        # Nothing to score: skip loading the network and avoid np.mean([]).
        return np.nan

    model = lpips.LPIPS(net='alex').to(device)
    scores = []
    with torch.no_grad():
        for r, g in pairs:
            r_img = lpips.im2tensor(lpips.load_image(r)).to(device)
            g_img = lpips.im2tensor(lpips.load_image(g)).to(device)
            scores.append(model(r_img, g_img).item())
    return np.mean(scores)

# ----------------------------
# 3Ô∏è‚É£ FID - Fr√©chet Inception Distance
# ----------------------------
def compute_fid(real_path, gen_path):
    """Fréchet distance between Inception-v3 outputs of two image folders.

    Every file in each folder is resized to 299x299, normalised with mean and
    std 0.5, and passed through torchvision's pretrained Inception-v3 in eval
    mode. The distance is computed from the mean and covariance of the
    resulting vectors.

    NOTE(review): the vectors are the network's final outputs, not the pooled
    activations used by reference FID implementations, so values are
    presumably not comparable with published FID scores -- confirm before
    reporting.

    Args:
        real_path: Folder with reference frames.
        gen_path: Folder with generated frames.

    Returns:
        The distance as a real number, or NaN when either folder has fewer
        than two files or the matrix square root is not finite.
    """
    real_names = os.listdir(real_path)
    gen_names = os.listdir(gen_path)
    if len(real_names) < 2 or len(gen_names) < 2:
        # A covariance needs at least two samples; bail out before loading the
        # network instead of letting np.cov emit a cascade of warnings.
        print("FID calculation failed: each folder needs at least two images.")
        return np.nan

    inception = models.inception_v3(pretrained=True, transform_input=False).to(device).eval()
    preprocess = T.Compose([
        T.Resize((299, 299)),
        T.ToTensor(),
        T.Normalize(mean=[0.5]*3, std=[0.5]*3)
    ])

    def get_feats(folder, names):
        feats = []
        for img_name in tqdm(names, desc=f"Extracting features from {folder}"):
            with Image.open(os.path.join(folder, img_name)) as img:
                x = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
            with torch.no_grad():
                # Single forward pass (the original ran the network twice).
                out = inception(x)
            # A tuple is only returned in training mode; take its main output.
            f = out[0] if isinstance(out, tuple) else out
            feats.append(f.cpu().numpy().flatten())
        return np.array(feats)

    real_feats = get_feats(real_path, real_names)
    gen_feats = get_feats(gen_path, gen_names)

    mu1, sigma1 = real_feats.mean(axis=0), np.cov(real_feats, rowvar=False)
    mu2, sigma2 = gen_feats.mean(axis=0), np.cov(gen_feats, rowvar=False)

    covmean = linalg.sqrtm(sigma1.dot(sigma2))
    if np.iscomplexobj(covmean):
        # Numerical error can leave a small imaginary part; keep the real part.
        covmean = covmean.real
    if not np.isfinite(covmean).all():
        print("FID calculation failed: sqrtm returned non-finite values.")
        return np.nan

    fid = np.sum((mu1 - mu2) ** 2) + np.trace(sigma1 + sigma2 - 2 * covmean)
    return np.real(fid)

# ----------------------------
# 4Ô∏è‚É£ CLIP Semantic Similarity
# ----------------------------
def compute_clip_similarity(text, gen_path):
    """Mean CLIP (ViT-B/32) cosine similarity between ``text`` and each frame.

    Args:
        text: Phrase to compare the frames against.
        gen_path: Folder containing the generated frames; every entry is
            opened as an image.

    Returns:
        The mean cosine similarity, or NaN when the folder is empty.
    """
    img_paths = sorted(os.path.join(gen_path, f) for f in os.listdir(gen_path))
    if not img_paths:
        # Nothing to score: skip loading CLIP and avoid np.mean([]).
        return np.nan

    model, preprocess = clip.load("ViT-B/32", device=device)
    text_tokens = clip.tokenize([text]).to(device)

    sims = []
    # Inference only: the text encoding is inside no_grad as well, so no
    # autograd graph is kept alive across the loop.
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        for path in tqdm(img_paths, desc="Evaluating CLIP similarity"):
            with Image.open(path) as pil_img:
                img = preprocess(pil_img).unsqueeze(0).to(device)
            img_feat = model.encode_image(img)
            sims.append(torch.cosine_similarity(img_feat, text_features).item())
    return np.mean(sims)

# ----------------------------
# RUN ALL METRICS
# ----------------------------
# SSIM and CLIP look only at the generated frames; LPIPS and FID compare them
# with the reference frames. All four return NaN-like values when the folders
# hold no usable images, which the format specs below print as "nan".
text_input = "HELLO"  # example phrase to evaluate
ssim_score = compute_ssim(GEN_FRAMES_PATH)
lpips_score = compute_lpips(REAL_FRAMES_PATH, GEN_FRAMES_PATH)
fid_score = compute_fid(REAL_FRAMES_PATH, GEN_FRAMES_PATH)
clip_score = compute_clip_similarity(text_input, GEN_FRAMES_PATH)

# Summary: one line per metric with a hint on how to read the value.
print("\n===== SIGN ANIMATION QUALITY METRICS =====")
print(f"üñºÔ∏è Structural Similarity (SSIM): {ssim_score:.3f}  ‚Üí Higher = smoother transitions")
print(f"üëÅÔ∏è Perceptual Distance (LPIPS): {lpips_score:.3f}  ‚Üí Lower = more realistic visuals")
print(f"üìà Fr√©chet Inception Distance (FID): {fid_score:.2f}  ‚Üí Lower = closer to real distribution")
print(f"üî§ CLIP Semantic Similarity: {clip_score:.3f}  ‚Üí Higher = semantically aligned")

  return _methods._mean(a, axis=axis, dtype=dtype,

  ret = ret.dtype.type(ret / rcount)





Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]





Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth


Extracting features from /content/drive/MyDrive/BDA/real_signs: 0it [00:00, ?it/s]
Extracting features from /content/drive/MyDrive/BDA/generated_frames: 0it [00:00, ?it/s]
  mu1, sigma1 = real_feats.mean(axis=0), np.cov(real_feats, rowvar=False)

  avg = a.mean(axis, **keepdims_kw)

  ret = um.true_divide(

  mu1, sigma1 = real_feats.mean(axis=0), np.cov(real_feats, rowvar=False)

  c *= np.true_divide(1, fact)

  c *= np.true_divide(1, fact)

  mu2, sigma2 = gen_feats.mean(axis=0), np.cov(gen_feats, rowvar=False)

  mu2, sigma2 = gen_feats.mean(axis=0), np.cov(gen_feats, rowvar=False)

  covmean = linalg.sqrtm(sigma1.dot(sigma2), disp=False)[0]



FID calculation failed: sqrtm returned non-finite values.


Evaluating CLIP similarity: 0it [00:00, ?it/s]


===== SIGN ANIMATION QUALITY METRICS =====
üñºÔ∏è Structural Similarity (SSIM): nan  ‚Üí Higher = smoother transitions
üëÅÔ∏è Perceptual Distance (LPIPS): nan  ‚Üí Lower = more realistic visuals
üìà Fr√©chet Inception Distance (FID): nan  ‚Üí Lower = closer to real distribution
üî§ CLIP Semantic Similarity: nan  ‚Üí Higher = semantically aligned





In [None]:
# ======================================================
# METRIC EVALUATION FOR SIGN ANIMATION GENERATION
# ======================================================
import os
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as T
import torchvision.models as models
from skimage.metrics import structural_similarity as ssim
import lpips
from scipy import linalg
from tqdm import tqdm
import clip

# ----------------------------
# CONFIG PATHS
# ----------------------------
# Folder of generated frames (scored by every metric) and folder of reference
# frames (used by LPIPS and FID). Both must already exist: the metric
# functions call os.listdir on them.
GEN_FRAMES_PATH = "/content/drive/MyDrive/BDA/generated_frames"
REAL_FRAMES_PATH = "/content/drive/MyDrive/BDA/real_signs"

# Run the models on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# 1Ô∏è‚É£ SSIM - Structural Similarity
# ----------------------------
def compute_ssim(gen_path):
    """Mean SSIM between consecutive frames found in ``gen_path``.

    Only files ending in ``.png`` or ``.jpg`` are used, ordered by name and
    converted to grayscale before comparison.

    Args:
        gen_path: Folder containing the generated frames.

    Returns:
        The mean of the pairwise SSIM values, or NaN when the folder holds
        fewer than two matching images (no pair exists to compare).
    """
    images = sorted(
        os.path.join(gen_path, f)
        for f in os.listdir(gen_path)
        if f.endswith((".png", ".jpg"))
    )
    if len(images) < 2:
        # Explicit NaN instead of np.mean([]), which emits RuntimeWarnings.
        return np.nan

    def load_gray(path):
        # Context manager closes the file handle as soon as pixels are copied.
        with Image.open(path) as im:
            return np.array(im.convert("L"))

    ssim_values = []
    previous = load_gray(images[0])
    for path in images[1:]:
        # Each frame is decoded once and reused as the next pair's first image.
        current = load_gray(path)
        ssim_values.append(ssim(previous, current))
        previous = current
    return np.mean(ssim_values)

# ----------------------------
# 2Ô∏è‚É£ LPIPS - Perceptual Similarity
# ----------------------------
def compute_lpips(real_path, gen_path):
    """Mean LPIPS (AlexNet) distance between name-sorted real and generated frames.

    The i-th file of ``real_path`` is compared with the i-th file of
    ``gen_path``; surplus files in the longer folder are ignored.

    Args:
        real_path: Folder with reference frames.
        gen_path: Folder with generated frames.

    Returns:
        The mean distance over all pairs, or NaN when no pair exists.
    """
    # NOTE(review): every directory entry is treated as an image (no extension
    # filter, unlike compute_ssim) -- stray files would fail in load_image.
    real_imgs = sorted(os.path.join(real_path, f) for f in os.listdir(real_path))
    gen_imgs = sorted(os.path.join(gen_path, f) for f in os.listdir(gen_path))

    pairs = list(zip(real_imgs, gen_imgs))
    if not pairs:
        # Nothing to score: skip loading the network and avoid np.mean([]).
        return np.nan

    model = lpips.LPIPS(net='alex').to(device)
    scores = []
    with torch.no_grad():
        for r, g in pairs:
            r_img = lpips.im2tensor(lpips.load_image(r)).to(device)
            g_img = lpips.im2tensor(lpips.load_image(g)).to(device)
            scores.append(model(r_img, g_img).item())
    return np.mean(scores)

# ----------------------------
# 3Ô∏è‚É£ FID - Fr√©chet Inception Distance
# ----------------------------
def compute_fid(real_path, gen_path):
    """Fréchet distance between Inception-v3 outputs of two image folders.

    Every file in each folder is resized to 299x299, normalised with mean and
    std 0.5, and passed through torchvision's pretrained Inception-v3 in eval
    mode. The distance is computed from the mean and covariance of the
    resulting vectors.

    NOTE(review): the vectors are the network's final outputs, not the pooled
    activations used by reference FID implementations, so values are
    presumably not comparable with published FID scores -- confirm before
    reporting.

    Args:
        real_path: Folder with reference frames.
        gen_path: Folder with generated frames.

    Returns:
        The distance as a real number, or NaN when either folder has fewer
        than two files or the matrix square root is not finite.
    """
    real_names = os.listdir(real_path)
    gen_names = os.listdir(gen_path)
    if len(real_names) < 2 or len(gen_names) < 2:
        # A covariance needs at least two samples; bail out before loading the
        # network instead of letting np.cov emit a cascade of warnings.
        print("FID calculation failed: each folder needs at least two images.")
        return np.nan

    inception = models.inception_v3(pretrained=True, transform_input=False).to(device).eval()
    preprocess = T.Compose([
        T.Resize((299, 299)),
        T.ToTensor(),
        T.Normalize(mean=[0.5]*3, std=[0.5]*3)
    ])

    def get_feats(folder, names):
        feats = []
        for img_name in tqdm(names, desc=f"Extracting features from {folder}"):
            with Image.open(os.path.join(folder, img_name)) as img:
                x = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
            with torch.no_grad():
                f = inception(x)
            feats.append(f.cpu().numpy().flatten())
        return np.array(feats)

    real_feats = get_feats(real_path, real_names)
    gen_feats = get_feats(gen_path, gen_names)

    mu1, sigma1 = real_feats.mean(axis=0), np.cov(real_feats, rowvar=False)
    mu2, sigma2 = gen_feats.mean(axis=0), np.cov(gen_feats, rowvar=False)

    covmean = linalg.sqrtm(sigma1.dot(sigma2))
    if np.iscomplexobj(covmean):
        # Numerical error can leave a small imaginary part; keep the real part.
        covmean = covmean.real
    if not np.isfinite(covmean).all():
        print("FID calculation failed: sqrtm returned non-finite values.")
        return np.nan

    fid = np.sum((mu1 - mu2) ** 2) + np.trace(sigma1 + sigma2 - 2 * covmean)
    return np.real(fid)

# ----------------------------
# 4Ô∏è‚É£ CLIP Semantic Similarity
# ----------------------------
def compute_clip_similarity(text, gen_path):
    """Mean CLIP (ViT-B/32) cosine similarity between ``text`` and each frame.

    Args:
        text: Phrase to compare the frames against.
        gen_path: Folder containing the generated frames; every entry is
            opened as an image.

    Returns:
        The mean cosine similarity, or NaN when the folder is empty.
    """
    img_paths = sorted(os.path.join(gen_path, f) for f in os.listdir(gen_path))
    if not img_paths:
        # Nothing to score: skip loading CLIP and avoid np.mean([]).
        return np.nan

    model, preprocess = clip.load("ViT-B/32", device=device)
    text_tokens = clip.tokenize([text]).to(device)

    sims = []
    # Inference only: the text encoding is inside no_grad as well, so no
    # autograd graph is kept alive across the loop.
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        for path in tqdm(img_paths, desc="Evaluating CLIP similarity"):
            with Image.open(path) as pil_img:
                img = preprocess(pil_img).unsqueeze(0).to(device)
            img_feat = model.encode_image(img)
            sims.append(torch.cosine_similarity(img_feat, text_features).item())
    return np.mean(sims)

# ----------------------------
# RUN ALL METRICS
# ----------------------------
# SSIM and CLIP look only at the generated frames; LPIPS and FID compare them
# with the reference frames. All four return NaN-like values when the folders
# hold no usable images, which the format specs below print as "nan".
text_input = "HELLO"  # example phrase to evaluate
ssim_score = compute_ssim(GEN_FRAMES_PATH)
lpips_score = compute_lpips(REAL_FRAMES_PATH, GEN_FRAMES_PATH)
fid_score = compute_fid(REAL_FRAMES_PATH, GEN_FRAMES_PATH)
clip_score = compute_clip_similarity(text_input, GEN_FRAMES_PATH)

# Summary: one line per metric with a hint on how to read the value.
print("\n===== SIGN ANIMATION QUALITY METRICS =====")
print(f"üñº Structural Similarity (SSIM): {ssim_score:.3f}  ‚Üí Higher = smoother transitions")
print(f"üëÅ Perceptual Distance (LPIPS): {lpips_score:.3f}  ‚Üí Lower = more realistic visuals")
print(f"üìà Fr√©chet Inception Distance (FID): {fid_score:.2f}  ‚Üí Lower = closer to real distribution")
print(f"üî§ CLIP Semantic Similarity: {clip_score:.3f}  ‚Üí Higher = semantically aligned")

  return _methods._mean(a, axis=axis, dtype=dtype,

  ret = ret.dtype.type(ret / rcount)





Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]





Loading model from: /usr/local/lib/python3.12/dist-packages/lpips/weights/v0.1/alex.pth


Extracting features from /content/drive/MyDrive/BDA/real_signs: 0it [00:00, ?it/s]
Extracting features from /content/drive/MyDrive/BDA/generated_frames: 0it [00:00, ?it/s]
  mu1, sigma1 = real_feats.mean(axis=0), np.cov(real_feats, rowvar=False)

  avg = a.mean(axis, **keepdims_kw)

  ret = um.true_divide(

  mu1, sigma1 = real_feats.mean(axis=0), np.cov(real_feats, rowvar=False)

  c *= np.true_divide(1, fact)

  c *= np.true_divide(1, fact)

  mu2, sigma2 = gen_feats.mean(axis=0), np.cov(gen_feats, rowvar=False)

  mu2, sigma2 = gen_feats.mean(axis=0), np.cov(gen_feats, rowvar=False)

Evaluating CLIP similarity: 0it [00:00, ?it/s]


===== SIGN ANIMATION QUALITY METRICS =====
üñº Structural Similarity (SSIM): nan  ‚Üí Higher = smoother transitions
üëÅ Perceptual Distance (LPIPS): nan  ‚Üí Lower = more realistic visuals
üìà Fr√©chet Inception Distance (FID): nan  ‚Üí Lower = closer to real distribution
üî§ CLIP Semantic Similarity: nan  ‚Üí Higher = semantically aligned





In [None]:
!pip install ftfy regex tqdm git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-dslz1egm
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-dslz1egm
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=60779c3d79967a6a811cbcee2554700ed7b6e2b0d01dd21e2533841cc82ebf1b
  Stored in directory: /tmp/p

In [None]:
!pip install lpips

Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Downloading lpips-0.1.4-py3-none-any.whl (53 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m53.8/53.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lpips
Successfully installed lpips-0.1.4
