
# üé•‚ÜíüñºÔ∏è‚Üíüß¨‚Üíüìà‚Üíü§ñ Deepfake vs R√©el ‚Äî Pipeline Latent + ML (Notebook Complet)

Ce notebook couvre **de bout en bout** :
1) Extraction de frames depuis des vid√©os  
2) Calcul de **vecteurs latents** par *GAN Inversion* (fonction √† brancher)  
3) **Analyse de l‚Äôespace latent** (PCA / t‚ÄëSNE / UMAP‚Ä†) + stats (moyenne/variance) + distances/clustering  
4) **Apprentissage** (SVM / MLP / RandomForest) + √©valuation + export du mod√®le  
5) **Pr√©diction** sur nouvelle vid√©o ou image
> ‚Ä†UMAP n√©cessite `umap-learn`. Si non install√©, le code le contournera automatiquement.


## ‚öôÔ∏è Setup & Configuration

In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


In [2]:
from encoder4editing.models.psp import pSp
from encoder4editing.utils.common import tensor2im


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [2]:

# --- Chemins de donn√©es (adapte si besoin) ---
from pathlib import Path

# Dossiers d'images (FAKE / REAL) d√©j√† extraites ou pr√™tes
folders = [
    r"C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\extracted_frames\fake",
    r"C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\extracted_frames\real"
]

# Dossiers de travail (cr√©√©s automatiquement)
ARTIFACTS_DIR = Path("artifacts")
FRAMES_DIR = ARTIFACTS_DIR / "frames"
LATENTS_DIR = ARTIFACTS_DIR / "latents"
PLOTS_DIR = ARTIFACTS_DIR / "plots"
MODELS_DIR = ARTIFACTS_DIR / "models"

for d in [ARTIFACTS_DIR, FRAMES_DIR, LATENTS_DIR, PLOTS_DIR, MODELS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("[OK] Dossiers pr√™ts:", ARTIFACTS_DIR.resolve())

# Param√®tres g√©n√©raux
SEED = 42
TEST_SIZE = 0.2
LATENT_DIM = 512   # Adapter selon votre mod√®le d'inversion (StyleGAN: souvent 512)
MAX_FRAMES_PER_VIDEO = 200  # pour limiter le nombre de frames (modifie selon puissance)


[OK] Dossiers pr√™ts: C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\artifacts


## üì¶ D√©pendances

In [3]:

import sys, subprocess, importlib

def ensure(pkg, import_name=None):
    name = import_name or pkg
    try:
        importlib.import_module(name)
        print(f"[OK] {pkg} d√©j√† install√©.")
    except ImportError:
        print(f"[INFO] Installation de {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        importlib.import_module(name)
        print(f"[OK] {pkg} install√©.")

# Essentiels
for pkg, imp in [
    ("numpy", "numpy"),
    ("pillow", "PIL"),
    ("opencv-python", "cv2"),
    ("scikit-learn", "sklearn"),
    ("matplotlib", "matplotlib"),
    ("umap-learn", "umap"),  # optionnel, sera g√©r√© par try/except √† l'import r√©elle
    ("joblib", "joblib"),
]:
    try:
        ensure(pkg, imp)
    except Exception as e:
        print(f"[WARN] {pkg} non disponible: {e}")


[OK] numpy d√©j√† install√©.
[OK] pillow d√©j√† install√©.
[OK] opencv-python d√©j√† install√©.
[OK] scikit-learn d√©j√† install√©.
[OK] matplotlib d√©j√† install√©.
[OK] umap-learn d√©j√† install√©.
[OK] joblib d√©j√† install√©.


## üß∞ Utilitaires

In [4]:

import os
import json
import math
import random
import numpy as np
from PIL import Image
from pathlib import Path
from typing import List, Tuple, Dict

import matplotlib.pyplot as plt  # R√®gles: pas de seaborn, pas de couleurs fix√©es

random.seed(SEED)
np.random.seed(SEED)

def list_images_in_folders(folders: List[str]) -> Tuple[list, list]:
    """Retourne chemins & labels (0=fake,1=real) √† partir de deux dossiers nomm√©s 'fake' et 'real'."""
    X_paths, y = [], []
    labels_map = {"fake": 0, "real": 1}
    for folder in folders:
        base = Path(folder)
        if not base.exists():
            print(f"[WARN] Dossier inexistant: {base}")
            continue
        label = None
        # d√©duction robuste du label
        parts = str(base).lower().split(os.sep)
        if "fake" in parts:
            label = 0
        elif "real" in parts:
            label = 1
        else:
            # fallback sur le nom du dossier
            label = labels_map.get(base.name.lower(), None)
        if label is None:
            raise ValueError(f"Impossible de d√©duire le label pour {base}")
        for fn in base.glob("**/*"):
            if fn.suffix.lower() in {".jpg",".jpeg",".png",".bmp"}:
                X_paths.append(str(fn))
                y.append(label)
    return X_paths, y

def save_plot(fig, out_path: Path, title: str = ""):
    if title:
        fig.suptitle(title)
    fig.savefig(out_path, bbox_inches="tight", dpi=140)
    plt.close(fig)
    print(f"[OK] Plot enregistr√© ‚Üí {out_path}")

print("[OK] Utils charg√©s.")


[OK] Utils charg√©s.


## üéûÔ∏è Extraction de frames depuis des vid√©os 


In [7]:

import cv2

VIDEOS_DIR = Path(r"C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\raw")
OUT_FRAMES_FAKE = FRAMES_DIR / "fake"
OUT_FRAMES_REAL = FRAMES_DIR / "real"
OUT_FRAMES_FAKE.mkdir(parents=True, exist_ok=True)
OUT_FRAMES_REAL.mkdir(parents=True, exist_ok=True)

def extract_frames_from_video(video_path: Path, out_dir: Path, max_frames: int = MAX_FRAMES_PER_VIDEO, every_n: int = 5):
    cap = cv2.VideoCapture(str(video_path))
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    idx, saved = 0, 0
    while True and saved < max_frames:
        ret = cap.grab()
        if not ret:
            break
        if idx % every_n == 0:
            ret, frame = cap.retrieve()
            if not ret:
                break
            out_path = out_dir / f"{video_path.stem}_f{idx:06d}.jpg"
            cv2.imwrite(str(out_path), frame)
            saved += 1
        idx += 1
    cap.release()
    print(f"[OK] {video_path.name}: {saved} frames")

# D√©commentez pour extraire si vous avez des vid√©os locales:
for label in ["fake","real"]:
    vdir = VIDEOS_DIR / label
    if not vdir.exists(): 
        continue
    for mp in vdir.glob("*.mp4"):
        out_dir = OUT_FRAMES_FAKE if label=="fake" else OUT_FRAMES_REAL
        extract_frames_from_video(mp, out_dir)


[OK] id0_id16_0000.mp4: 94 frames
[OK] id0_id16_0001.mp4: 61 frames
[OK] id0_id16_0002.mp4: 70 frames
[OK] id0_id16_0003.mp4: 106 frames
[OK] id0_id16_0004.mp4: 66 frames
[OK] id0_id16_0005.mp4: 92 frames
[OK] id0_id16_0006.mp4: 107 frames
[OK] id0_id16_0007.mp4: 96 frames
[OK] id0_id16_0008.mp4: 93 frames
[OK] id0_id16_0009.mp4: 104 frames
[OK] id0_id17_0000.mp4: 94 frames
[OK] id0_id17_0001.mp4: 61 frames
[OK] id0_id17_0002.mp4: 70 frames
[OK] id0_id17_0003.mp4: 106 frames
[OK] id0_id17_0005.mp4: 92 frames
[OK] id0_id17_0006.mp4: 107 frames
[OK] id0_id17_0007.mp4: 96 frames
[OK] id0_id17_0009.mp4: 104 frames
[OK] id0_id1_0000.mp4: 94 frames
[OK] id0_id1_0001.mp4: 61 frames
[OK] id0_id1_0002.mp4: 70 frames
[OK] id0_id1_0003.mp4: 106 frames
[OK] id0_id1_0005.mp4: 92 frames
[OK] id0_id1_0006.mp4: 107 frames
[OK] id0_id1_0007.mp4: 96 frames
[OK] id0_id1_0009.mp4: 104 frames
[OK] id0_id2_0000.mp4: 94 frames
[OK] id0_id2_0001.mp4: 61 frames
[OK] id0_id2_0002.mp4: 70 frames
[OK] id0_id2_000

##  Inversion GAN ‚Üí Vecteurs latents `z`

In [2]:

# ‚ö†Ô∏è √Ä BRANCHER : Remplacez `gan_invert(img)` par votre impl√©mentation.
# Par d√©faut, on renvoie un vecteur latent al√©atoire (d√©mo).
# Int√©gration typique : pSp/e4e/ReStyle/Encoder StyleGAN ‚Üí np.ndarray de taille LATENT_DIM.

def gan_invert(pil_img: Image.Image) -> np.ndarray:
    # TODO: Remplacez par votre code d'inversion (retour: np.ndarray shape (LATENT_DIM,))
    # Exemple d'API attendue :
    #   z = your_encoder.encode(pil_img)  # (LATENT_DIM,)
    #   return np.asarray(z, dtype=np.float32)
    return np.random.randn(LATENT_DIM).astype(np.float32)

def build_latent_dataset(folders_or_frames: list, cache_prefix: str = "dataset"):
    """Calcule/charge X(latents) et y(labels) √† partir de dossiers images.
       Sauvegarde X.npy, y.npy pour r√©utiliser rapidement."""
    X_cache = LATENTS_DIR / f"{cache_prefix}_X.npy"
    y_cache = LATENTS_DIR / f"{cache_prefix}_y.npy"
    if X_cache.exists() and y_cache.exists():
        X = np.load(X_cache)
        y = np.load(y_cache)
        print(f"[OK] Charg√© cache: {X.shape}, labels: {y.shape}")
        return X, y

    paths, labels = list_images_in_folders(folders_or_frames)
    print(f"[INFO] Images trouv√©es: {len(paths)}")
    X_list = []
    for i, p in enumerate(paths, 1):
        try:
            img = Image.open(p).convert("RGB")
            z = gan_invert(img)
            if z.ndim != 1:
                z = z.reshape(-1)
            X_list.append(z)
        except Exception as e:
            print(f"[WARN] Skip {p}: {e}")
        if i % 100 == 0:
            print(f"  ... {i}/{len(paths)}")

    X = np.vstack(X_list).astype(np.float32)
    y = np.asarray(labels[:len(X_list)], dtype=np.int64)

    np.save(X_cache, X)
    np.save(y_cache, y)
    print(f"[OK] Sauvegard√©: {X_cache.name}, {y_cache.name} - Shapes: {X.shape}, {y.shape}")
    return X, y

# Exemple d'utilisation: on peut choisir d'apprendre sur `folders` (images pr√™tes) 
# ou sur `FRAMES_DIR/fake|real` si vous avez extrait des frames depuis vid√©os.
DATA_SOURCES = folders  # ou: [str(OUT_FRAMES_FAKE), str(OUT_FRAMES_REAL)]
X, y = build_latent_dataset(DATA_SOURCES, cache_prefix="raw")
X.shape, y.shape, np.bincount(y)


NameError: name 'folders' is not defined

## üìà Analyse de l‚Äôespace latent (PCA / t‚ÄëSNE / UMAP)

In [None]:

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# UMAP est optionnel
try:
    import umap
    HAS_UMAP = True
except Exception:
    HAS_UMAP = False
    print("[INFO] UMAP non disponible (installez `umap-learn` pour l'activer).")

def plot_2d(emb2d, labels, title, out_file):
    fig = plt.figure()
    xs, ys = emb2d[:,0], emb2d[:,1]
    plt.scatter(xs, ys, s=12, alpha=0.8, c=labels)
    plt.title(title)
    out_path = PLOTS_DIR / out_file
    save_plot(fig, out_path, title=title)

# PCA
pca = PCA(n_components=min(50, X.shape[1]))
Xp = pca.fit_transform(X)
print("[PCA] Explained var (10 premi√®res):", pca.explained_variance_ratio_[:10])
plot_2d(Xp[:,:2], y, "PCA (2D)", "pca_2d.png")

# t-SNE
Xt = TSNE(n_components=2, random_state=SEED, init="random", learning_rate="auto").fit_transform(X)
plot_2d(Xt, y, "t-SNE (2D)", "tsne_2d.png")

# UMAP (si dispo)
if HAS_UMAP:
    Xu = umap.UMAP(n_components=2, random_state=SEED).fit_transform(X)
    plot_2d(Xu, y, "UMAP (2D)", "umap_2d.png")


## üìä Statistiques & Distances (intra/inter)

In [None]:

from sklearn.metrics import pairwise_distances

def class_stats(X, y, cls):
    Xc = X[y==cls]
    mu = Xc.mean(axis=0)
    var = Xc.var(axis=0)
    return Xc, mu, var

X_fake, mu_fake, var_fake = class_stats(X, y, 0)
X_real, mu_real, var_real = class_stats(X, y, 1)

print("[FAKE] n=", len(X_fake), " mean|var dims:", mu_fake.shape, var_fake.shape)
print("[REAL] n=", len(X_real), " mean|var dims:", mu_real.shape, var_real.shape)

# Distances intra-classes (moyennes)
intra_fake = pairwise_distances(X_fake).mean() if len(X_fake)>1 else float("nan")
intra_real = pairwise_distances(X_real).mean() if len(X_real)>1 else float("nan")
# Distance inter-classes (entre barycentres)
inter_centroids = np.linalg.norm(mu_fake - mu_real)

print(f"Distance intra FAKE: {intra_fake:.4f}")
print(f"Distance intra REAL: {intra_real:.4f}")
print(f"Distance entre centro√Ødes FAKE/REAL: {inter_centroids:.4f}")


## ü§ñ Entra√Ænement ML (SVM / RandomForest / MLP)

In [None]:

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)

models = {
    "svm_linear": Pipeline([("scaler", StandardScaler()), ("clf", SVC(kernel="linear", probability=True, random_state=SEED))]),
    "svm_rbf":    Pipeline([("scaler", StandardScaler()), ("clf", SVC(kernel="rbf", probability=True, random_state=SEED))]),
    "rf":         RandomForestClassifier(n_estimators=300, random_state=SEED),
    "mlp":        Pipeline([("scaler", StandardScaler()), ("clf", MLPClassifier(hidden_layer_sizes=(256,128), max_iter=200, random_state=SEED))]),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

reports = {}
for name, model in models.items():
    try:
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
        print(f"[CV] {name}: acc={scores.mean():.3f}¬±{scores.std():.3f}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else None

        report = classification_report(y_test, y_pred, target_names=["FAKE","REEL"], output_dict=True)
        cm = confusion_matrix(y_test, y_pred).tolist()
        auc = roc_auc_score(y_test, y_proba) if y_proba is not None else float("nan")

        reports[name] = {"cv_acc_mean": float(scores.mean()), "cv_acc_std": float(scores.std()),
                         "report": report, "confusion_matrix": cm, "roc_auc": float(auc)}

        # Sauvegarde mod√®le
        out_model = MODELS_DIR / f"{name}.joblib"
        joblib.dump(model, out_model)
        print(f"[OK] Mod√®le sauvegard√© ‚Üí {out_model}")

        # ROC plot (si proba dispo)
        if y_proba is not None:
            fig = plt.figure()
            RocCurveDisplay.from_predictions(y_test, y_proba)
            save_plot(fig, PLOTS_DIR / f"roc_{name}.png", title=f"ROC {name}")
    except Exception as e:
        print(f"[WARN] {name} a √©chou√©: {e}")

# Sauvegarde des m√©triques
with open(MODELS_DIR / "metrics.json", "w", encoding="utf-8") as f:
    json.dump(reports, f, indent=2)
print("[OK] M√©triques sauvegard√©es ‚Üí", MODELS_DIR / "metrics.json")

reports


## üéØ Pr√©diction (FAKE vs R√âEL) sur images/vid√©o

In [None]:

def predict_on_images(image_paths: list, model_path: Path) -> Dict[str, float]:
    """Charge un mod√®le .joblib et renvoie proba R√©el (1) par image."""
    import joblib
    results = {}
    model = joblib.load(model_path)
    for p in image_paths:
        try:
            img = Image.open(p).convert("RGB")
            z = gan_invert(img)
            if z.ndim != 1:
                z = z.reshape(-1)
            z = z.reshape(1, -1)
            if hasattr(model, "predict_proba"):
                proba = float(model.predict_proba(z)[0,1])
            else:
                # fallback: decision_function -> approx via sigmoid
                dec = float(model.decision_function(z)[0])
                proba = 1/(1+np.exp(-dec))
            results[str(p)] = proba
        except Exception as e:
            results[str(p)] = f"ERROR: {e}"
    return results

# Exemple d'utilisation:
# sample_imgs = [list(Path(folders[0]).glob('*.jpg'))[0], list(Path(folders[1]).glob('*.jpg'))[0]]
# preds = predict_on_images(sample_imgs, MODELS_DIR / "svm_linear.joblib")
# preds



## üìù Notes & Conseils

- **GAN Inversion** : Remplacez `gan_invert(pil_img)` par votre encodeur (pSp/e4e/ReStyle‚Ä¶).
- **Normalisation** : Conservez la m√™me pr√©‚Äëproc image (taille, centrage, normalisation) que celle attendue par votre encodeur.
- **Latent dimension** : Ajustez `LATENT_DIM` si votre espace latent est `W`, `W+`, `S`, etc.
- **√âquilibrage** : Si le dataset est d√©s√©quilibr√©, explorez `class_weight='balanced'` (SVM) ou r√©√©chantillonnage.
- **Sauvegardes** : `LATENTS_DIR/*.npy` + `MODELS_DIR/*.joblib` + graphiques dans `PLOTS_DIR/`.
- **G√©n√©ralisation** : Validez sur vid√©os/images tenues **hors** du jeu d‚Äôentra√Ænement.


In [2]:
import torch

print("Torch version:", torch.__version__)
print("CUDA dispo :", torch.cuda.is_available())
print("Nom GPU :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "Aucun GPU d√©tect√©")


Torch version: 2.5.1
CUDA dispo : True
Nom GPU : NVIDIA GeForce GTX 1650 Ti with Max-Q Design


In [None]:
import os
import numpy as np
import torch
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt

# ML + r√©duction dimension
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from argparse import Namespace

###############################################
# 1) CONFIGURATION
###############################################

device = "cuda" if torch.cuda.is_available() else "cpu"

FAKE_DIR = r'C:/Users/EliteLaptop/Desktop/kawtar/GAN_inversion/artifacts/frames/fake'
REAL_DIR = r'C:/Users/EliteLaptop/Desktop/kawtar/GAN_inversion/artifacts/frames/real'
CKPT_PATH = r'C:/Users/EliteLaptop/Desktop/kawtar/GAN_inversion/encoder4editing/pretrained_models/e4e_ffhq_encode.pt'

###############################################
# 2) MODEL e4e
###############################################

from encoder4editing.models.psp import pSp

print("[INFO] Loading model...")

ckpt = torch.load(CKPT_PATH, map_location=device)
opts = ckpt['opts']
if isinstance(opts, dict):
    opts = Namespace(**opts)
opts.checkpoint_path = CKPT_PATH

model = pSp(opts).to(device).eval()
print("[OK] Model loaded.")

###############################################
# 3) FACE ALIGNMENT (OBLIGATOIRE pour e4e)
###############################################

from facenet_pytorch import MTCNN

mtcnn = MTCNN(
    image_size=256,
    margin=0,
    post_process=False,
    device=device
)

def align_face(pil_img):
    """Retourne une image align√©e FFHQ-style pour e4e"""
    try:
        aligned = mtcnn(pil_img)
        if aligned is None:
            print("[WARN] No face detected.")
            return None
        aligned = aligned.permute(1,2,0).byte().cpu().numpy()
        return Image.fromarray(aligned)
    except Exception as e:
        print("[ERROR] Face alignment failed:", e)
        return None

###############################################
# 4) GAN INVERSION
###############################################

transform = transforms.Compose([transforms.ToTensor()])

def gan_invert(pil_img):
    """Retourne latent vector via e4e (latent W+)"""
    try:
        aligned = align_face(pil_img)
        if aligned is None:
            return None

        img_tensor = transform(aligned).unsqueeze(0).to(device)

        with torch.no_grad():
            output, latents = model(img_tensor, return_latents=True)

        latent = latents.squeeze().detach().cpu().numpy().astype(np.float32)
        return latent

    except Exception as e:
        print(f"[ERROR] GAN inversion failed: {e}")
        return None


###############################################
# 5) Construire dataset latent (X, y)
###############################################

def load_latents(fake_dir, real_dir):
    X, y = [], []

    print("[INFO] Processing REAL images...")
    for fname in os.listdir(real_dir):
        try:
            img = Image.open(os.path.join(real_dir, fname))
            latent = gan_invert(img)
            if latent is not None:
                X.append(latent)
                y.append(0)
        except:
            continue

    print("[INFO] Processing FAKE images...")
    for fname in os.listdir(fake_dir):
        try:
            img = Image.open(os.path.join(fake_dir, fname))
            latent = gan_invert(img)
            if latent is not None:
                X.append(latent)
                y.append(1)
        except:
            continue

    X = np.array(X)
    y = np.array(y)
    print("[OK] Latent dataset built:", X.shape, y.shape)
    return X, y

###############################################
# 6) Charger ou cr√©er latents
###############################################

LATENT_PATH = "latents.npy"
LABEL_PATH = "labels.npy"

if os.path.exists(LATENT_PATH) and os.path.exists(LABEL_PATH):
    X = np.load(LATENT_PATH)
    y = np.load(LABEL_PATH)
else:
    X, y = load_latents(FAKE_DIR, REAL_DIR)
    np.save(LATENT_PATH, X)
    np.save(LABEL_PATH, y)

###############################################
# 7) Standardisation
###############################################

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

###############################################
# 8) PCA plotting
###############################################

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=y, cmap="coolwarm", s=5)
plt.title("PCA - Latent Space")
plt.show()

###############################################
# 9) t-SNE
###############################################

X_pca50 = PCA(n_components=50).fit_transform(X_scaled)
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_pca50)

plt.figure(figsize=(7,5))
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y, cmap="coolwarm", s=5)
plt.title("t-SNE - Latent Space")
plt.show()

###############################################
# 10) KMeans clustering
###############################################

kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters, cmap="viridis", s=5)
plt.title("KMeans clustering")
plt.show()

###############################################
# 11) ML MODELS
###############################################

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

svm = SVC(kernel="rbf", probability=True)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)
print("SVM ACC:", accuracy_score(y_test, pred))

mlp = MLPClassifier(hidden_layer_sizes=(512,256,64), max_iter=300)
mlp.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=300)
rf.fit(X_train, y_train)

###############################################
# 12) PREDICTION
###############################################

def predict_image(path, model=svm):
    img = Image.open(path)
    latent = gan_invert(img)
    if latent is None:
        return "ERROR: no face detected"
    latent = scaler.transform([latent])
    return "FAKE" if model.predict(latent)[0] == 1 else "REAL"

print("\nPipeline complet ex√©cut√© ‚úî")


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


NameError: name 'Path' is not defined

In [10]:
# ==========================================================
# NOTEBOOK UNIQUE : Deepfake detection pipeline
# ==========================================================
# Requirements:
# pip install torch torchvision timm scikit-learn xgboost umap-learn matplotlib tqdm opencv-python

import os
import cv2
import numpy as np
import torch
import torchvision.transforms as T
import torchvision.models as models
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
import joblib
import json

# sklearn & others
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from scipy.spatial.distance import cdist

# ----------------------------------------------------------
# CONFIGURATION
# ----------------------------------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)

# Tes dossiers contenant des frames D√âJ√Ä extraites
path_real = r"C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\extracted_frames\real"
path_fake = r"C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\extracted_frames\fake"

# checkpoint e4e si disponible
ckpt_e4e = os.path.join('encoder4editing','pretrained_models','e4e_ffhq_encode.pt')

# dossier de sortie
output_dir = r"C:\Users\EliteLaptop\Desktop\kawtar\GAN_inversion\results_gan_pipeline"
os.makedirs(output_dir, exist_ok=True)

use_gan_inversion = True     # True = essayer e4e, sinon fallback ResNet50

# ----------------------------------------------------------
# CHARGEMENT DU MODELE : e4e OU RESNET50
# ----------------------------------------------------------
e4e_model = None
resnet_model = None

transform_resnet = T.Compose([
    T.ToPILImage(),
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# tentative e4e
def try_load_e4e(ckpt_path):
    try:
        from models.psp import pSp
        ckpt = torch.load(ckpt_path, map_location='cpu')
        opts = ckpt['opts']
        opts['checkpoint_path'] = ckpt_path
        opts = type('Options', (), opts)()
        net = pSp(opts).to(device).eval()

        def encode(img_bgr):
            img_rgb = img_bgr[:,:,::-1]
            tf = T.Compose([T.ToPILImage(), T.Resize((256,256)), T.ToTensor()])
            t = tf(img_rgb).unsqueeze(0).to(device)
            with torch.no_grad():
                out = net.encoder(t)
                return out.cpu().numpy().reshape(-1)
        print("e4e loaded successfully.")
        return encode
    except Exception as e:
        print("Erreur chargement e4e :", e)
        return None

def load_resnet_fallback():
    model = models.resnet50(pretrained=True)
    model.fc = torch.nn.Identity()
    model.to(device).eval()
    def encode(img_bgr):
        img_rgb = img_bgr[:,:,::-1]
        t = transform_resnet(img_rgb).unsqueeze(0).to(device)
        with torch.no_grad():
            return model(t).cpu().numpy().reshape(-1)
    return encode

if use_gan_inversion and os.path.exists(ckpt_e4e):
    e4e_model = try_load_e4e(ckpt_e4e)

encoder_fn = e4e_model if e4e_model is not None else load_resnet_fallback()
print("Using encoder:", "e4e" if e4e_model else "ResNet50")

# ----------------------------------------------------------
# 1) LECTURE DES IMAGES (supporte les sous-dossiers)
# ----------------------------------------------------------
def list_images(folder):
    exts = ('.jpg','.jpeg','.png','.bmp')
    all_files = []
    for root, dirs, files in os.walk(folder):
        for f in files:
            if f.lower().endswith(exts):
                all_files.append(os.path.join(root, f))
    return all_files

imgs_real = list_images(path_real)
imgs_fake = list_images(path_fake)

print("Found REAL:", len(imgs_real))
print("Found FAKE:", len(imgs_fake))

# ----------------------------------------------------------
# 2) EXTRACTION DES FEATURES / LATENTS
# ----------------------------------------------------------
def safe_imread(p):
    img = cv2.imread(p)
    if img is None:
        raise ValueError("Unable to read image " + p)
    return img

X_list = []
y_list = []

for p in tqdm(imgs_real, desc="Encoding REAL"):
    try:
        img = safe_imread(p)
        X_list.append(encoder_fn(img))
        y_list.append(0)
    except Exception as e:
        print("Error:", p, e)

for p in tqdm(imgs_fake, desc="Encoding FAKE"):
    try:
        img = safe_imread(p)
        X_list.append(encoder_fn(img))
        y_list.append(1)
    except Exception as e:
        print("Error:", p, e)

X = np.array(X_list)
y = np.array(y_list)

print("Latents shape:", X.shape)

np.save(os.path.join(output_dir, "latent_vectors.npy"), X)
np.save(os.path.join(output_dir, "labels.npy"), y)

# ----------------------------------------------------------
# 3) ANALYSE DE L‚ÄôESPACE LATENT
# ----------------------------------------------------------
results_summary = {}

# PCA
if X.shape[0] >= 2:
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    plt.scatter(X_pca[:,0], X_pca[:,1], c=y)
    plt.title("PCA 2D")
    plt.savefig(os.path.join(output_dir, "pca.png"))
    plt.close()

# t-SNE
if X.shape[0] >= 5:
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X)
    plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y)
    plt.title("t-SNE")
    plt.savefig(os.path.join(output_dir, "tsne.png"))
    plt.close()

# UMAP
if X.shape[0] >= 5:
    reducer = umap.UMAP(n_components=2, random_state=42)
    X_umap = reducer.fit_transform(X)
    plt.scatter(X_umap[:,0], X_umap[:,1], c=y)
    plt.title("UMAP")
    plt.savefig(os.path.join(output_dir, "umap.png"))
    plt.close()

# distances intra/inter
if len(np.unique(y)) == 2:
    X_real = X[y==0]
    X_fake = X[y==1]

    dist_real = np.mean(cdist(X_real, X_real)) if len(X_real)>=2 else None
    dist_fake = np.mean(cdist(X_fake, X_fake)) if len(X_fake)>=2 else None
    dist_inter = np.mean(cdist(X_real, X_fake)) if len(X_real)>=1 and len(X_fake)>=1 else None

    results_summary.update({
        'intra_real': dist_real,
        'intra_fake': dist_fake,
        'inter': dist_inter
    })

with open(os.path.join(output_dir, "latent_analysis.json"), "w") as f:
    json.dump(results_summary, f, indent=2)

# ----------------------------------------------------------
# 4) MACHINE LEARNING
# ----------------------------------------------------------
if X.shape[0] < 10:
    print("Pas assez d‚Äô√©chantillons pour ML !")
else:
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, stratify=y, random_state=42
    )

    models_results = {}

    # SVM
    svm = SVC(kernel='rbf')
    svm.fit(X_train, y_train)
    pred = svm.predict(X_test)
    models_results["svm"] = float(accuracy_score(y_test, pred))
    joblib.dump(svm, os.path.join(output_dir, "model_svm.joblib"))

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    models_results["rf"] = float(accuracy_score(y_test, pred))
    joblib.dump(rf, os.path.join(output_dir, "model_rf.joblib"))

    # MLP
    mlp = MLPClassifier(hidden_layer_sizes=(512,256), max_iter=500)
    mlp.fit(X_train, y_train)
    pred = mlp.predict(X_test)
    models_results["mlp"] = float(accuracy_score(y_test, pred))
    joblib.dump(mlp, os.path.join(output_dir, "model_mlp.joblib"))

    # XGBoost
    xgb = XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        eval_metric="logloss"
    )
    xgb.fit(X_train, y_train)
    pred = xgb.predict(X_test)
    models_results["xgb"] = float(accuracy_score(y_test, pred))
    joblib.dump(xgb, os.path.join(output_dir, "model_xgb.joblib"))

    joblib.dump(scaler, os.path.join(output_dir, "scaler.joblib"))

    with open(os.path.join(output_dir, "results_models.json"), "w") as f:
        json.dump(models_results, f, indent=2)

    print("\nAccuracy des mod√®les :")
    for m,a in models_results.items():
        print(f" - {m}: {a:.4f}")

# ----------------------------------------------------------
# FIN
# ----------------------------------------------------------
print("\nPipeline termin√© ! R√©sultats dans :", output_dir)


Device: cuda
Erreur chargement e4e : No module named 'models'




Using encoder: ResNet50
Found REAL: 790
Found FAKE: 3975


Encoding REAL:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 332/790 [00:09<00:13, 33.97it/s]


KeyboardInterrupt: 