In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.transforms.functional import to_pil_image
import cv2
from einops import repeat
from einops.layers.torch import Rearrange
import numpy as np
import json
from datetime import datetime
from torch.optim import lr_scheduler  # <-- added


# =============================
# Dataset
# =============================
class VideoFaceDataset(Dataset):
    """Dataset of pre-extracted video frames, one directory per video.

    Videos are looked up under ``<root_dir>/images/<split>/<category>/``.
    Every sub-directory found there is one video and every ``.jpg``/``.png``
    file inside it is one frame (ordered by sorted file name). Each item is a
    clip of ``num_frames`` frames sampled evenly across the video together
    with a class label: 0 for the ``"real"`` category, 1 for any other.
    """

    def __init__(self, root_dir, split="train", category="real", transform=None, num_frames=32):
        """Index the video directories of one split/category pair.

        Args:
            root_dir: Dataset root that contains the ``images`` directory.
            split: Sub-directory of ``images`` to read from.
            category: Sub-directory of the split. ``"real"`` maps to label 0,
                every other value to label 1.
            transform: Optional callable applied to each frame (a PIL image).
                Its results are passed to ``torch.stack``, so it has to
                return tensors of one common shape.
            num_frames: Number of frames sampled per video.
        """
        self.video_root = os.path.join(root_dir, "images", split, category)
        self.transform = transform
        self.num_frames = num_frames
        self.category = category

        # Sorted so that the index -> video mapping is reproducible.
        self.video_dirs = sorted(
            d for d in os.listdir(self.video_root)
            if os.path.isdir(os.path.join(self.video_root, d))
        )

        print(f"[{split}][{category}] Found {len(self.video_dirs)} videos")
        self.label_value = 0 if category == "real" else 1

    def __len__(self):
        """Return the number of video directories that were found."""
        return len(self.video_dirs)

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            ``(frames, label)`` where ``frames`` stacks ``num_frames``
            transformed frames along a new first axis and ``label`` is a
            scalar ``torch.long`` tensor.

        Raises:
            RuntimeError: The video directory holds no ``.jpg``/``.png`` file.
            ValueError: A frame file could not be decoded by OpenCV.
        """
        video_dir = os.path.join(self.video_root, self.video_dirs[idx])
        frame_files = sorted(
            f for f in os.listdir(video_dir)
            if f.lower().endswith((".jpg", ".png"))
        )
        if not frame_files:
            # Deliberately not an IndexError: inside __getitem__ that would be
            # taken as "end of dataset" by plain ``for item in dataset`` loops.
            raise RuntimeError(f"No .jpg/.png frames found in: {video_dir}")

        # linspace always yields exactly num_frames indices, so a video with
        # fewer frames than num_frames simply repeats some of its frames and
        # no padding is ever required.
        indices = torch.linspace(0, len(frame_files) - 1, steps=self.num_frames).long().tolist()

        frames = []
        for frame_idx in indices:
            frame_path = os.path.join(video_dir, frame_files[frame_idx])
            frame = cv2.imread(frame_path)
            if frame is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise ValueError(f"Could not read image: {frame_path}")
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = to_pil_image(frame)
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)

        frames = torch.stack(frames)  # (T, C, H, W)

        label_tensor = torch.tensor(self.label_value, dtype=torch.long)
        return frames, label_tensor


# =============================
# Model
# =============================
class PatchEmbedding(nn.Module):
    """Cut images into square patches and project every flattened patch linearly."""

    def __init__(self, in_channels=3, patch_size=8, emb_size=128):
        super().__init__()
        # Each patch is flattened to patch_size * patch_size * in_channels values.
        flat_patch_dim = patch_size * patch_size * in_channels
        to_patches = Rearrange(
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=patch_size,
            p2=patch_size,
        )
        to_embedding = nn.Linear(flat_patch_dim, emb_size)
        self.projection = nn.Sequential(to_patches, to_embedding)

    def forward(self, x):
        """Return one ``emb_size`` vector per patch of ``x``."""
        patch_embeddings = self.projection(x)
        return patch_embeddings


class Attention(nn.Module):
    """Self-attention: separate q/k/v linear layers feeding ``nn.MultiheadAttention``."""

    def __init__(self, dim, n_heads, dropout=0.):
        super().__init__()
        # Registration order (att, q, k, v) is kept as-is: it fixes the
        # parameter order seen by optimizers and the random-init sequence.
        self.att = nn.MultiheadAttention(embed_dim=dim, num_heads=n_heads, dropout=dropout)
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend ``x`` to itself and return a tensor laid out like ``x``."""
        # The first two axes are swapped before attention and swapped back
        # afterwards (self.att is constructed without batch_first).
        query, key, value = (
            proj(x).permute(1, 0, 2) for proj in (self.q, self.k, self.v)
        )
        attended, _ = self.att(query, key, value)
        return attended.permute(1, 0, 2)


class PreNorm(nn.Module):
    """Layer-normalise the input, then hand it to the wrapped callable ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(LayerNorm(x), **kwargs)``."""
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)


class FeedForward(nn.Sequential):
    """MLP block: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.):
        # Built in the same order as they run, so sub-module indices 0..4
        # (and therefore state_dict keys) are unchanged.
        stages = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        super().__init__(*stages)


class ResidualAdd(nn.Module):
    """Skip connection: add the wrapped callable's output to its own input."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(x, **kwargs) + x``."""
        branch_out = self.fn(x, **kwargs)
        return branch_out + x


class ViT(nn.Module):
    """Vision Transformer run on every frame of a clip, logits averaged over frames.

    Each frame is split into patches and embedded, a class token is
    prepended, positional embeddings are added, and ``n_layers`` blocks of
    pre-norm attention + feed-forward (both with residual connections) are
    applied. The class-token output goes through LayerNorm + Linear to give
    per-frame logits, which are then averaged across the frames of a clip.

    Args:
        ch: Number of image channels.
        img_size: Image side length; only used to size the positional
            embedding, which gets ``(img_size // patch_size) ** 2 + 1`` rows.
        patch_size: Side length of the square patches.
        emb_dim: Token embedding width.
        n_layers: Number of attention/feed-forward blocks.
        dropout: Dropout rate passed to the attention and feed-forward blocks.
        heads: Number of attention heads.
        out_dim: Number of output logits.
    """

    def __init__(self, ch=3, img_size=144, patch_size=16, emb_dim=64,
                 n_layers=4, dropout=0.1, heads=2, out_dim=2):
        super().__init__()
        self.patch_embedding = PatchEmbedding(ch, patch_size, emb_dim)
        num_patches = (img_size // patch_size) ** 2

        # One position per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, emb_dim))
        self.cls_token = nn.Parameter(torch.rand(1, 1, emb_dim))

        self.layers = nn.ModuleList([
            nn.Sequential(
                ResidualAdd(PreNorm(emb_dim, Attention(emb_dim, heads, dropout))),
                ResidualAdd(PreNorm(emb_dim, FeedForward(emb_dim, emb_dim * 2, dropout)))
            ) for _ in range(n_layers)
        ])

        self.head = nn.Sequential(
            nn.LayerNorm(emb_dim),
            nn.Linear(emb_dim, out_dim),
        )

    def forward(self, imgs):
        """Classify a batch of clips.

        Args:
            imgs: Tensor of shape (B, T, C, H, W).

        Returns:
            Tensor of shape (B, out_dim): per-frame logits averaged over T.
        """
        # Descriptive names instead of a local ``T``, which would shadow the
        # ``torchvision.transforms as T`` alias used elsewhere in this file.
        batch_size, num_frames, channels, height, width = imgs.shape
        # reshape (not view) so non-contiguous inputs, e.g. the result of a
        # permute, are accepted too; contiguous inputs are still not copied.
        frames = imgs.reshape(batch_size * num_frames, channels, height, width)

        x = self.patch_embedding(frames)  # (B*T, n_patches, emb_dim)
        b, n, _ = x.shape
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b=b)
        x = torch.cat([cls_tokens, x], dim=1)
        x = x + self.pos_embedding[:, :(n + 1)]
        for layer in self.layers:
            x = layer(x)
        out = self.head(x[:, 0, :])  # class token only -> (B*T, out_dim)

        return out.reshape(batch_size, num_frames, -1).mean(dim=1)


# =============================
# Collate
# =============================
def custom_collate(batch):
    """Merge ``(frames, label)`` samples into one batch.

    Returns the frame tensors stacked along a new leading axis and the labels
    gathered into a 1-D ``torch.long`` tensor.
    """
    clips = []
    targets = []
    for sample in batch:
        clips.append(sample[0])
        targets.append(sample[1])

    frames = torch.stack(clips, dim=0)  # (B, T, C, H, W)
    labels = torch.tensor(targets, dtype=torch.long)
    return frames, labels


# =============================
# Model Saving Functions
# =============================
def save_model(model, optimizer, epoch, train_loss, val_loss, save_dir="./saved_models1adamw",
               model_config=None, scheduler=None):
    """
    Save the model, optimizer state, and training information

    Two files are written to ``save_dir`` (created if missing), both tagged
    with the 1-based epoch number and a timestamp:

    * ``vit_model_epoch_*.pth``   - the model ``state_dict`` only
    * ``checkpoint_epoch_*.pth``  - model and optimizer state, losses,
      epoch number and the model configuration

    Args:
        model: Module whose ``state_dict`` is saved.
        optimizer: Optimizer whose ``state_dict`` is saved.
        epoch: Zero-based epoch index; stored and named as ``epoch + 1``.
        train_loss: Training loss recorded in the checkpoint.
        val_loss: Validation loss recorded in the checkpoint.
        save_dir: Output directory.
        model_config: Constructor arguments of ``model``. Defaults to the
            configuration used by this script, so existing callers are
            unaffected; pass it explicitly for any other architecture so the
            checkpoint describes the model that was actually trained.
        scheduler: Optional LR scheduler; when given, its state is stored
            under ``'scheduler_state_dict'`` so that a resumed run can restore
            the learning-rate schedule.

    Returns:
        ``(model_path, checkpoint_path)``
    """
    if model_config is None:
        model_config = {
            'ch': 3,
            'img_size': 144,
            'patch_size': 16,
            'emb_dim': 64,
            'n_layers': 4,
            'dropout': 0.1,
            'heads': 2,
            'out_dim': 2
        }

    os.makedirs(save_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    model_path = os.path.join(save_dir, f"vit_model_epoch_{epoch+1}_{timestamp}.pth")
    checkpoint_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}_{timestamp}.pth")

    model_state = model.state_dict()
    torch.save(model_state, model_path)

    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model_state,
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss,
        # Copied so later changes to the caller's dict cannot leak in.
        'model_config': dict(model_config),
    }
    if scheduler is not None:
        checkpoint['scheduler_state_dict'] = scheduler.state_dict()
    torch.save(checkpoint, checkpoint_path)

    print(f"Model saved to: {model_path}")
    print(f"Checkpoint saved to: {checkpoint_path}")

    return model_path, checkpoint_path


def load_model(model_path, model_config=None):
    """
    Load a saved model for inference

    Accepts either a weights-only file (a bare ``state_dict``) or a full
    checkpoint dict that stores the weights under ``'model_state_dict'``.

    Args:
        model_path: Path to the ``.pth`` file.
        model_config: Keyword arguments for ``ViT``. When omitted, the
            ``'model_config'`` stored in a full checkpoint is used if there is
            one, otherwise this script's default configuration.

    Returns:
        The model in eval mode, with its weights on the CPU.
    """
    # map_location='cpu' lets weights saved on a GPU load on a machine
    # without CUDA; the caller moves the model to its device afterwards.
    # NOTE: torch.load unpickles the file - only load files you trust.
    checkpoint = torch.load(model_path, map_location='cpu')
    is_full_checkpoint = 'model_state_dict' in checkpoint

    if model_config is None:
        stored_config = checkpoint.get('model_config') if is_full_checkpoint else None
        if stored_config is not None:
            model_config = stored_config
        else:
            model_config = {
                'ch': 3,
                'img_size': 144,
                'patch_size': 16,
                'emb_dim': 64,
                'n_layers': 4,
                'dropout': 0.1,
                'heads': 2,
                'out_dim': 2
            }

    model = ViT(**model_config)
    state_dict = checkpoint['model_state_dict'] if is_full_checkpoint else checkpoint
    model.load_state_dict(state_dict)
    model.eval()
    return model


def load_checkpoint(checkpoint_path):
    """
    Load a checkpoint for resuming training

    The model is rebuilt from the checkpoint's ``'model_config'`` and its
    weights restored from ``'model_state_dict'``. Neither ``train()`` nor
    ``eval()`` is called here, and the optimizer is not restored - the caller
    does that from the returned checkpoint dict.

    Args:
        checkpoint_path: Path to a checkpoint file holding at least
            ``'model_config'`` and ``'model_state_dict'``.

    Returns:
        ``(model, checkpoint)`` - the restored model and the raw checkpoint dict.
    """
    # map_location='cpu' lets a checkpoint written on a GPU be opened on a
    # machine without CUDA; move the model to the target device afterwards.
    # NOTE: torch.load unpickles the file - only load files you trust.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    model = ViT(**checkpoint['model_config'])
    model.load_state_dict(checkpoint['model_state_dict'])

    return model, checkpoint


# =============================
# Training
# =============================
# Every frame is resized to 144x144 (the img_size default of the ViT built
# below) and converted to a tensor.
transform = T.Compose([
    T.Resize((144, 144)),
    T.ToTensor(),
])

root_dir = "D://projectdeepfake"

# One dataset per (split, category) pair; "real" clips get label 0 and
# "attack" clips label 1.
train_real_dataset = VideoFaceDataset(root_dir, split="train", category="real", transform=transform)
train_attack_dataset = VideoFaceDataset(root_dir, split="train", category="attack", transform=transform)
devel_real_dataset = VideoFaceDataset(root_dir, split="devel", category="real", transform=transform)
devel_attack_dataset = VideoFaceDataset(root_dir, split="devel", category="attack", transform=transform)
test_real_dataset = VideoFaceDataset(root_dir, split="test", category="real", transform=transform)
test_attack_dataset = VideoFaceDataset(root_dir, split="test", category="attack", transform=transform)

# Each split is the concatenation of its real and attack clips.
train_dataset = ConcatDataset([train_real_dataset, train_attack_dataset])
devel_dataset = ConcatDataset([devel_real_dataset, devel_attack_dataset])
test_dataset = ConcatDataset([test_real_dataset, test_attack_dataset])

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=6, shuffle=True, collate_fn=custom_collate)
devel_loader = DataLoader(devel_dataset, batch_size=6, shuffle=False, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=6, shuffle=False, collate_fn=custom_collate)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViT(out_dim=2).to(device)

optimizer = optim.AdamW(model.parameters(), lr=0.0003)
criterion = nn.CrossEntropyLoss()

# Gradient scaling is only needed for CUDA mixed precision. Tying `enabled`
# to the device turns the scaler into a plain pass-through on CPU-only
# machines instead of relying on it to warn and disable itself.
scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))

# =============================
# Learning Rate Scheduler
# =============================
num_epochs = 20

# Cosine annealing of the learning rate: T_max is set to the number of
# epochs and eta_min is the lowest learning rate of the schedule.
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)

# Alternative schedule (use instead of the cosine schedule above):
#   scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

# Per-epoch metrics collected during training and written to JSON afterwards.
training_history = {key: [] for key in ('train_losses', 'val_losses', 'epochs')}

# Lowest validation loss seen so far and the weights file that produced it.
best_model_path = None
best_val_loss = float('inf')

# Mixed precision is only requested when running on CUDA; on a CPU-only
# machine the forward passes run in full precision.
use_amp = device.type == "cuda"

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    train_correct = 0
    train_total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        # The scaler scales the loss before backward and performs the
        # optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        running_loss += loss.item()

        # Training accuracy
        predicted = outputs.detach().argmax(dim=1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

        torch.cuda.empty_cache()

    # Mean of the per-batch losses.
    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * train_correct / train_total

    # ---- Validation pass ----
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in devel_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            val_loss += loss.item()

            predicted = outputs.argmax(dim=1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(devel_loader)
    val_accuracy = 100 * val_correct / val_total

    # Read the learning rate BEFORE stepping the scheduler, so the value
    # logged for this epoch is the one the epoch was actually trained with
    # (after step() it would already be the next epoch's rate).
    current_lr = scheduler.get_last_lr()[0]

    # Step scheduler once per epoch after optimizer has been updated
    scheduler.step()

    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"LR: {current_lr:.6f}, "
        f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
        f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%"
    )

    # Save training history
    training_history['train_losses'].append(avg_train_loss)
    training_history['val_losses'].append(avg_val_loss)
    training_history['epochs'].append(epoch + 1)

    # Save model checkpoint every epoch
    model_path, checkpoint_path = save_model(model, optimizer, epoch, avg_train_loss, avg_val_loss)

    # Remember which of the per-epoch weight files has the lowest validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_path = model_path
        print(f"New best model saved with validation loss: {best_val_loss:.4f}")

# Write the collected per-epoch metrics to a JSON file.
history_path = "./saved_models/training_history.json"
os.makedirs(os.path.dirname(history_path), exist_ok=True)
with open(history_path, 'w') as f:
    f.write(json.dumps(training_history, indent=4))
print(f"Training history saved to: {history_path}")

# =============================
# Testing
# =============================
# Evaluate the model as it stands after the last training epoch.
model.eval()
test_loss = 0.0
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.cuda.amp.autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Index of the largest logit is the predicted class.
        _, predicted = outputs.data.max(1)
        total_predictions += labels.size(0)
        correct_predictions += predicted.eq(labels).sum().item()

# Mean of the per-batch losses, and accuracy in percent.
avg_test_loss = test_loss / len(test_loader)
test_accuracy = 100 * correct_predictions / total_predictions

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.2f}%")

# Save final model with test results
final_model_path = "./saved_models/final_model.pth"
# Normally created already when the training history is written; repeated
# here so this step does not depend on that one having run.
os.makedirs(os.path.dirname(final_model_path), exist_ok=True)
final_checkpoint = {
    # Stored so that code resuming from this file can read the epoch count
    # (the usage notes below read checkpoint['epoch']).
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'training_history': training_history,
    'test_loss': avg_test_loss,
    'test_accuracy': test_accuracy,
    'best_val_loss': best_val_loss,
    'model_config': {
        'ch': 3,
        'img_size': 144,
        'patch_size': 16,
        'emb_dim': 64,
        'n_layers': 4,
        'dropout': 0.1,
        'heads': 2,
        'out_dim': 2
    }
}

torch.save(final_checkpoint, final_model_path)
print(f"Final model with complete training info saved to: {final_model_path}")

# =============================
# Example: How to load the saved model for inference
# =============================
# final_model.pth is a full checkpoint dict, not a bare state_dict. For
# inference, give load_model one of the weights-only vit_model_epoch_*.pth
# files written by save_model, e.g. the best one:
#
#     loaded_model = load_model(best_model_path)
#     loaded_model = loaded_model.to(device)
#
# To resume training, give load_checkpoint a checkpoint file
# (checkpoint_epoch_*.pth or final_model.pth):
#
#     loaded_model, checkpoint = load_checkpoint("./saved_models/final_model.pth")
#     loaded_model = loaded_model.to(device)
#     optimizer = optim.AdamW(loaded_model.parameters(), lr=0.0003)
#     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#     start_epoch = checkpoint.get('epoch', 0)

print(f"\nBest model saved at: {best_model_path}")
print(f"Best validation loss: {best_val_loss:.4f}")
print("Training completed successfully!")


[train][real] Found 30 videos
[train][attack] Found 300 videos
[devel][real] Found 60 videos
[devel][attack] Found 300 videos
[test][real] Found 80 videos
[test][attack] Found 400 videos




In [None]:
# Inference on single images

In [2]:
import os
import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms.functional import to_pil_image
import cv2
from PIL import Image

# ---------------------------
# Load Model
# ---------------------------
def load_model(model_path, model_config=None):
    """Build a ViT and restore its weights from ``model_path`` for inference.

    Accepts either a weights-only file (a bare ``state_dict``) or a full
    checkpoint dict that stores the weights under ``'model_state_dict'``.

    Args:
        model_path: Path to the ``.pth`` file.
        model_config: Keyword arguments for ``ViT``. When omitted, the
            ``'model_config'`` stored in a full checkpoint is used if there is
            one, otherwise the default configuration below.

    Returns:
        The model in eval mode, with its weights on the CPU.
    """
    # NOTE: torch.load unpickles the file - only load files you trust.
    checkpoint = torch.load(model_path, map_location='cpu')
    is_full_checkpoint = 'model_state_dict' in checkpoint

    if model_config is None:
        # Prefer the architecture recorded with the weights, so checkpoints of
        # a non-default model load without repeating the config by hand.
        stored_config = checkpoint.get('model_config') if is_full_checkpoint else None
        if stored_config is not None:
            model_config = stored_config
        else:
            model_config = {
                'ch': 3, 'img_size': 144, 'patch_size': 16, 'emb_dim': 64,
                'n_layers': 4, 'dropout': 0.1, 'heads': 2, 'out_dim': 2
            }

    model = ViT(**model_config)  # Make sure ViT class is defined/imported
    if is_full_checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint)
    model.eval()
    return model

# ---------------------------
# Predict Single Image
# ---------------------------
def predict_image(model, image_path, transform, device):
    """
    Classify one image file as REAL (class 0) or FAKE (class 1).

    Returns ``(label, confidence, probs)``: the label string, the winning
    class probability in percent, and the per-class probabilities of the
    image as a NumPy array.

    Raises FileNotFoundError when ``image_path`` does not exist and
    ValueError when OpenCV cannot decode the file.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # OpenCV decodes to BGR; convert to RGB before building the PIL image.
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")
    pil_img = to_pil_image(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

    # The model takes clips, so add a batch axis and a frame axis in front:
    # a (C, H, W) tensor from the transform becomes (1, 1, C, H, W).
    frame = transform(pil_img)
    img_tensor = frame.unsqueeze(0).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(img_tensor)
        probs = torch.softmax(logits, dim=1)
        confidence, pred = probs.max(1)

    predicted_class = pred.item()
    label = "REAL" if predicted_class != 1 else "FAKE"
    return label, confidence.item() * 100, probs.cpu().numpy()[0]

# ---------------------------
# Main Execution
# ---------------------------
# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Restore the trained weights and move the network to the chosen device.
# (ViT has to be defined in the running session before this is executed.)
model_path = "D://projectdeepfake//vit_model_epoch_20_20251212_110939.pth"
model = load_model(model_path).to(device)
print("Model loaded successfully!")

# Preprocessing for the image: resize to 144x144, convert to a tensor.
transform = T.Compose([T.Resize((144, 144)), T.ToTensor()])

# Classify one frame and report the result.
image_path = "D://projectdeepfake//images//devel//attack//attack_highdef_client005_session01_highdef_photo_adverse//00012.jpg"
label, confidence, probs = predict_image(model, image_path, transform, device)

print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")
print(f"Real probability: {probs[0]:.4f}, Fake probability: {probs[1]:.4f}")

Using device: cpu


NameError: name 'ViT' is not defined

In [3]:
import os
import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms.functional import to_pil_image
import cv2
from einops import repeat
from einops.layers.torch import Rearrange
import numpy as np

# =============================
# COMPLETE MODEL DEFINITION (copy from training script)
# =============================
class PatchEmbedding(nn.Module):
    """Cut images into square patches and project every flattened patch linearly."""

    def __init__(self, in_channels=3, patch_size=8, emb_size=128):
        super().__init__()
        # Each patch is flattened to patch_size * patch_size * in_channels values.
        flat_patch_dim = patch_size * patch_size * in_channels
        to_patches = Rearrange(
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=patch_size,
            p2=patch_size,
        )
        to_embedding = nn.Linear(flat_patch_dim, emb_size)
        self.projection = nn.Sequential(to_patches, to_embedding)

    def forward(self, x):
        """Return one ``emb_size`` vector per patch of ``x``."""
        patch_embeddings = self.projection(x)
        return patch_embeddings

class Attention(nn.Module):
    """Self-attention: separate q/k/v linear layers feeding ``nn.MultiheadAttention``."""

    def __init__(self, dim, n_heads, dropout=0.):
        super().__init__()
        # Registration order (att, q, k, v) is kept as-is: it fixes the
        # parameter order seen by optimizers and the random-init sequence.
        self.att = nn.MultiheadAttention(embed_dim=dim, num_heads=n_heads, dropout=dropout)
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend ``x`` to itself and return a tensor laid out like ``x``."""
        # The first two axes are swapped before attention and swapped back
        # afterwards (self.att is constructed without batch_first).
        query, key, value = (
            proj(x).permute(1, 0, 2) for proj in (self.q, self.k, self.v)
        )
        attended, _ = self.att(query, key, value)
        return attended.permute(1, 0, 2)

class PreNorm(nn.Module):
    """Layer-normalise the input, then hand it to the wrapped callable ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(LayerNorm(x), **kwargs)``."""
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)

class FeedForward(nn.Sequential):
    """MLP block: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.):
        # Built in the same order as they run, so sub-module indices 0..4
        # (and therefore state_dict keys) are unchanged.
        stages = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        super().__init__(*stages)

class ResidualAdd(nn.Module):
    """Skip connection: add the wrapped callable's output to its own input."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(x, **kwargs) + x``."""
        branch_out = self.fn(x, **kwargs)
        return branch_out + x

class ViT(nn.Module):
    """Vision Transformer run on every frame of a clip, logits averaged over frames.

    Each frame is split into patches and embedded, a class token is
    prepended, positional embeddings are added, and ``n_layers`` blocks of
    pre-norm attention + feed-forward (both with residual connections) are
    applied. The class-token output goes through LayerNorm + Linear to give
    per-frame logits, which are then averaged across the frames of a clip.

    Args:
        ch: Number of image channels.
        img_size: Image side length; only used to size the positional
            embedding, which gets ``(img_size // patch_size) ** 2 + 1`` rows.
        patch_size: Side length of the square patches.
        emb_dim: Token embedding width.
        n_layers: Number of attention/feed-forward blocks.
        dropout: Dropout rate passed to the attention and feed-forward blocks.
        heads: Number of attention heads.
        out_dim: Number of output logits.
    """

    def __init__(self, ch=3, img_size=144, patch_size=16, emb_dim=64,
                 n_layers=4, dropout=0.1, heads=2, out_dim=2):
        super().__init__()
        self.patch_embedding = PatchEmbedding(ch, patch_size, emb_dim)
        num_patches = (img_size // patch_size) ** 2

        # One position per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, emb_dim))
        self.cls_token = nn.Parameter(torch.rand(1, 1, emb_dim))

        self.layers = nn.ModuleList([
            nn.Sequential(
                ResidualAdd(PreNorm(emb_dim, Attention(emb_dim, heads, dropout))),
                ResidualAdd(PreNorm(emb_dim, FeedForward(emb_dim, emb_dim * 2, dropout)))
            ) for _ in range(n_layers)
        ])

        self.head = nn.Sequential(
            nn.LayerNorm(emb_dim),
            nn.Linear(emb_dim, out_dim),
        )

    def forward(self, imgs):
        """Classify a batch of clips.

        Args:
            imgs: Tensor of shape (B, T, C, H, W).

        Returns:
            Tensor of shape (B, out_dim): per-frame logits averaged over T.
        """
        # Descriptive names instead of a local ``T``, which would shadow the
        # ``torchvision.transforms as T`` alias used elsewhere in this file.
        batch_size, num_frames, channels, height, width = imgs.shape
        # reshape (not view) so non-contiguous inputs, e.g. the result of a
        # permute, are accepted too; contiguous inputs are still not copied.
        frames = imgs.reshape(batch_size * num_frames, channels, height, width)

        x = self.patch_embedding(frames)  # (B*T, n_patches, emb_dim)
        b, n, _ = x.shape
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b=b)
        x = torch.cat([cls_tokens, x], dim=1)
        x = x + self.pos_embedding[:, :(n + 1)]
        for layer in self.layers:
            x = layer(x)
        out = self.head(x[:, 0, :])  # class token only -> (B*T, out_dim)

        return out.reshape(batch_size, num_frames, -1).mean(dim=1)

# ---------------------------
# Load Model
# ---------------------------
def load_model(model_path, model_config=None):
    """Build a ViT and restore its weights from ``model_path`` for inference.

    Accepts either a weights-only file (a bare ``state_dict``) or a full
    checkpoint dict that stores the weights under ``'model_state_dict'``.

    Args:
        model_path: Path to the ``.pth`` file.
        model_config: Keyword arguments for ``ViT``. When omitted, the
            ``'model_config'`` stored in a full checkpoint is used if there is
            one, otherwise the default configuration below.

    Returns:
        The model in eval mode, with its weights on the CPU.
    """
    # NOTE: torch.load unpickles the file - only load files you trust.
    checkpoint = torch.load(model_path, map_location='cpu')
    is_full_checkpoint = 'model_state_dict' in checkpoint

    if model_config is None:
        # Prefer the architecture recorded with the weights, so checkpoints of
        # a non-default model load without repeating the config by hand.
        stored_config = checkpoint.get('model_config') if is_full_checkpoint else None
        if stored_config is not None:
            model_config = stored_config
        else:
            model_config = {
                'ch': 3, 'img_size': 144, 'patch_size': 16, 'emb_dim': 64,
                'n_layers': 4, 'dropout': 0.1, 'heads': 2, 'out_dim': 2
            }

    model = ViT(**model_config)
    if is_full_checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint)
    model.eval()
    return model

# ---------------------------
# Predict Single Image
# ---------------------------
def predict_image(model, image_path, transform, device):
    """
    Classify one image file as REAL (class 0) or FAKE (class 1).

    Returns ``(label, confidence, probs)``: the label string, the winning
    class probability in percent, and the per-class probabilities of the
    image as a NumPy array.

    Raises FileNotFoundError when ``image_path`` does not exist and
    ValueError when OpenCV cannot decode the file.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    # OpenCV decodes to BGR; convert to RGB before building the PIL image.
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")
    pil_img = to_pil_image(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

    # The model takes clips, so add a batch axis and a frame axis in front:
    # a (C, H, W) tensor from the transform becomes (1, 1, C, H, W).
    frame = transform(pil_img)
    img_tensor = frame.unsqueeze(0).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(img_tensor)
        probs = torch.softmax(logits, dim=1)
        confidence, pred = probs.max(1)

    predicted_class = pred.item()
    label = "REAL" if predicted_class != 1 else "FAKE"
    return label, confidence.item() * 100, probs.cpu().numpy()[0]

# ---------------------------
# Main Execution
# ---------------------------
if __name__ == "__main__":
    # Use the GPU when one is available, otherwise the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    # Restore the trained weights and move the network to the chosen device.
    model_path = "D://projectdeepfake//vit_model_epoch_20_20251212_110939.pth"
    model = load_model(model_path).to(device)
    print("Model loaded successfully!")

    # Preprocessing for the image: resize to 144x144, convert to a tensor.
    transform = T.Compose([T.Resize((144, 144)), T.ToTensor()])

    # Classify one frame and report the result.
    image_path = "D://projectdeepfake//images//devel//attack//attack_highdef_client005_session01_highdef_photo_adverse//00012.jpg"
    label, confidence, probs = predict_image(model, image_path, transform, device)

    print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")
    print(f"Real probability: {probs[0]:.4f}, Fake probability: {probs[1]:.4f}")


Using device: cpu
Model loaded successfully!
Prediction: FAKE (Confidence: 99.93%)
Real probability: 0.0007, Fake probability: 0.9993
