<a href="https://colab.research.google.com/github/hongjinkong/opensw/blob/main/1212/deepfakedetection_autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the dfdc_deepfake_challenge repository into the current working directory.
!git clone https://github.com/mystlee/dfdc_deepfake_challenge.git

Cloning into 'dfdc_deepfake_challenge'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 134 (delta 23), reused 4 (delta 4), pack-reused 65 (from 1)[K
Receiving objects: 100% (134/134), 69.20 MiB | 10.66 MiB/s, done.
Resolving deltas: 100% (40/40), done.
Updating files: 100% (63/63), done.


In [None]:
from google.colab import files
# Select 'kaggle.json' in the upload dialog. The result is assigned to a variable so
# that the cell does not echo the uploaded bytes (which contain the Kaggle API key)
# into the saved notebook output.
uploaded_files = files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
import os

# Copy the Kaggle API credentials file to a location the Kaggle CLI can use in Colab
# (assumes kaggle.json is present in the current working directory — e.g. uploaded by an earlier cell).
os.makedirs('/root/.kaggle', exist_ok=True)
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json  # set permissions: owner read/write only


In [None]:
# Download the manjilkarki/deepfake-and-real-images dataset archive with the Kaggle CLI.
# NOTE(review): --force presumably re-downloads even when a local copy exists — confirm against the Kaggle CLI docs.
!kaggle datasets download -d manjilkarki/deepfake-and-real-images --force


Dataset URL: https://www.kaggle.com/datasets/manjilkarki/deepfake-and-real-images
License(s): unknown
Downloading deepfake-and-real-images.zip to /content
 98% 1.66G/1.68G [00:16<00:00, 102MB/s]
100% 1.68G/1.68G [00:16<00:00, 112MB/s]


In [None]:
!unzip deepfake-and-real-images.zip -d /content/deepfake_and_real_images


In [None]:
import os

dataset_dir = '/content/deepfake_and_real_images'  # dataset path
# Display the top-level entries of the dataset directory as the cell output.
os.listdir(dataset_dir)


['Dataset']

In [None]:
import os
import json
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from google.colab import drive
import zipfile
import matplotlib.pyplot as plt

# 1. Mount Google Drive and prepare the dataset
def setup_dataset_from_drive(drive_path, local_extract_path):
    """Mount Google Drive and unpack a ZIP archive stored there into a local directory.

    Args:
        drive_path: Path of the ZIP file relative to '/content/drive/My Drive'.
        local_extract_path: Directory the archive is extracted into; it is
            created when it does not exist yet.

    Returns:
        ``local_extract_path``, unchanged, so the caller can chain on it.
    """
    # Mount Google Drive at the fixed Colab mount point.
    drive.mount('/content/drive')
    archive_path = os.path.join('/content/drive/My Drive', drive_path)

    # Make sure the extraction target exists before unpacking.
    target_missing = not os.path.exists(local_extract_path)
    if target_missing:
        os.makedirs(local_extract_path)

    # Unpack every member of the archive into the target directory.
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(local_extract_path)

    print(f"Dataset extracted to {local_extract_path}")
    return local_extract_path

# 2. Extract and label frames based on metadata.json
def extract_frames_with_metadata(video_dir, metadata_path, output_base_dir, frame_rate=5):
    """Sample frames from every video listed in a metadata file into per-label folders.

    The metadata JSON maps a video file name to an attribute dict that has a
    'label' key. Videos labelled 'FAKE' go to ``<output_base_dir>/fake``; any
    other label goes to ``<output_base_dir>/real``. Every ``frame_rate``-th
    decoded frame is written as ``<video_file>_frame_<n>.jpg``.

    Args:
        video_dir: Directory that contains the video files named in the metadata.
        metadata_path: Path to the metadata JSON file.
        output_base_dir: Root directory for the 'fake' / 'real' frame folders.
        frame_rate: Sampling step; one frame is kept out of every ``frame_rate``.

    Returns:
        None. Progress and warnings are printed.
    """
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    for video_file, attributes in metadata.items():
        label = 'fake' if attributes['label'] == 'FAKE' else 'real'
        video_path = os.path.join(video_dir, video_file)
        output_label_dir = os.path.join(output_base_dir, label)
        os.makedirs(output_label_dir, exist_ok=True)

        if not os.path.exists(video_path):
            print(f"Warning: {video_path} does not exist.")
            continue

        cap = cv2.VideoCapture(video_path)
        try:
            # Surface videos that exist on disk but cannot be decoded instead of
            # skipping them silently.
            if not cap.isOpened():
                print(f"Warning: could not open {video_path}.")
                continue

            count = 0
            frame_id = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if count % frame_rate == 0:  # keep one frame every `frame_rate` frames
                    frame_path = os.path.join(output_label_dir, f"{video_file}_frame_{frame_id}.jpg")
                    # cv2.imwrite reports failure through its return value.
                    if not cv2.imwrite(frame_path, frame):
                        print(f"Warning: failed to write {frame_path}.")
                    frame_id += 1
                count += 1
        finally:
            # Always free the decoder handle, even if reading or writing raised.
            cap.release()
    print(f"Frames extracted to {output_base_dir}")

# 3. Load data
def load_data_from_frames(base_dir):
    """Build training and validation image generators from a class-per-folder directory.

    Training images are rescaled to [0, 1] and randomly augmented. Validation
    images are only rescaled, so validation metrics are measured on
    un-augmented data. Both generators use the same ``validation_split`` so
    they partition the files consistently.

    Class indices follow Keras' default alphanumeric ordering of the
    sub-directory names (e.g. 'fake' -> 0, 'real' -> 1).

    Args:
        base_dir: Directory containing one sub-directory per class.

    Returns:
        Tuple ``(train_generator, val_generator)`` yielding 128x128 image
        batches of size 32 with one-hot ('categorical') labels.
    """
    validation_fraction = 0.2  # Train/Validation split

    # Arguments shared by both subsets, kept in one place so they cannot drift apart.
    flow_options = dict(
        target_size=(128, 128),
        batch_size=32,
        class_mode='categorical',
    )

    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        validation_split=validation_fraction,
    )
    # No augmentation here: validation should see the images as they are.
    val_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=validation_fraction,
    )

    train_generator = train_datagen.flow_from_directory(
        base_dir,
        subset='training',
        **flow_options
    )
    val_generator = val_datagen.flow_from_directory(
        base_dir,
        subset='validation',
        **flow_options
    )
    return train_generator, val_generator

# 4. Model definition
def build_classifier(input_shape):
    """Create the (uncompiled) CNN used to classify frames into two classes.

    The network stacks three Conv2D(3x3, relu, same) + MaxPooling2D(2x2) stages
    with 32, 64 and 128 filters, followed by Flatten, Dense(64, relu),
    Dropout(0.5) and a 2-unit softmax output.

    Args:
        input_shape: Shape of a single input image, passed to ``tf.keras.Input``.

    Returns:
        A ``Model`` mapping the input tensor to the 2-way softmax output.
    """
    inputs = tf.keras.Input(shape=input_shape)

    # Convolutional feature extractor: the filter count doubles at each stage.
    features = inputs
    for n_filters in (32, 64, 128):
        features = layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same')(features)
        features = layers.MaxPooling2D((2, 2))(features)

    # Classification head.
    hidden = layers.Flatten()(features)
    hidden = layers.Dense(64, activation='relu')(hidden)
    hidden = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(2, activation='softmax')(hidden)

    return Model(inputs, outputs)

# 5. Frame extraction and classification for a new video
def extract_frames(video_path, output_dir="temp_frames", frame_rate=5):
    """Read a video and return every ``frame_rate``-th frame resized to 128x128.

    Args:
        video_path: Path of the video to decode.
        output_dir: Directory that is created if missing; this function does
            not write any frames into it.
        frame_rate: Sampling step; one frame is kept out of every ``frame_rate``.

    Returns:
        ``np.array`` built from the list of sampled, resized frames (empty when
        no frame could be read).
    """
    os.makedirs(output_dir, exist_ok=True)
    capture = cv2.VideoCapture(video_path)

    sampled = []
    index = 0
    while True:
        if not capture.isOpened():
            break
        ok, image = capture.read()
        if not ok:
            break
        keep = (index % frame_rate == 0)
        if keep:
            sampled.append(cv2.resize(image, (128, 128)))
        index += 1

    capture.release()
    return np.array(sampled)

def predict_video(model, video_path, frame_rate=5, fake_class_index=0):
    """Classify a video as "Deepfake" or "Real" from the mean per-frame fake probability.

    Args:
        model: Trained classifier exposing ``predict`` and returning one
            probability column per class.
        video_path: Path of the video to classify.
        frame_rate: Sampling step forwarded to ``extract_frames``.
        fake_class_index: Column of the model output that holds the "fake"
            probability. Defaults to 0 because ``flow_from_directory`` assigns
            class indices in alphanumeric order of the folder names, so
            'fake' -> 0 and 'real' -> 1.

    Returns:
        Tuple ``(result, avg_fake_probability)`` where ``result`` is
        "Deepfake" or "Real".

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    frames = extract_frames(video_path, frame_rate=frame_rate)
    if len(frames) == 0:
        raise ValueError(f"No frames could be read from {video_path}")

    # OpenCV decodes frames as BGR, while the training images were loaded as RGB;
    # reverse the channel axis so inference matches training, then normalize.
    frames = frames[..., ::-1] / 255.0

    predictions = model.predict(frames)
    fake_probabilities = predictions[:, fake_class_index]  # probability of the fake class
    avg_fake_probability = np.mean(fake_probabilities)

    if avg_fake_probability > 0.5:
        result = "Deepfake"
    else:
        result = "Real"

    print(f"Video Prediction: {result} (Average Fake Probability: {avg_fake_probability:.2f})")
    return result, avg_fake_probability

# 6. Run
# End-to-end driver: unpack the dataset from Google Drive, extract labelled frames,
# train the classifier, then classify one test video.
if __name__ == '__main__':
    # Prepare the dataset from Google Drive
    drive_path = 'dataset/dfdc_train_part_00.zip'  # path of the ZIP file inside Google Drive
    extract_path = './train_data'  # local directory to extract into
    dataset_path = setup_dataset_from_drive(drive_path, extract_path)

    # Path to metadata.json
    # NOTE(review): assumes metadata.json and the videos sit directly in the extraction root — confirm archive layout.
    metadata_path = os.path.join(dataset_path, 'metadata.json')

    # Extract frames
    output_frames_dir = './train_frames'
    extract_frames_with_metadata(dataset_path, metadata_path, output_frames_dir, frame_rate=5)

    # Load data
    train_gen, val_gen = load_data_from_frames(output_frames_dir)

    # Build and compile the model
    input_shape = (128, 128, 3)
    model = build_classifier(input_shape)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(train_gen, epochs=10, validation_data=val_gen)

    # Classify a new video (frames are sampled every 10 here, versus every 5 for training)
    test_video_path = './test_video.mp4'  # video to test
    predict_video(model, test_video_path, frame_rate=10)


Mounted at /content/drive
