In [None]:
# Experiment 2: Training and Validation of the Grid Search Hyperparameter Optimized ResNet-50 Model on the Celeb-DF (Version 1) Dataset

from google.colab import drive

# Mount Google Drive; the dataset and the saved models used by the later cells
# are read from and written to paths below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import joblib

# Restore the previously saved model from the mounted Google Drive.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
# NOTE(review): loaded_model is not referenced again in the visible cells - confirm it is still needed.
original_model_path = '/content/drive/MyDrive/model_filename_original_resnet.pkl'
loaded_model = joblib.load(original_model_path)

In [None]:
# Imported libraries
import os
import cv2
import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import joblib 
import seaborn as sns  
import matplotlib.pyplot as plt  
from sklearn.linear_model import LogisticRegression

# Batch size
batch_size = 32

# Dataset root on the mounted Drive, with one subfolder per class
base_path = '/content/drive/MyDrive/Celeb-DFF'
real_path_1 = os.path.join(base_path, 'Celeb-real')
fake_path = os.path.join(base_path, 'Celeb-synthesis')


def _list_mp4_files(folder):
    """Return the full paths of the entries in `folder` whose names end with '.mp4'."""
    matches = []
    for entry in os.listdir(folder):
        if entry.endswith('.mp4'):
            matches.append(os.path.join(folder, entry))
    return matches


# The fake folder is listed first, then the real one
fake_video_files = _list_mp4_files(fake_path)
real_video_files = _list_mp4_files(real_path_1)

# One label per video: 'echt' for every real video, 'df' for every fake video
real_labels = ['echt' for _ in real_video_files]
fake_labels = ['df' for _ in fake_video_files]

# Real samples come first, then fake ones; paths and labels stay index-aligned
file_paths = [*real_video_files, *fake_video_files]
labels = [*real_labels, *fake_labels]

# Training (80%) and validation (20%) split of the video paths and their labels.
# The first target has to be X_train: the feature-extraction loop further down
# iterates over X_train, and the earlier spelling (X_rain) left that name undefined.
X_train, X_val, y_train, y_val = train_test_split(file_paths, labels, test_size=0.2, random_state=42)


# ResNet-50 with ImageNet weights and without its top layers (include_top=False)
base_model = ResNet50(include_top=False, weights='imagenet')

# Global average pooling applied directly to the backbone output
x = GlobalAveragePooling2D()(base_model.output)

# Model that maps the backbone input to the pooled output; it is used below
# to turn stacks of video frames into feature vectors
feature_extraction_model = Model(outputs=x, inputs=base_model.input)

# Function loads and preprocesses video frames with tqdm progress bars
def load_and_preprocess_video(file_path, label, desired_num_frames=16):
    """Read the first frames of a video and return them as one stacked array.

    Each frame that is read is resized to 224x224 and converted from BGR to
    RGB. If the video yields fewer than `desired_num_frames` frames, the
    frames that were read are repeated cyclically until the requested count
    is reached.

    Args:
        file_path: Path of the video file handed to cv2.VideoCapture.
        label: Label of the video; returned unchanged.
        desired_num_frames: Number of frames in the returned stack.

    Returns:
        A tuple (frames, label). `frames` is an array of shape
        (desired_num_frames, 224, 224, 3), or None when no frame could be read.
    """
    # NOTE(review): frames are returned as raw RGB pixel values. Keras documents
    # resnet50.preprocess_input for ResNet-50 inputs - confirm whether skipping
    # it before feature extraction is intentional.
    cap = cv2.VideoCapture(file_path)
    frames = []
    try:
        if not cap.isOpened():
            return None, label

        reported_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if reported_count > 0:
            frames_to_read = min(reported_count, desired_num_frames)
        else:
            # No usable frame count reported: still try to read frames instead
            # of discarding the video outright; read() signals the real end.
            frames_to_read = desired_num_frames

        for _ in tqdm(range(frames_to_read), desc=f"Processing {os.path.basename(file_path)}", unit=" frame"):
            ret, frame = cap.read()
            if not ret:
                break

            # Resize to the 224x224 input size and switch from BGR to RGB
            frame = cv2.resize(frame, (224, 224))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
    finally:
        # Release the capture even if reading or converting a frame raised
        cap.release()

    # Nothing could be read from this file
    if not frames:
        return None, label

    stacked = np.stack(frames, axis=0)
    if len(frames) < desired_num_frames:
        # Pad short videos by cycling through the frames that were read
        stacked = stacked[np.arange(desired_num_frames) % len(frames)]
    return stacked, label


def _extract_video_features(video_paths, video_labels):
    """Extract features for every video from which frames could be read.

    Args:
        video_paths: Iterable of video file paths.
        video_labels: Labels aligned index-by-index with `video_paths`.

    Returns:
        A tuple (features, kept_labels): `features` is a list with one
        prediction array per readable video, and `kept_labels` holds the
        labels of exactly those videos, in the same order.

    Raises:
        ValueError: If no video yielded any frames.
    """
    features = []
    kept_labels = []
    for file_path, label in zip(video_paths, video_labels):
        frames, _ = load_and_preprocess_video(file_path, label)
        if frames is None:
            # Unreadable video: drop its label too, so features and labels
            # keep the same length and order.
            continue
        # verbose=0 silences the per-call Keras progress output
        features.append(feature_extraction_model.predict(frames, verbose=0))
        kept_labels.append(label)

    if not features:
        raise ValueError("No frames could be read from any of the given videos")
    return features, kept_labels


# y_train / y_val are rebound to the labels of the videos that were actually
# processed, so they line up with the feature matrices used for fitting and
# evaluation. (X_train / X_val still list every path, readable or not.)
train_features, y_train = _extract_video_features(X_train, y_train)
val_features, y_val = _extract_video_features(X_val, y_val)

# Flatten the per-video predictions into one feature row per video
X_train_features = np.array(train_features).reshape(len(train_features), -1)
X_val_features = np.array(val_features).reshape(len(val_features), -1)

# Defines hyperparameter search space for Grid Search.
# The grid is split per solver family so that only solver/penalty pairs that
# LogisticRegression accepts are fitted: 'lbfgs' and 'newton-cg' support 'l2',
# 'liblinear' supports 'l1' and 'l2'. 'elasticnet' requires the 'saga' solver
# (plus an l1_ratio), which is not among the searched solvers, so it is left
# out; as a full Cartesian product those pairs only produced failed fits.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
max_iter_values = [100, 200, 300, 400, 500, 1000]
param_grid = [
    {
        'C': c_values,
        'max_iter': max_iter_values,
        'solver': ['lbfgs', 'newton-cg'],
        'penalty': ['l2'],
    },
    {
        'C': c_values,
        'max_iter': max_iter_values,
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
    },
]

# Initializes logistic regression classifier
classifier = LogisticRegression()

# Initializes and runs GridSearchCV (5-fold CV, accuracy scoring, all CPU cores)
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_features, y_train)

# Persist the best estimator found by the grid search to the Google Drive directory
best_model = grid_search.best_estimator_
joblib.dump(best_model, '/content/drive/MyDrive/grid_search_optimized.pkl')

# Class predictions and second-column probabilities on the validation features.
# NOTE(review): column 1 follows the order of best_model.classes_; with the
# string labels used here that is presumably 'echt' rather than 'df' - confirm.
y_val_pred = best_model.predict(X_val_features)
y_val_proba = best_model.predict_proba(X_val_features)[:, 1]

# Validation metrics; precision, recall and F1 treat 'df' as the positive class
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, pos_label='df')
roc_auc = roc_auc_score(y_val, y_val_proba)
recall = recall_score(y_val, y_val_pred, pos_label='df')
f1 = f1_score(y_val, y_val_pred, pos_label='df')

# Creation and visualization of the confusion matrix heatmap.
# The class order is passed to confusion_matrix explicitly: without `labels`
# the rows/columns follow the sorted label order ('df', 'echt'), which does not
# match tick labels written as ['echt', 'df'] and would mislabel both axes.
class_order = ['echt', 'df']
cm = confusion_matrix(y_val, y_val_pred, labels=class_order)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_order, yticklabels=class_order, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix Heatmap')
plt.show()

# Displays the validation metrics, one per line, rounded to two decimals
report_lines = [
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"ROC AUC: {roc_auc:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))


In [None]:

# Accuracy: 0.73
# Precision: 0.70
# ROC AUC: 0.70
# Recall: 0.91
# F1-score: 0.79