In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0  # Lighter architecture
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# GPU Configuration
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print("Configured GPU with memory growth.")
else:
    print("No GPU detected, running on CPU.")

# Paths
FAKE_IMAGE_DIR = 'drive/MyDrive/fake'
REAL_IMAGE_DIR = 'drive/MyDrive/train-real'

# Hyperparameters
IMAGE_SIZE = (96, 96)  # Reduced size
BATCH_SIZE = 16  # Smaller batch size for faster processing
EPOCHS = 5  # Temporarily reduced for faster feedback
LEARNING_RATE = 1e-4
MODEL_CHECKPOINT = 'best_model.keras'

# Function to collect all image paths
def collect_image_paths(base_dir, category_name):
    image_paths = []
    for subfolder in tqdm(sorted(os.listdir(base_dir)), desc=f"Scanning {category_name} folders"):
        subfolder_path = os.path.join(base_dir, subfolder)
        if os.path.isdir(subfolder_path):  # Ensure it's a directory
            for img_file in os.listdir(subfolder_path):
                if img_file.endswith('.png'):  # Check for PNG images
                    image_paths.append(os.path.join(subfolder_path, img_file))
    return image_paths

# Load all image paths
print("Collecting image paths...")
fake_image_paths = collect_image_paths(FAKE_IMAGE_DIR, "fake")
real_image_paths = collect_image_paths(REAL_IMAGE_DIR, "real")

# Labels: Fake = 0, Real = 1
fake_labels = [0] * len(fake_image_paths)
real_labels = [1] * len(real_image_paths)

# Combine and shuffle data
print("Shuffling dataset...")
all_image_paths = fake_image_paths + real_image_paths
all_labels = fake_labels + real_labels
indices = np.arange(len(all_image_paths))
np.random.shuffle(indices)

all_image_paths = np.array(all_image_paths)[indices]
all_labels = np.array(all_labels)[indices]

# Function to preprocess images using tf.data API
def preprocess_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_png(image, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    image = tf.keras.applications.efficientnet.preprocess_input(image)  # Use EfficientNet's preprocessing
    return image

# Create a tf.data Dataset
def create_dataset(image_paths, labels, batch_size):
    # Convert image paths to a Dataset
    image_paths = tf.data.Dataset.from_tensor_slices(image_paths)
    labels = tf.data.Dataset.from_tensor_slices(labels)

    # Map preprocess function
    image_dataset = image_paths.map(lambda x: preprocess_image(x), num_parallel_calls=tf.data.AUTOTUNE)

    # Combine image dataset and label dataset
    dataset = tf.data.Dataset.zip((image_dataset, labels))

    # Shuffle, batch, and prefetch the dataset
    dataset = dataset.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Split dataset into training and validation sets
print("Splitting dataset into training and validation...")
X_train, X_val, y_train, y_val = train_test_split(
    all_image_paths, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Create TensorFlow Dataset
train_dataset = create_dataset(X_train, y_train, BATCH_SIZE)
val_dataset = create_dataset(X_val, y_val, BATCH_SIZE)

# Model Definition
print("Building the model...")
base_model = EfficientNetB0(input_shape=(*IMAGE_SIZE, 3), include_top=False, weights='imagenet')
base_model.trainable = False  # Freeze base model

x = Flatten()(base_model.output)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=output)

# Compile the model with mixed precision
print("Compiling the model...")
tf.keras.mixed_precision.set_global_policy('mixed_float16')  # Enable mixed precision
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Class weights to handle data imbalance
class_weights = {0: 0.5778843174070766, 1: 3.7098888238736105}

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),
    ModelCheckpoint(MODEL_CHECKPOINT, monitor='val_loss', save_best_only=True, verbose=1)
]

# Train the model
print("Training the model...")
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    class_weight=class_weights,  # Account for class imbalance
    callbacks=callbacks,
    verbose=1
)

# Save the model
print(f"Saving model to {MODEL_CHECKPOINT}...")
model.save(MODEL_CHECKPOINT)

print("Training complete.")


Configured GPU with memory growth.
Collecting image paths...


Scanning fake folders: 100%|██████████| 17/17 [00:01<00:00, 11.86it/s]
Scanning real folders: 100%|██████████| 17/17 [00:00<00:00, 33.39it/s]


Shuffling dataset...
Splitting dataset into training and validation...
Building the model...
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Compiling the model...
Training the model...
Epoch 1/5
[1m   8/6341[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:22:46[0m 2s/step - accuracy: 0.5520 - loss: 0.7865

In [None]:
!pip install mediapipe opencv-python
!pip install xgboost

Collecting mediapipe
  Downloading mediapipe-0.10.20-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.20-cp311-cp311-manylinux_2_28_x86_64.whl (35.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.20 sounddevice-0.5.1


In [None]:
# Step 1: Download the shape predictor file
!wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2

# Step 2: Extract the downloaded bz2 file
!bunzip2 shape_predictor_68_face_landmarks.dat.bz2

# Step 3: Set the correct path to the downloaded file
predictor_path = '/content/shape_predictor_68_face_landmarks.dat'

--2025-01-19 04:59:07--  http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
Resolving dlib.net (dlib.net)... 107.180.26.78
Connecting to dlib.net (dlib.net)|107.180.26.78|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64040097 (61M)
Saving to: ‘shape_predictor_68_face_landmarks.dat.bz2’


2025-01-19 04:59:08 (90.9 MB/s) - ‘shape_predictor_68_face_landmarks.dat.bz2’ saved [64040097/64040097]



In [None]:
import os
import numpy as np
import pandas as pd
import dlib
import cv2
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import tensorflow as tf

# GPU Configuration
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print("Configured GPU with memory growth.")
else:
    print("No GPU detected, running on CPU.")

# Paths
FAKE_IMAGE_DIR = 'drive/MyDrive/fake'
REAL_IMAGE_DIR = 'drive/MyDrive/train-real'
OUTPUT_MODEL = 'landmark_model.joblib'

# Hyperparameters
NUM_THREADS = 16  # Increased threads for faster processing
BATCH_SIZE = 64  # Batch size for image processing
RANDOM_STATE = 42

# Dlib setup
predictor_path = '/content/shape_predictor_68_face_landmarks.dat'
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)
# Function to extract facial landmarks using Dlib
def extract_landmarks(image_path):
    try:
        image = cv2.imread(image_path)
        if image is None:
            return None

        # Convert to grayscale for Dlib
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = detector(gray, 1)
        if len(faces) == 0:
            return None

        # Assume the first detected face is the target
        shape = predictor(gray, faces[0])
        landmarks = [(p.x, p.y) for p in shape.parts()]
        return np.array(landmarks).flatten()

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Function to collect image paths
def collect_image_paths(base_dir):
    image_paths = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith('.png'):
                image_paths.append(os.path.join(root, file))
    return image_paths

# Parallelized landmark extraction with batch processing
def process_images_in_batches(image_paths):
    results = []
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        for result in tqdm(executor.map(extract_landmarks, image_paths), total=len(image_paths), desc="Extracting landmarks"):
            results.append(result)
    return results

# Collect and preprocess data
print("Collecting image paths...")
fake_image_paths = collect_image_paths(FAKE_IMAGE_DIR)
real_image_paths = collect_image_paths(REAL_IMAGE_DIR)

# Balance dataset by limiting fake samples to 2x real samples
print("Balancing dataset...")
if len(fake_image_paths) > 2 * len(real_image_paths):
    fake_image_paths = np.random.choice(fake_image_paths, 2 * len(real_image_paths), replace=False)

print("Extracting landmarks...")
fake_landmarks = process_images_in_batches(fake_image_paths)
real_landmarks = process_images_in_batches(real_image_paths)

# Filter out None values
fake_landmarks = [lm for lm in fake_landmarks if lm is not None]
real_landmarks = [lm for lm in real_landmarks if lm is not None]

# Labels: Fake = 0, Real = 1
fake_labels = [0] * len(fake_landmarks)
real_labels = [1] * len(real_landmarks)

# Combine and shuffle data
print("Combining and shuffling data...")
all_landmarks = np.array(fake_landmarks + real_landmarks)
all_labels = np.array(fake_labels + real_labels)

indices = np.arange(len(all_landmarks))
np.random.shuffle(indices)
all_landmarks = all_landmarks[indices]
all_labels = all_labels[indices]

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(all_landmarks, all_labels, test_size=0.2, random_state=RANDOM_STATE, stratify=all_labels)

# Handle class imbalance using scale_pos_weight
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Convert data to DMatrix for XGBoost (GPU acceleration)
import xgboost as xgb
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
val_dmatrix = xgb.DMatrix(X_val, label=y_val)

# XGBoost Parameters with GPU
params = {
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',  # Use GPU acceleration
    'eval_metric': 'logloss',
    'scale_pos_weight': scale_pos_weight,
    'eta': 0.1,
    'max_depth': 6,
}

# Train the model
print("Training XGBoost model on GPU...")
evals = [(train_dmatrix, 'train'), (val_dmatrix, 'validation')]
model = xgb.train(params, train_dmatrix, num_boost_round=200, evals=evals, early_stopping_rounds=10, verbose_eval=10)

# Save the model
print(f"Saving model to {OUTPUT_MODEL}...")
joblib.dump(model, OUTPUT_MODEL)

# Evaluate model
print("Evaluating model...")
y_pred = (model.predict(val_dmatrix) > 0.5).astype(int)
print(classification_report(y_val, y_pred))
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}")

print("Processing complete.")


Configured GPU with memory growth.
Collecting image paths...
Balancing dataset...
Extracting landmarks...


Extracting landmarks: 100%|██████████| 34180/34180 [27:24<00:00, 20.79it/s]
Extracting landmarks: 100%|██████████| 17090/17090 [12:24<00:00, 22.94it/s]


Combining and shuffling data...
Training XGBoost model on GPU...



    E.g. tree_method = "hist", device = "cuda"



[0]	train-logloss:0.68931	validation-logloss:0.69029
[10]	train-logloss:0.66348	validation-logloss:0.67391
[20]	train-logloss:0.64413	validation-logloss:0.66473
[30]	train-logloss:0.62904	validation-logloss:0.65725
[40]	train-logloss:0.61605	validation-logloss:0.65227
[50]	train-logloss:0.60308	validation-logloss:0.64746
[60]	train-logloss:0.59278	validation-logloss:0.64390
[70]	train-logloss:0.58269	validation-logloss:0.64017
[80]	train-logloss:0.57208	validation-logloss:0.63678
[90]	train-logloss:0.56312	validation-logloss:0.63354
[100]	train-logloss:0.55468	validation-logloss:0.63132
[110]	train-logloss:0.54717	validation-logloss:0.62967
[120]	train-logloss:0.54035	validation-logloss:0.62801
[130]	train-logloss:0.53342	validation-logloss:0.62575
[140]	train-logloss:0.52602	validation-logloss:0.62391
[150]	train-logloss:0.51924	validation-logloss:0.62215
[160]	train-logloss:0.51165	validation-logloss:0.61944
[170]	train-logloss:0.50559	validation-logloss:0.61800
[180]	train-logloss:0


    E.g. tree_method = "hist", device = "cuda"



In [None]:
import os
import numpy as np
import pandas as pd
import dlib
import cv2
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import classification_report, accuracy_score
import joblib
import xgboost as xgb

# GPU Configuration
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print("Configured GPU with memory growth.")
else:
    print("No GPU detected, running on CPU.")

# Paths
FAKE_VALID_IMAGE_DIR = 'drive/MyDrive/validation/fake_valid/fake'
REAL_VALID_IMAGE_DIR = 'drive/MyDrive/validation/real_valid/real'
OUTPUT_MODEL = 'landmark_model.joblib'

# Hyperparameters
NUM_THREADS = 16  # Increased threads for faster processing

# Dlib setup
predictor_path = "shape_predictor_68_face_landmarks.dat"
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)

# Function to extract facial landmarks using Dlib
def extract_landmarks(image_path):
    try:
        image = cv2.imread(image_path)
        if image is None:
            return None

        # Convert to grayscale for Dlib
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = detector(gray, 1)
        if len(faces) == 0:
            return None

        # Assume the first detected face is the target
        shape = predictor(gray, faces[0])
        landmarks = [(p.x, p.y) for p in shape.parts()]
        return np.array(landmarks).flatten()

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Function to collect image paths
def collect_image_paths(base_dir):
    image_paths = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith('.png'):
                image_paths.append(os.path.join(root, file))
    return image_paths

# Parallelized landmark extraction with batch processing
def process_images_in_batches(image_paths):
    results = []
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        for result in tqdm(executor.map(extract_landmarks, image_paths), total=len(image_paths), desc="Extracting landmarks"):
            results.append(result)
    return results

# Load model
print("Loading model...")
model = joblib.load(OUTPUT_MODEL)

# Collect validation image paths
print("Collecting validation image paths...")
fake_valid_image_paths = collect_image_paths(FAKE_VALID_IMAGE_DIR)
real_valid_image_paths = collect_image_paths(REAL_VALID_IMAGE_DIR)

# Extract landmarks for validation data
print("Extracting landmarks for validation data...")
fake_valid_landmarks = process_images_in_batches(fake_valid_image_paths)
real_valid_landmarks = process_images_in_batches(real_valid_image_paths)

# Filter out None values
fake_valid_landmarks = [lm for lm in fake_valid_landmarks if lm is not None]
real_valid_landmarks = [lm for lm in real_valid_landmarks if lm is not None]

# Labels: Fake = 0, Real = 1
fake_valid_labels = [0] * len(fake_valid_landmarks)
real_valid_labels = [1] * len(real_valid_landmarks)

# Combine validation data
print("Combining validation data...")
valid_landmarks = np.array(fake_valid_landmarks + real_valid_landmarks)
valid_labels = np.array(fake_valid_labels + real_valid_labels)

# Convert landmarks to DMatrix
print("Converting landmarks to DMatrix...")
valid_dmatrix = xgb.DMatrix(valid_landmarks)

# Predict using the model
print("Predicting on validation data...")
y_pred = (model.predict(valid_dmatrix) > 0.5).astype(int)

# Evaluate model
print("Evaluating model...")
print(classification_report(valid_labels, y_pred))
print(f"Validation Accuracy: {accuracy_score(valid_labels, y_pred):.4f}")


Configured GPU with memory growth.
Loading model...
Collecting validation image paths...
Extracting landmarks for validation data...


Extracting landmarks: 100%|██████████| 1524/1524 [01:08<00:00, 22.14it/s]
Extracting landmarks: 100%|██████████| 1548/1548 [00:51<00:00, 30.25it/s]

Combining validation data...
Converting landmarks to DMatrix...
Predicting on validation data...
Evaluating model...
              precision    recall  f1-score   support

           0       0.52      0.53      0.52      1521
           1       0.53      0.53      0.53      1546

    accuracy                           0.53      3067
   macro avg       0.53      0.53      0.53      3067
weighted avg       0.53      0.53      0.53      3067

Validation Accuracy: 0.5259




    E.g. tree_method = "hist", device = "cuda"



In [None]:
output_dir = '/content/drive/MyDrive/model'
os.makedirs(output_dir, exist_ok=True)

# Step 3: Define the output model path
OUTPUT_MODEL = os.path.join(output_dir, 'landmark_model.joblib')

# Step 4: Save the model
joblib.dump(model, OUTPUT_MODEL)

print(f"Model saved to {OUTPUT_MODEL}")

Model saved to /content/drive/MyDrive/model/landmark_model.joblib


In [None]:
import os
import cv2
import numpy as np
from multiprocessing import Pool, cpu_count
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import joblib
import pickle

# Paths
FAKE_IMAGE_DIR = 'drive/MyDrive/fake'
REAL_IMAGE_DIR = 'drive/MyDrive/train-real'
OUTPUT_MODEL = 'landmark_model.joblib'
CHECKPOINT_PATH_FAKE = 'fake_checkpoint.pkl'
CHECKPOINT_PATH_REAL = 'real_checkpoint.pkl'
FEATURES_PATH = 'extracted_features.pkl'

# Hyperparameters
NUM_THREADS = 8  # Number of threads per batch
NUM_PROCESSES = min(4, cpu_count())  # Number of parallel processes
BATCH_SIZE = 1000  # Buffered batch size for disk I/O
RANDOM_STATE = 52

# Function to collect image paths
def collect_image_paths(base_dir):
    image_paths = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith('.png'):
                image_paths.append(os.path.join(root, file))
    return image_paths

# Image processing function
def process_image(image_path):
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError("Image not loaded")
        image = cv2.resize(image, (224, 224))  # Resize to 224x224
        image = image / 255.0  # Normalize the image
        return image.flatten()  # Flatten to 1D
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Multiprocessing wrapper
def process_images_batch(batch_paths):
    results = []
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        futures = {executor.submit(process_image, path): path for path in batch_paths}
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                results.append(result)
    return results

# Extract features with multiprocessing
def extract_features_in_batches(image_paths, checkpoint_path, batch_size=1000):
    start_idx = 0
    features = []

    # Load checkpoint if it exists
    if os.path.exists(checkpoint_path):
        with open(checkpoint_path, 'rb') as f:
            checkpoint_data = pickle.load(f)
            start_idx = checkpoint_data.get('start_idx', 0)
            features = checkpoint_data.get('features', [])
        print(f"Resuming from batch {start_idx}, loaded {len(features)} features.")

    # Split paths into batches
    batches = [image_paths[i:i + batch_size] for i in range(start_idx, len(image_paths), batch_size)]

    # Process batches with multiprocessing
    with Pool(processes=NUM_PROCESSES) as pool:
        for batch_idx, batch_features in enumerate(
            tqdm(pool.imap(process_images_batch, batches), total=len(batches), desc="Processing batches")
        ):
            features.extend(batch_features)

            # Save progress after each batch
            with open(checkpoint_path, 'wb') as f:
                pickle.dump({'start_idx': (batch_idx + 1) * batch_size, 'features': features}, f)

    return features

# Collect image paths
print("Collecting image paths...")
fake_image_paths = collect_image_paths(FAKE_IMAGE_DIR)
real_image_paths = collect_image_paths(REAL_IMAGE_DIR)
print(f"Number of fake images: {len(fake_image_paths)}")
print(f"Number of real images: {len(real_image_paths)}")

# Balance dataset
if len(fake_image_paths) > 2 * len(real_image_paths):
    fake_image_paths = np.random.choice(fake_image_paths, 2 * len(real_image_paths), replace=False)
print(f"Reduced fake images to {len(fake_image_paths)}")

# Extract features for fake and real images
print("Extracting features for fake images...")
fake_features = extract_features_in_batches(fake_image_paths, CHECKPOINT_PATH_FAKE, BATCH_SIZE)

print("Extracting features for real images...")
real_features = extract_features_in_batches(real_image_paths, CHECKPOINT_PATH_REAL, BATCH_SIZE)

# Validate extracted features
assert len(fake_features) > 0, "No fake features extracted!"
assert len(real_features) > 0, "No real features extracted!"

# Save the extracted features in separate files
with open('fake_features.pkl', 'wb') as f:
    pickle.dump(fake_features, f)

with open('real_features.pkl', 'wb') as f:
    pickle.dump(real_features, f)

print(f"Extracted fake features saved to fake_features.pkl")
print(f"Extracted real features saved to real_features.pkl")

# Prepare dataset
all_features = np.array(fake_features + real_features)
all_labels = np.array([0] * len(fake_features) + [1] * len(real_features))

# Shuffle and split the dataset
indices = np.arange(len(all_features))
np.random.shuffle(indices)
all_features = all_features[indices]
all_labels = all_labels[indices]

X_train, X_val, y_train, y_val = train_test_split(
    all_features, all_labels, test_size=0.2, random_state=RANDOM_STATE, stratify=all_labels
)

# Check if both classes are present in the training set
if len(np.unique(y_train)) < 2:
    raise ValueError("Training set has only one class. Ensure both fake and real images are properly loaded.")

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(f"Class weights: {class_weight_dict}")

# Train the model
print("Training XGBoost model...")
model = XGBClassifier(scale_pos_weight=class_weight_dict[0] / class_weight_dict[1], use_label_encoder=False, eval_metric="logloss")
model.fit(X_train, y_train, sample_weight=np.array([class_weight_dict[label] for label in y_train]))

# Save the model
joblib.dump(model, OUTPUT_MODEL)
print(f"Model saved to {OUTPUT_MODEL}")

# Evaluate the model
print("Evaluating model...")
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}")


Collecting image paths...
Number of fake images: 109714
Number of real images: 17090
Reduced fake images to 34180
Extracting features for fake images...


UnpicklingError: pickle data was truncated

In [None]:
import pickle

FAKE_FEATURES_PATH = 'fake_checkpoint.pkl'

try:
    with open(FAKE_FEATURES_PATH, 'rb') as f:
        data = pickle.load(f)
    print(f"Data type in fake_features.pkl: {type(data)}")
    print(data)  # Print the data to see what's inside
except FileNotFoundError:
    print("No fake features file found.")
except EOFError:
    print("Fake features file is empty or corrupted.")


Data type in fake_features.pkl: <class 'int'>
8000


In [7]:
import pickle
import joblib
import numpy as np

# Define the paths
model_path = "drive/MyDrive/model/landmark_model.joblib"
fake_data_path = "drive/MyDrive/validation/valid_fake_landmarks.pkl"
real_data_path = "drive/MyDrive/validation/valid_real_landmarks.pkl"
output_file = "submission.txt"

# Load the model
print("Loading model...")
model = joblib.load(model_path)

# Load the validation data
print("Loading validation data...")
with open(fake_data_path, "rb") as f:
    fake_landmarks = pickle.load(f)
with open(real_data_path, "rb") as f:
    real_landmarks = pickle.load(f)

# Preprocess data
print("Preparing data...")
def preprocess_data(data, target_length=1280):
    processed_data = []
    for item in data:
        item = np.array(item).flatten()
        if len(item) < target_length:
            item = np.pad(item, (0, target_length - len(item)), mode="constant")
        elif len(item) > target_length:
            item = item[:target_length]
        processed_data.append(item)
    return np.array(processed_data)

# Preprocess landmarks to match the expected input size
fake_landmarks = preprocess_data(fake_landmarks, target_length=1280)
real_landmarks = preprocess_data(real_landmarks, target_length=1280)

# Combine data and file IDs
fake_file_ids = [f"fake_file{i+1}" for i in range(len(fake_landmarks))]
real_file_ids = [f"real_file{i+1}" for i in range(len(real_landmarks))]
all_landmarks = np.vstack((fake_landmarks, real_landmarks))
all_file_ids = fake_file_ids + real_file_ids

# Predict scores
print("Predicting scores...")
scores = model.predict_proba(all_landmarks)[:, 1]  # Use the probability for "real" (class 1)

# Generate the submission file
print("Writing submission file...")
with open(output_file, "w") as f:
    for file_id, score in zip(all_file_ids, scores):
        f.write(f"{file_id}\t{score:.6f}\n")

print(f"Submission file '{output_file}' created successfully!")


Loading model...
Loading validation data...
Preparing data...


  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.


Predicting scores...
Writing submission file...
Submission file 'submission.txt' created successfully!
