In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import concurrent.futures
import multiprocessing
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample
from tensorflow.keras.layers import (Dense, Dropout, BatchNormalization, Conv1D,
                                     GlobalAveragePooling1D, Input, concatenate,
                                     Flatten, MultiHeadAttention)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
import tensorflow as tf
import os
import random

# GPU configuration
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        try:
            tf.config.experimental.set_memory_growth(device, True)
            tf.config.set_logical_device_configuration(
                device, [tf.config.LogicalDeviceConfiguration(memory_limit=11000)]
            )
            print(f"Configured GPU with a memory limit of 11000 MB.")
        except Exception as e:
            print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# Set seed for reproducibility
SEED = 25
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# File paths
MODEL_SAVE_PATH = "drive/MyDrive/SP_cup/main_sp.keras"
FEATURES_PATH = "drive/MyDrive/SP_cup/features_sp.npy"
LABELS_PATH = "drive/MyDrive/SP_cup/labels_sp.npy"
FAKE_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'

# Load features from pickle files
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract features with multithreading
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    valid_features = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(extract_feature, features), desc="Validating and extracting features"))
        for res in results:
            valid_features.extend(res)
    return np.array(valid_features, dtype=np.float32)

# Prepare data with parallel processing
def prepare_data(fake_path, real_path):
    with multiprocessing.Pool(processes=2) as pool:
        fake_features, real_features = pool.map(validate_and_extract,
                                                [load_features(fake_path), load_features(real_path)])
    scaler = StandardScaler()
    all_features = np.vstack((fake_features, real_features))
    scaler.fit(all_features)

    fake_features = scaler.transform(fake_features)
    real_features = scaler.transform(real_features)

    fake_labels = np.zeros(len(fake_features))
    real_labels = np.ones(len(real_features))

    X = np.vstack((fake_features, real_features))
    y = np.hstack((fake_labels, real_labels))
    return X, y

# Oversample the minority class
def balance_data(X, y):
    fake_indices = np.where(y == 0)[0]
    real_indices = np.where(y == 1)[0]

    real_upsampled = resample(real_indices, replace=True, n_samples=len(fake_indices), random_state=SEED)
    balanced_indices = np.concatenate([fake_indices, real_upsampled])
    np.random.shuffle(balanced_indices)

    return X[balanced_indices], y[balanced_indices]

# Feature augmentation with multiprocessing
def add_noise(chunk):
    """Add noise to a data chunk."""
    noise = np.random.normal(0, 0.01, chunk.shape)
    return chunk + noise

def augment_data(X):
    """Apply noise augmentation using multiprocessing."""
    chunk_size = len(X) // multiprocessing.cpu_count()
    chunks = [X[i:i + chunk_size] for i in range(0, len(X), chunk_size)]

    with multiprocessing.Pool() as pool:
        augmented_chunks = pool.map(add_noise, chunks)

    return np.vstack(augmented_chunks)


# Build CNN model
def build_cnn_model(input_shape):
    inputs = Input(shape=input_shape)
    x = Conv1D(filters=64, kernel_size=3, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = GlobalAveragePooling1D()(x)
    return Model(inputs, x)

# Build Transformer model
def build_transformer_model(input_shape):
    inputs = Input(shape=input_shape)
    x = MultiHeadAttention(num_heads=4, key_dim=64)(inputs, inputs)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Flatten()(x)
    return Model(inputs, x)

# Main pipeline
if os.path.exists(FEATURES_PATH) and os.path.exists(LABELS_PATH):
    print("Loading preprocessed features and labels...")
    X = np.load(FEATURES_PATH)
    y = np.load(LABELS_PATH)
else:
    print("Loading and preprocessing raw features...")
    X, y = prepare_data(FAKE_FEATURES_PATH, REAL_FEATURES_PATH)
    np.save(FEATURES_PATH, X)
    np.save(LABELS_PATH, y)

# Parallelized data augmentation and balancing
X = augment_data(X)
X_balanced, y_balanced = balance_data(X, y)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_balanced), y=y_balanced)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Prepare input shape
input_shape = (X_balanced.shape[1], 1)

# Build models
cnn_model = build_cnn_model(input_shape)
transformer_model = build_transformer_model(input_shape)

# Combine outputs
combined_output = concatenate([cnn_model.output, transformer_model.output])
final_output = Dense(1, activation='sigmoid')(combined_output)
ensemble_model = Model(inputs=[cnn_model.input, transformer_model.input], outputs=final_output)

# Compile model
lr_schedule = CosineDecayRestarts(initial_learning_rate=1e-4, first_decay_steps=1000, t_mul=2.0, alpha=0.01)
optimizer = Adam(learning_rate=lr_schedule)

ensemble_model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# Callbacks
callbacks = [
    ModelCheckpoint(MODEL_SAVE_PATH, save_best_only=True, monitor='val_loss', mode='min'),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1),
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
]

# Train the model
history = ensemble_model.fit(
    [X_balanced[..., np.newaxis], X_balanced[..., np.newaxis]], y_balanced,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    class_weight=class_weights_dict,
    callbacks=callbacks,
    verbose=1
)

# Evaluate the model
print("Evaluating ensemble model...")
y_pred = ensemble_model.predict([X_balanced[..., np.newaxis], X_balanced[..., np.newaxis]])
y_pred_binary = (y_pred > 0.5).astype(int)

print("Classification Report:")
print(classification_report(y_balanced, y_pred_binary))
print(f"AUC-ROC: {roc_auc_score(y_balanced, y_pred):.4f}")


Configured GPU with a memory limit of 11000 MB.
Loading preprocessed features and labels...
Epoch 1/5
[1m2743/2743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m639s[0m 225ms/step - accuracy: 0.9820 - auc: 0.9977 - loss: 0.0982 - val_accuracy: 1.0000 - val_auc: 1.0000 - val_loss: 0.0037 - learning_rate: 4.9790e-06
Epoch 2/5
[1m2743/2743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m613s[0m 206ms/step - accuracy: 1.0000 - auc: 1.0000 - loss: 0.0021 - val_accuracy: 1.0000 - val_auc: 1.0000 - val_loss: 1.5992e-04 - learning_rate: 3.2061e-05
Epoch 3/5
[1m2743/2743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m600s[0m 219ms/step - accuracy: 1.0000 - auc: 1.0000 - loss: 1.2859e-04 - val_accuracy: 1.0000 - val_auc: 1.0000 - val_loss: 1.4247e-05 - learning_rate: 9.4346e-05
Epoch 4/5
[1m2743/2743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 219ms/step - accuracy: 1.0000 - auc: 1.0000 - loss: 7.6810e-06 - val_accuracy: 1.0000 - val_auc: 1.0000 - val_loss: 2.7384e-06 - learni

In [None]:
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
from tensorflow.keras import mixed_precision
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import classification_report, roc_auc_score
import os
import concurrent.futures
import multiprocessing
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample
from tensorflow.keras.layers import (Dense, Dropout, BatchNormalization, Conv1D,
                                     GlobalAveragePooling1D, Input, concatenate,
                                     Flatten, MultiHeadAttention)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
import tensorflow as tf
import os
import random
# Mixed precision setup
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# File paths for validation features
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'

# Load features from pickle files
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract features with parallelism
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    valid_features = []
    with joblib.Parallel(n_jobs=-1, backend='threading') as parallel:
        results = parallel(joblib.delayed(extract_feature)(entry) for entry in features)
        valid_features.extend([item for sublist in results for item in sublist])
    return np.array(valid_features, dtype=np.float32)

# Prepare data with parallel processing using joblib
def prepare_data(fake_path, real_path):
    fake_features = load_features(fake_path)
    real_features = load_features(real_path)

    # Parallel extraction using joblib
    fake_features = validate_and_extract(fake_features)
    real_features = validate_and_extract(real_features)

    # Scaling features
    scaler = StandardScaler()
    all_features = np.vstack((fake_features, real_features))
    scaler.fit(all_features)
    fake_features = scaler.transform(fake_features)
    real_features = scaler.transform(real_features)

    fake_labels = np.zeros(len(fake_features))
    real_labels = np.ones(len(real_features))

    X = np.vstack((fake_features, real_features))
    y = np.hstack((fake_labels, real_labels))
    return X, y

# Load and prepare the validation data
X_valid, y_valid = prepare_data(FAKE_VALID_FEATURES_PATH, REAL_VALID_FEATURES_PATH)

# Standardize the validation data
scaler = StandardScaler()
X_valid = scaler.fit_transform(X_valid)

# Reshape the data for model input
X_valid = X_valid[..., np.newaxis]  # Ensure the shape is (samples, features, 1)

# Load the trained model without loading optimizer weights for efficiency
ensemble_model = load_model('drive/MyDrive/SP_cup/main_sp.keras', compile=False)

# Evaluate the model
print("Evaluating ensemble model on validation data...")
y_pred = ensemble_model.predict([X_valid, X_valid], batch_size=64)
y_pred_binary = (y_pred > 0.5).astype(int)

# Output the classification report and AUC-ROC score
print("Classification Report:")
print(classification_report(y_valid, y_pred_binary))
print(f"AUC-ROC: {roc_auc_score(y_valid, y_pred):.4f}")


Evaluating ensemble model on validation data...
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 61ms/step
Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.66      1524
         1.0       0.00      0.00      0.00      1548

    accuracy                           0.50      3072
   macro avg       0.25      0.50      0.33      3072
weighted avg       0.25      0.50      0.33      3072

AUC-ROC: 0.4291


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
import pickle
from tqdm import tqdm
import os
from tensorflow.keras.mixed_precision import Policy, set_global_policy
from joblib import Parallel, delayed

# Enable mixed precision
set_global_policy(Policy('mixed_float16'))

# GPU Configuration: Set memory growth and limit
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("GPU memory growth enabled.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/model_optimized.keras"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Parallel feature validation and extraction
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    with Parallel(n_jobs=-1, backend='threading') as parallel:
        results = parallel(delayed(extract_feature)(entry) for entry in features)
    valid_features = [item for sublist in results for item in sublist]
    return np.array(valid_features, dtype=np.float32)

# Augment features for robustness
def augment_features(X, y, augment_factor=2):
    augmented_X, augmented_y = [], []
    for _ in range(augment_factor):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Build the optimized model
def build_model(input_shape):
    model = Sequential([
        Input(shape=input_shape),
        Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate():
    print("Loading training features...")

    # Load features
    fake_features = load_features(FAKE_TRAIN_FEATURES_PATH)
    real_features = load_features(REAL_TRAIN_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
    )

    # Augment training data
    print("Augmenting training data...")
    X_train, y_train = augment_features(X_train, y_train, augment_factor=2)

    # Debugging shapes
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_val:", X_val.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_val:", y_val.shape)

    # Build the model
    input_shape = (X_train.shape[1],)
    model = build_model(input_shape)

    # Callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, mode='min'),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1),
        ModelCheckpoint(CHECKPOINT_PATH, save_best_only=True, monitor='val_loss', mode='min')
    ]

    # Train the model
    print("Starting training...")
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=64,
        callbacks=callbacks,
        verbose=1,
        class_weight={0: 1.0, 1: 3.0}  # Adjust class weights as needed
    )

    print("Model training complete!")

# Run training and evaluation
train_and_evaluate()


GPU memory growth enabled.
Loading training features...
Augmenting training data...
Shape of X_train: (202886, 1280)
Shape of X_val: (25361, 1280)
Shape of y_train: (202886,)
Shape of y_val: (25361,)
Starting training...
Epoch 1/20
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - accuracy: 0.8468 - loss: 4.1132 - val_accuracy: 1.0000 - val_loss: 1.2801 - learning_rate: 1.0000e-04
Epoch 2/20
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9994 - loss: 0.8681 - val_accuracy: 0.9999 - val_loss: 0.1502 - learning_rate: 1.0000e-04
Epoch 3/20
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.9996 - loss: 0.1291 - val_accuracy: 0.9999 - val_loss: 0.0629 - learning_rate: 1.0000e-04
Epoch 4/20
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0537 - val_accuracy: 0.9999 - val_loss: 0.0373 - learning_rate: 1.0000e-04
Epoch 5/20

In [None]:
# Function to validate the model
def validate_model():
    print("Loading validation features...")

    # Load features
    fake_valid_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_valid_features = load_features(REAL_VALID_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake_val = validate_and_extract(fake_valid_features)
    X_real_val = validate_and_extract(real_valid_features)

    # Create labels
    y_fake_val = np.ones(len(X_fake_val))
    y_real_val = np.zeros(len(X_real_val))

    # Combine validation data and labels
    X_val_combined = np.vstack((X_fake_val, X_real_val))
    y_val_combined = np.hstack((y_fake_val, y_real_val))

    # Normalize validation features using the same scaler from training
    scaler = StandardScaler()
    X_val_combined = scaler.fit_transform(X_val_combined)

    # Load the trained model
    model = load_model(CHECKPOINT_PATH)
    print("Model loaded successfully!")

    # Predict probabilities
    y_pred_probs = model.predict(X_val_combined)
    y_pred = (y_pred_probs > 0.5).astype(int).flatten()

    # Metrics
    accuracy = np.mean(y_pred == y_val_combined)
    auc_roc = roc_auc_score(y_val_combined, y_pred_probs)
    classification_report_text = classification_report(y_val_combined, y_pred)

    # Output metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print("Classification Report:")
    print(classification_report_text)

# Run validation
validate_model()


Loading validation features...
Model loaded successfully!
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Accuracy: 0.4932
AUC-ROC: 0.4905
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1548
         1.0       0.49      0.99      0.66      1524

    accuracy                           0.49      3072
   macro avg       0.25      0.50      0.33      3072
weighted avg       0.25      0.49      0.33      3072



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
import pickle
import os
from tensorflow.keras.mixed_precision import Policy, set_global_policy
from joblib import Parallel, delayed

# Enable mixed precision
set_global_policy(Policy('mixed_float16'))

# GPU Configuration: Set memory growth and limit
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("GPU memory growth enabled.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/merged_facial_fake.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/merged_landmarks_real.pkl'
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/model_optimized.keras"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Parallel feature validation and extraction
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    with Parallel(n_jobs=-1, backend='threading') as parallel:
        results = parallel(delayed(extract_feature)(entry) for entry in features)
    valid_features = [item for sublist in results for item in sublist]
    return np.array(valid_features, dtype=np.float32)

# Augment features for robustness
def augment_features(X, y, augment_factor=2):
    augmented_X, augmented_y = [], []
    for _ in range(augment_factor):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Build the optimized model
def build_model(input_shape):
    model = Sequential([
        Input(shape=input_shape),
        Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate the model
def train_and_evaluate():
    print("Loading training features...")

    # Load features
    fake_features = load_features(FAKE_TRAIN_FEATURES_PATH)
    real_features = load_features(REAL_TRAIN_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
    )

    # Augment training data
    print("Augmenting training data...")
    X_train, y_train = augment_features(X_train, y_train, augment_factor=2)

    # Debugging shapes
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_val:", X_val.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_val:", y_val.shape)

    # Build the model
    input_shape = (X_train.shape[1],)
    model = build_model(input_shape)

    # Callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, mode='min'),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1),
        ModelCheckpoint(CHECKPOINT_PATH, save_best_only=True, monitor='val_loss', mode='min')
    ]

    # Train the model
    print("Starting training...")
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=64,
        callbacks=callbacks,
        verbose=1,
        class_weight={0: 1.0, 1: 3.0}  # Adjust class weights as needed
    )

    print("Model training complete!")

    # After training, load the best model
    model = load_model(CHECKPOINT_PATH)

    # Evaluate on validation set
    print("Evaluating the model...")
    val_predictions = model.predict(X_val, batch_size=64, verbose=1)
    val_predictions = (val_predictions > 0.5).astype(int)  # Threshold at 0.5 for binary classification

    accuracy = np.mean(val_predictions == y_val)
    auc = roc_auc_score(y_val, val_predictions)

    # Classification report
    print("Classification Report:")
    print(classification_report(y_val, val_predictions))

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

# Run training and evaluation
train_and_evaluate()


GPU memory growth enabled.
Loading training features...


ValueError: Found array with 0 feature(s) (shape=(2, 0)) while a minimum of 1 is required by StandardScaler.

In [None]:
    # Load validation features
    fake_valid_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_valid_features = load_features(REAL_VALID_FEATURES_PATH)

    # Validate and extract feature vectors for validation
    X_fake_valid = validate_and_extract(fake_valid_features)
    X_real_valid = validate_and_extract(real_valid_features)

    # Create validation labels
    y_fake_valid = np.ones(len(X_fake_valid))
    y_real_valid = np.zeros(len(X_real_valid))

    # Combine validation data and labels
    X_combined_valid = np.vstack((X_fake_valid, X_real_valid))
    y_combined_valid = np.hstack((y_fake_valid, y_real_valid))
    # After training, load the best model
    model = load_model(CHECKPOINT_PATH)

    # Evaluate on validation set
    print("Evaluating the model...")
    val_predictions = model.predict(X_combined_valid, batch_size=64, verbose=1)
    val_predictions = (val_predictions > 0.5).astype(int)  # Threshold at 0.5 for binary classification

    accuracy = np.mean(val_predictions == y_combined_valid)
    auc = roc_auc_score(y_combined_valid, val_predictions)

    # Classification report
    print("Classification Report:")
    print(classification_report(y_combined_valid, val_predictions))

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")


Evaluating the model...
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67      1548
         1.0       0.00      0.00      0.00      1524

    accuracy                           0.50      3072
   macro avg       0.25      0.50      0.34      3072
weighted avg       0.25      0.50      0.34      3072

Accuracy: 0.5039
AUC-ROC: 0.5000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

# Define paths
MODEL_SAVE_PATH = "drive/MyDrive/SP_cup/main1_sp.keras"
FEATURES_PATH = "drive/MyDrive/SP_cup/features1_sp.npy"
LABELS_PATH = "drive/MyDrive/SP_cup/labels1_sp.npy"
FAKE_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'

# Load features and labels
def load_data(fake_path, real_path):
    with open(fake_path, 'rb') as f:
        fake_features = pickle.load(f)
    with open(real_path, 'rb') as f:
        real_features = pickle.load(f)

    features = np.vstack((fake_features, real_features))
    labels = np.hstack((np.zeros(len(fake_features)), np.ones(len(real_features))))
    return features, labels

# Load training and validation data
print("Loading training data...")
X_train, y_train = load_data(FAKE_FEATURES_PATH, REAL_FEATURES_PATH)

print("Loading validation data...")
X_valid_fake, _ = load_data(FAKE_VALID_FEATURES_PATH, FAKE_VALID_FEATURES_PATH)
X_valid_real, _ = load_data(REAL_VALID_FEATURES_PATH, REAL_VALID_FEATURES_PATH)
X_valid = np.vstack((X_valid_fake, X_valid_real))
y_valid = np.hstack((np.zeros(len(X_valid_fake)), np.ones(len(X_valid_real))))

# Save processed data (optional, in case needed later)
np.save(FEATURES_PATH, X_train)
np.save(LABELS_PATH, y_train)

# Define model
def build_model(input_dim):
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        BatchNormalization(),
        Dropout(0.5),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    return model

# Build and compile model
input_dim = X_train.shape[1]
model = build_model(input_dim)
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train model
print("Training model...")
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=20, batch_size=32, verbose=1)

# Evaluate model on validation set
print("Evaluating model...")
val_predictions = (model.predict(X_valid) > 0.5).astype(int)
accuracy = accuracy_score(y_valid, val_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

# Save the trained model
print("Saving model...")
model.save(MODEL_SAVE_PATH)
print(f"Model saved at {MODEL_SAVE_PATH}")


Loading training data...


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16,) + inhomogeneous part.

In [None]:
# Function to validate and extract features
def validate_and_extract(features):
    valid_features = []
    for entry in tqdm(features, desc="Validating and extracting features"):
        if isinstance(entry, list):
            for sub_entry in entry:
                if isinstance(sub_entry, dict) and 'features' in sub_entry:
                    valid_features.append(sub_entry['features'])
        elif isinstance(entry, dict) and 'features' in entry:
            valid_features.append(entry['features'])
    return np.array(valid_features, dtype=np.float32)

# Assuming 'raw_features' is loaded from a file
processed_features = validate_and_extract('drive/MyDrive/SP_cup/features/standardized_fake_train.pkl')

# Inspect the processed features
print(f"Shape of processed features: {processed_features.shape}")
print(f"Example feature vector: {processed_features[0]}")


Validating and extracting features: 100%|██████████| 57/57 [00:00<00:00, 417963.86it/s]

Shape of processed features: (0,)





IndexError: index 0 is out of bounds for axis 0 with size 0