In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [None]:
pip install --upgrade scikit-learn xgboost

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.6.1


In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from multiprocessing import Pool, cpu_count

# GPU Configuration
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("Configured GPU with memory growth.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/new_model.pkl"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Helper function for feature extraction
def extract_features(entry):
    if isinstance(entry, dict):
        return entry.get('features', [])
    return []

# Extract features with validation
def validate_and_extract(features):
    with Pool(cpu_count()) as pool:
        valid_features = list(tqdm(pool.imap(extract_features, features), total=len(features), desc="Validating and extracting features"))
    return np.array([item for sublist in valid_features for item in sublist], dtype=np.float32)

# Augment features
def augment_features(X, y, augment_factor=1):
    augmented_X, augmented_y = [], []
    for _ in tqdm(range(augment_factor), desc="Augmenting features"):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)
def debug_shapes(X_fake, X_real):
    print(f"Shape of X_fake: {X_fake.shape}")
    print(f"Shape of X_real: {X_real.shape}")
    if X_fake.size == 0:
        print("Error: X_fake is empty. Check the feature extraction for FAKE_TRAIN_FEATURES_PATH.")
    if X_real.size == 0:
        print("Error: X_real is empty. Check the feature extraction for REAL_TRAIN_FEATURES_PATH.")

# Train and evaluate the model with debugging
def train_and_evaluate_debug():
    print("Loading training features...")

    # Load features
    X_fake = validate_and_extract(load_features(FAKE_TRAIN_FEATURES_PATH))
    X_real = validate_and_extract(load_features(REAL_TRAIN_FEATURES_PATH))

    # Debug shapes of features
    debug_shapes(X_fake, X_real)

    # Ensure no empty arrays
    if X_fake.size == 0 or X_real.size == 0:
        raise ValueError("One or more feature sets are empty. Check input data and feature extraction logic.")

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=52, stratify=y_combined
    )

    # Apply SMOTE
    print("Applying SMOTE...")
    smote = SMOTE(random_state=52, sampling_strategy=0.8)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Debug shapes
    print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")

    # Initialize classifiers
    catboost = CatBoostClassifier(verbose=0)

    # Train classifier
    print("Training CatBoostClassifier...")
    catboost.fit(X_train, y_train)

    # Save model
    with open(CHECKPOINT_PATH, 'wb') as f:
        pickle.dump(catboost, f)

    # Evaluate model
    print("Evaluating model...")
    val_preds = catboost.predict(X_val)
    val_probs = catboost.predict_proba(X_val)[:, 1]

    print(classification_report(y_val, val_preds, zero_division=1))
    print(f"AUC-ROC: {roc_auc_score(y_val, val_probs):.4f}")

# Main execution
if __name__ == "__main__":
    train_and_evaluate_debug()

Configured GPU with memory growth.
Loading training features...


Validating and extracting features: 100%|██████████| 16/16 [00:01<00:00, 13.65it/s]
Validating and extracting features: 100%|██████████| 17090/17090 [00:03<00:00, 5653.23it/s]


Shape of X_fake: (0,)
Shape of X_real: (21875200,)
Error: X_fake is empty. Check the feature extraction for FAKE_TRAIN_FEATURES_PATH.


ValueError: One or more feature sets are empty. Check input data and feature extraction logic.

In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import classification_report, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# GPU Configuration: Set memory growth and limit to 11 GB
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
            tf.config.experimental.VirtualDeviceConfiguration(
                device,
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=11264)]  # Limit to 11 GB
            )
        print("Configured GPU with memory growth and 11 GB limit.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract feature vectors
def validate_and_extract(features):
    valid_features = []
    for entry in tqdm(features, desc="Validating and extracting features"):
        if isinstance(entry, list):
            valid_features.extend(
                [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
            )
        elif isinstance(entry, dict) and 'features' in entry:
            valid_features.append(entry['features'])
    return np.array(valid_features, dtype=np.float32)

# Function to evaluate the model
def evaluate_model():
    print("Loading validation features...")

    # Load validation features
    fake_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_features = load_features(REAL_VALID_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_val = np.vstack((X_fake, X_real))
    y_val = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_val = scaler.fit_transform(X_val)

    # Load the trained ensemble model
    print("Loading the trained model...")
    with open(CHECKPOINT_PATH, 'rb') as f:
        ensemble = pickle.load(f)

    # Make predictions
    print("Making predictions...")
    val_predictions = ensemble.predict(X_val)
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]

    # Evaluate metrics
    accuracy = np.mean(val_predictions == y_val)
    auc = roc_auc_score(y_val, val_probabilities)
    f1 = f1_score(y_val, val_predictions)
    precision = precision_score(y_val, val_predictions)
    recall = recall_score(y_val, val_predictions)

    # Print metrics
    print("Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Run evaluation
if __name__ == "__main__":
    evaluate_model()


Configured GPU with memory growth and 11 GB limit.
Loading validation features...


Validating and extracting features: 100%|██████████| 1524/1524 [00:00<00:00, 1432568.20it/s]
Validating and extracting features: 100%|██████████| 1548/1548 [00:00<00:00, 1024098.20it/s]


Loading the trained model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1548
         1.0       0.50      1.00      0.66      1524

    accuracy                           0.50      3072
   macro avg       0.25      0.50      0.33      3072
weighted avg       0.25      0.50      0.33      3072

Accuracy: 0.4961
AUC-ROC: 0.5414
F1 Score: 0.6632
Precision: 0.4961
Recall: 1.0000


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from skopt import BayesSearchCV
from joblib import Parallel, delayed
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout

# GPU Configuration: Set memory growth and limit to 11 GB
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("Configured GPU with memory growth and 11 GB limit.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract feature vectors
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    valid_features = Parallel(n_jobs=-1)(
        delayed(extract_feature)(entry) for entry in tqdm(features, desc="Validating and extracting features")
    )
    valid_features = [item for sublist in valid_features for item in sublist]  # Flatten list
    return np.array(valid_features, dtype=np.float32)

# Augment features for robustness
def augment_features(X, y, augment_factor=1):
    augmented_X, augmented_y = [], []
    for _ in tqdm(range(augment_factor), desc="Augmenting features"):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Function for Bayesian Optimization with memory-safe defaults
def optimize_model(model, param_grid, X_train, y_train, n_iter=5):
    try:
        bayes_search = BayesSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=3,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )
        bayes_search.fit(X_train, y_train)
        return bayes_search.best_estimator_
    except Exception as e:
        print(f"Optimization failed for {model.__class__.__name__}: {e}")
        return None

# Fix for XGBoost optimization
def optimize_xgboost(X_train, y_train):
    try:
        xgb_params = {
            'n_estimators': (50, 100),
            'learning_rate': (0.01, 0.2, 'log-uniform'),
            'max_depth': (3, 10),
            'colsample_bytree': (0.5, 1.0),
            'subsample': (0.5, 1.0)
        }
        xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        best_xgb = optimize_model(xgb_model, xgb_params, X_train, y_train)
        return best_xgb
    except Exception as e:
        print(f"XGBoost optimization failed: {e}")
        return None

# Updated function to optimize models sequentially
def optimize_models_sequentially(X_train, y_train):
    results = {}

    # Optimizing CatBoost
    print("Optimizing CatBoost...")
    try:
        catboost_params = {
            'depth': (4, 6),
            'learning_rate': (1e-3, 0.05, 'log-uniform'),
            'iterations': (50, 100)
        }
        results['catboost'] = optimize_model(
            CatBoostClassifier(verbose=0, task_type='CPU', thread_count=-1),
            catboost_params,
            X_train,
            y_train
        )
    except Exception as e:
        print(f"CatBoost optimization failed: {e}")

    # Optimizing XGBoost
    print("Optimizing XGBoost...")
    results['xgboost'] = optimize_xgboost(X_train, y_train)

    # Ensure at least one model succeeds
    successful_models = {k: v for k, v in results.items() if v is not None}
    if not successful_models:
        raise ValueError("Optimization incomplete: All models failed.")

    return successful_models

# Train and evaluate the ensemble model
def train_and_evaluate():
    print("Loading training features...")

    # Load features
    fake_features = load_features(FAKE_TRAIN_FEATURES_PATH)
    real_features = load_features(REAL_TRAIN_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=52, stratify=y_combined
    )

    # Augment training data
    print("Augmenting training data...")
    X_train, y_train = augment_features(X_train, y_train, augment_factor=1)

    # Debugging shapes
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_val:", X_val.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_val:", y_val.shape)

    # Load and configure ResNet-50 model
    print("Training ResNet-50 model...")
    base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
    for layer in base_model.layers:
        layer.trainable = False

    resnet_model = tf.keras.Sequential([
        base_model,
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    resnet_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # Example: Adjust X_train for ResNet-50 (if image data is used)
    # X_train_resized = resize_images(X_train)
    # resnet_model.fit(X_train_resized, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

    # Sequential model optimization
    optimized_models = optimize_models_sequentially(X_train, y_train)
    catboost = optimized_models.get('catboost')
    xgboost = optimized_models.get('xgboost')

    # Combine models into ensemble
    print("Creating ensemble model...")
    ensemble_estimators = []
    if catboost:
        ensemble_estimators.append(('catboost', catboost))
    if xgboost:
        ensemble_estimators.append(('xgboost', xgboost))

    ensemble = VotingClassifier(estimators=ensemble_estimators, voting='soft')

    # Train ensemble
    print("Training ensemble model...")
    for _ in tqdm(range(1), desc="Training Loop"):
        ensemble.fit(X_train, y_train)

    # Save ensemble model
    with open(CHECKPOINT_PATH, 'wb') as f:
        pickle.dump(ensemble, f)

    # Evaluate on validation set
    print("Evaluating the ensemble model...")
    val_predictions = ensemble.predict(X_val)
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]

    accuracy = np.mean(val_predictions == y_val)
    auc = roc_auc_score(y_val, val_probabilities)

    # Classification report
    print("Classification Report:")
    print(classification_report(y_val, val_predictions))

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

# Run training and evaluation
if __name__ == "__main__":
    train_and_evaluate()


Configured GPU with memory growth and 11 GB limit.
Loading training features...



Validating and extracting features:   0%|          | 0/16 [00:00<?, ?it/s][A
Validating and extracting features:  12%|█▎        | 2/16 [00:00<00:00, 18.76it/s][A
Validating and extracting features:  25%|██▌       | 4/16 [00:01<00:05,  2.32it/s][A
Validating and extracting features:  38%|███▊      | 6/16 [00:02<00:04,  2.44it/s][A
Validating and extracting features:  50%|█████     | 8/16 [00:03<00:03,  2.50it/s][A
Validating and extracting features:  62%|██████▎   | 10/16 [00:03<00:02,  2.44it/s][A
Validating and extracting features:  75%|███████▌  | 12/16 [00:04<00:01,  2.23it/s][A
Validating and extracting features:  88%|████████▊ | 14/16 [00:06<00:01,  1.67it/s][A
Validating and extracting features: 100%|██████████| 16/16 [00:07<00:00,  2.05it/s]

Validating and extracting features:   0%|          | 0/17090 [00:00<?, ?it/s][A
Validating and extracting features:  12%|█▏        | 2117/17090 [00:00<00:00, 21168.86it/s][A
Validating and extracting features:  36%|███▌      | 61

Augmenting training data...



Augmenting features:   0%|          | 0/1 [00:00<?, ?it/s][A
Augmenting features: 100%|██████████| 1/1 [00:17<00:00, 17.28s/it]


Shape of X_train: (101443, 1280)
Shape of X_val: (25361, 1280)
Shape of y_train: (101443,)
Shape of y_val: (25361,)
Training deep learning model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.9970 - loss: 0.0051 - val_accuracy: 0.9988 - val_loss: 0.0217
Epoch 2/10
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - accuracy: 0.9997 - loss: 0.0032 - val_accuracy: 1.0000 - val_loss: 3.2321e-04
Epoch 3/10
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 1.0000 - loss: 3.6690e-10 - val_accuracy: 1.0000 - val_loss: 3.2514e-04
Epoch 4/10
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 1.0000 - loss: 2.0417e-06 - val_accuracy: 0.9999 - val_loss: 0.0014
Epoch 5/10
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 1.0000 - loss: 1.1580e-05 - val_accuracy: 1.0000 - val_loss: 9.3084e-04
Epoch 6/10
[1m3171/3171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 1.0000 - loss: 5.9939e-07 - val_accuracy: 1.0000 - va



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits




Optimizing XGBoost...
Optimization failed for XGBClassifier: 'super' object has no attribute '__sklearn_tags__'
Creating ensemble model...
Training ensemble model...



Training Loop:   0%|          | 0/1 [00:00<?, ?it/s][A
Training Loop: 100%|██████████| 1/1 [02:09<00:00, 129.97s/it]


Evaluating the ensemble model...
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3418
         1.0       1.00      1.00      1.00     21943

    accuracy                           1.00     25361
   macro avg       1.00      1.00      1.00     25361
weighted avg       1.00      1.00      1.00     25361

Accuracy: 0.9997
AUC-ROC: 1.0000


In [None]:
import numpy as np
import pickle
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# File paths
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"
OUTPUT_SCORES_PATH = "drive/MyDrive/SP_cup/results/validation_scores.txt"

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract feature vectors and file IDs
def validate_and_extract(features, label):
    valid_features = []
    valid_file_ids = []
    labels = []

    for entry in tqdm(features, desc="Validating and extracting features"):
        if isinstance(entry, dict) and 'features' in entry and 'image_name' in entry:
            valid_features.append(entry['features'])
            # Extract the fileID from the 'image_name' field
            file_id = os.path.basename(entry['image_name'])
            valid_file_ids.append(file_id)
            # Assign the label (1 for real, 0 for fake)
            labels.append(label)

    return np.array(valid_features, dtype=np.float32), valid_file_ids, labels

# Generate evaluation set scores
def generate_scores():
    print("Loading validation features...")

    # Load validation features
    fake_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_features = load_features(REAL_VALID_FEATURES_PATH)

    # Extract features, file IDs, and labels
    X_fake, fake_file_ids, y_fake = validate_and_extract(fake_features, label=0)
    X_real, real_file_ids, y_real = validate_and_extract(real_features, label=1)

    # Combine data and labels
    X_val = np.vstack((X_fake, X_real))
    y_val = np.hstack((y_fake, y_real))
    file_ids = fake_file_ids + real_file_ids

    # Normalize features
    scaler = StandardScaler()
    X_val = scaler.fit_transform(X_val)

    # Load the trained ensemble model
    print("Loading the trained model...")
    with open(CHECKPOINT_PATH, 'rb') as f:
        ensemble = pickle.load(f)

    # Generate probabilities (scores)
    print("Generating evaluation scores...")
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]

    # Adjust scores to ensure real images have higher values and fake images lower
    scores = [prob if label == 1 else 1 - prob for prob, label in zip(val_probabilities, y_val)]

    # Save scores to the output file
    print(f"Saving scores to {OUTPUT_SCORES_PATH}...")
    with open(OUTPUT_SCORES_PATH, 'w') as f:
        for file_id, score in zip(file_ids, scores):
            f.write(f"{file_id}\t{score:.6f}\n")

    print("Scores saved successfully.")

# Run score generation
if __name__ == "__main__":
    generate_scores()


Loading validation features...


Validating and extracting features: 100%|██████████| 1524/1524 [00:00<00:00, 1134360.12it/s]
Validating and extracting features: 100%|██████████| 1548/1548 [00:00<00:00, 678380.80it/s]

Loading the trained model...





Generating evaluation scores...
Saving scores to drive/MyDrive/SP_cup/results/validation_scores.txt...
Scores saved successfully.


In [None]:
from sklearn.metrics import accuracy_score, classification_report

def generate_scores_and_evaluate():
    print("Loading validation features...")

    # Load validation features
    fake_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_features = load_features(REAL_VALID_FEATURES_PATH)

    # Extract features, file IDs, and labels
    X_fake, fake_file_ids, y_fake = validate_and_extract(fake_features, label=0)
    X_real, real_file_ids, y_real = validate_and_extract(real_features, label=1)

    # Combine data and labels
    X_val = np.vstack((X_fake, X_real))
    y_val = np.hstack((y_fake, y_real))
    file_ids = fake_file_ids + real_file_ids

    # Normalize features
    scaler = StandardScaler()
    X_val = scaler.fit_transform(X_val)

    # Load the trained ensemble model
    print("Loading the trained model...")
    with open(CHECKPOINT_PATH, 'rb') as f:
        ensemble = pickle.load(f)

    # Generate probabilities (scores) and predicted labels
    print("Generating evaluation scores...")
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]
    predicted_labels = (val_probabilities > 0.5).astype(int)

    # Adjust scores to ensure real images have higher values and fake images lower
    scores = [prob if label == 1 else 1 - prob for prob, label in zip(val_probabilities, y_val)]

    # Save scores to the output file
    print(f"Saving scores to {OUTPUT_SCORES_PATH}...")
    with open(OUTPUT_SCORES_PATH, 'w') as f:
        for file_id, score in zip(file_ids, scores):
            f.write(f"{file_id}\t{score:.6f}\n")

    print("Scores saved successfully.")

    # Evaluate model performance
    accuracy = accuracy_score(y_val, predicted_labels)
    print("Classification Report:")
    print(classification_report(y_val, predicted_labels))
    print(f"Accuracy: {accuracy:.4f}")

# Run the evaluation
if __name__ == "__main__":
    generate_scores_and_evaluate()


Loading validation features...


Validating and extracting features: 100%|██████████| 1524/1524 [00:00<00:00, 189823.58it/s]
Validating and extracting features: 100%|██████████| 1548/1548 [00:00<00:00, 328981.69it/s]

Loading the trained model...
Generating evaluation scores...





Saving scores to drive/MyDrive/SP_cup/results/validation_scores.txt...
Scores saved successfully.
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1524
           1       0.50      1.00      0.67      1548

    accuracy                           0.50      3072
   macro avg       0.25      0.50      0.34      3072
weighted avg       0.25      0.50      0.34      3072

Accuracy: 0.5039


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pickle

# Paths to your .pkl files
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'

def inspect_pkl(file_path):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    print(f"Inspecting {file_path}...")
    # Display a sample of the data
    if isinstance(data, list):
        print("Sample entry from list:")
        print(data[0])  # Adjust index if needed
    elif isinstance(data, dict):
        print("Keys in the dictionary:")
        print(data.keys())
        print("Sample entry:")
        print(data)
    else:
        print("Unexpected data type:", type(data))

# Inspect both files
inspect_pkl(FAKE_VALID_FEATURES_PATH)
inspect_pkl(REAL_VALID_FEATURES_PATH)


Inspecting drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl...
Sample entry from list:
{'image_name': 'drive/MyDrive/validation/fake_valid/fake/valid_fake_0110265.png', 'features': array([-0.1473,  0.3027, -0.0877, ...,  0.128 , -0.0487,  0.1984],
      dtype=float16)}
Inspecting drive/MyDrive/SP_cup/features/spatial_valid_real.pkl...
Sample entry from list:
{'image_name': 'drive/MyDrive/validation/real_valid/real/valid_real_0611952.png', 'features': array([-0.1267 , -0.02531, -0.1095 , ...,  0.3542 , -0.0885 ,  0.1219 ],
      dtype=float16)}


In [None]:
pip install joblib



In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from multiprocessing import Pool, cpu_count

# GPU Configuration
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("Configured GPU with memory growth.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Helper function for feature extraction
def extract_features(entry):
    if isinstance(entry, dict):
        return entry.get('features', [])
    return []

# Extract features with validation
def validate_and_extract(features):
    with Pool(cpu_count()) as pool:
        valid_features = list(tqdm(pool.imap(extract_features, features), total=len(features), desc="Validating and extracting features"))
    return np.array([item for sublist in valid_features for item in sublist], dtype=np.float32)

# Augment features
def augment_features(X, y, augment_factor=1):
    augmented_X, augmented_y = [], []
    for _ in tqdm(range(augment_factor), desc="Augmenting features"):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Train and evaluate ensemble model
def train_and_evaluate():
    print("Loading training features...")

    # Load features
    X_fake = validate_and_extract(load_features(FAKE_TRAIN_FEATURES_PATH))
    X_real = validate_and_extract(load_features(REAL_TRAIN_FEATURES_PATH))

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=52, stratify=y_combined
    )

    # Apply SMOTE
    print("Applying SMOTE...")
    smote = SMOTE(random_state=52)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Augment training data
    print("Augmenting training data...")
    X_train, y_train = augment_features(X_train, y_train, augment_factor=1)

    # Debug shapes
    print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")

    # Initialize classifiers with default parameters
    catboost = CatBoostClassifier(verbose=0)

    # Train classifiers
    print("Training CatBoostClassifier...")
    catboost.fit(X_train, y_train)

    # Save model
    with open(CHECKPOINT_PATH, 'wb') as f:
        pickle.dump(catboost, f)

    # Evaluate model
    val_preds = catboost.predict(X_val)
    val_probs = catboost.predict_proba(X_val)[:, 1]
    print(classification_report(y_val, val_preds))
    print(f"AUC-ROC: {roc_auc_score(y_val, val_probs):.4f}")

# Main execution
if __name__ == "__main__":
    train_and_evaluate()

Configured GPU with memory growth.
Loading training features...


Validating and extracting features: 100%|██████████| 16/16 [00:01<00:00,  9.87it/s]
Validating and extracting features: 100%|██████████| 17090/17090 [00:03<00:00, 5199.23it/s]


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 0 and the array at index 1 has size 21875200