In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [3]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [4]:
pip install --upgrade scikit-learn xgboost

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.6.1


In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from skopt import BayesSearchCV

# GPU Configuration: Set memory growth and limit to 11 GB
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
            tf.config.experimental.VirtualDeviceConfiguration(
                device,
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=11264)]  # Limit to 11 GB
            )
        print("Configured GPU with memory growth and 11 GB limit.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract feature vectors
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    valid_features = []
    for entry in features:
        valid_features.extend(extract_feature(entry))
    return np.array(valid_features, dtype=np.float32)

# Augment features for robustness
def augment_features(X, y, augment_factor=1):
    augmented_X, augmented_y = [], []
    for _ in range(augment_factor):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Function for Bayesian Optimization with memory-safe defaults
def optimize_model(model, param_grid, X_train, y_train, n_iter=5):
    try:
        bayes_search = BayesSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=3,
            scoring='roc_auc',
            n_jobs=1,  # Reduce to 1 to avoid memory overload
            verbose=1
        )
        bayes_search.fit(X_train, y_train)
        return bayes_search.best_estimator_
    except Exception as e:
        print(f"Optimization failed for {model.__class__.__name__}: {e}")
        return None

# Sequential optimization
def optimize_models_sequentially(X_train, y_train):
    results = {}

    # Optimizing CatBoost
    print("Optimizing CatBoost...")
    try:
        catboost_params = {
            'depth': (4, 6),  # Narrow range for fewer resources
            'learning_rate': (1e-3, 0.05, 'log-uniform'),
            'iterations': (50, 150)  # Reduced iterations
        }
        results['catboost'] = optimize_model(
            CatBoostClassifier(verbose=0, task_type='GPU'),
            catboost_params,
            X_train,
            y_train
        )
    except Exception as e:
        print(f"CatBoost optimization failed: {e}")

    # Ensure at least one model succeeds
    if not results['catboost']:
        raise ValueError("Optimization incomplete: All models failed.")

    return {k: v for k, v in results.items() if v is not None}

# Train and evaluate the ensemble model
def train_and_evaluate():
    print("Loading training features...")

    # Load features
    fake_features = load_features(FAKE_TRAIN_FEATURES_PATH)
    real_features = load_features(REAL_TRAIN_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
    )

    # Augment training data
    print("Augmenting training data...")
    X_train, y_train = augment_features(X_train, y_train, augment_factor=1)  # Reduced factor

    # Debugging shapes
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_val:", X_val.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_val:", y_val.shape)

    # Sequential model optimization
    optimized_models = optimize_models_sequentially(X_train, y_train)
    catboost = optimized_models['catboost']

    # Combine models into ensemble
    print("Creating ensemble model...")
    ensemble = VotingClassifier(estimators=[
        ('catboost', catboost)
    ], voting='soft')

    # Train ensemble
    print("Training ensemble model...")
    for _ in tqdm(range(1), desc="Training Loop"):
        ensemble.fit(X_train, y_train)

    # Save ensemble model
    with open(CHECKPOINT_PATH, 'wb') as f:
        pickle.dump(ensemble, f)

    # Evaluate on validation set
    print("Evaluating the ensemble model...")
    val_predictions = ensemble.predict(X_val)
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]

    accuracy = np.mean(val_predictions == y_val)
    auc = roc_auc_score(y_val, val_probabilities)

    # Classification report
    print("Classification Report:")
    print(classification_report(y_val, val_predictions))

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

# Run training and evaluation
if __name__ == "__main__":
    train_and_evaluate()


Configured GPU with memory growth and 11 GB limit.
Loading training features...
Augmenting training data...


In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import classification_report, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# GPU Configuration: Set memory growth and limit to 11 GB
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
            tf.config.experimental.VirtualDeviceConfiguration(
                device,
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=11264)]  # Limit to 11 GB
            )
        print("Configured GPU with memory growth and 11 GB limit.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract feature vectors
def validate_and_extract(features):
    valid_features = []
    for entry in tqdm(features, desc="Validating and extracting features"):
        if isinstance(entry, list):
            valid_features.extend(
                [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
            )
        elif isinstance(entry, dict) and 'features' in entry:
            valid_features.append(entry['features'])
    return np.array(valid_features, dtype=np.float32)

# Function to evaluate the model
def evaluate_model():
    print("Loading validation features...")

    # Load validation features
    fake_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_features = load_features(REAL_VALID_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_val = np.vstack((X_fake, X_real))
    y_val = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_val = scaler.fit_transform(X_val)

    # Load the trained ensemble model
    print("Loading the trained model...")
    with open(CHECKPOINT_PATH, 'rb') as f:
        ensemble = pickle.load(f)

    # Make predictions
    print("Making predictions...")
    val_predictions = ensemble.predict(X_val)
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]

    # Evaluate metrics
    accuracy = np.mean(val_predictions == y_val)
    auc = roc_auc_score(y_val, val_probabilities)
    f1 = f1_score(y_val, val_predictions)
    precision = precision_score(y_val, val_predictions)
    recall = recall_score(y_val, val_predictions)

    # Print metrics
    print("Classification Report:")
    print(classification_report(y_val, val_predictions))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Run evaluation
if __name__ == "__main__":
    evaluate_model()


Configured GPU with memory growth and 11 GB limit.
Loading validation features...



Validating and extracting features: 100%|██████████| 1524/1524 [00:00<00:00, 770227.65it/s]

Validating and extracting features: 100%|██████████| 1548/1548 [00:00<00:00, 821663.20it/s]


Loading the trained model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.97      0.95      1548
         1.0       0.97      0.94      0.95      1524

    accuracy                           0.95      3072
   macro avg       0.95      0.95      0.95      3072
weighted avg       0.95      0.95      0.95      3072

Accuracy: 0.9538
AUC-ROC: 0.9933
F1 Score: 0.9525
Precision: 0.9707
Recall: 0.9350


In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from skopt import BayesSearchCV
from joblib import Parallel, delayed
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout

# GPU Configuration: Set memory growth and limit to 11 GB
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("Configured GPU with memory growth and 11 GB limit.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
FAKE_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_fake.pkl'
REAL_VALID_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/spatial_valid_real.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Validate and extract feature vectors
def validate_and_extract(features):
    def extract_feature(entry):
        if isinstance(entry, list):
            return [sub_entry['features'] for sub_entry in entry if isinstance(sub_entry, dict) and 'features' in sub_entry]
        elif isinstance(entry, dict) and 'features' in entry:
            return [entry['features']]
        return []

    valid_features = Parallel(n_jobs=-1)(
        delayed(extract_feature)(entry) for entry in tqdm(features, desc="Validating and extracting features")
    )
    valid_features = [item for sublist in valid_features for item in sublist]  # Flatten list
    return np.array(valid_features, dtype=np.float32)

# Augment features for robustness
def augment_features(X, y, augment_factor=1):
    augmented_X, augmented_y = [], []
    for _ in tqdm(range(augment_factor), desc="Augmenting features"):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Function for Bayesian Optimization with memory-safe defaults
def optimize_model(model, param_grid, X_train, y_train, n_iter=5):
    try:
        bayes_search = BayesSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=3,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )
        bayes_search.fit(X_train, y_train)
        return bayes_search.best_estimator_
    except Exception as e:
        print(f"Optimization failed for {model.__class__.__name__}: {e}")
        return None

# Fix for XGBoost optimization
def optimize_xgboost(X_train, y_train):
    try:
        xgb_params = {
            'n_estimators': (50, 100),
            'learning_rate': (0.01, 0.2, 'log-uniform'),
            'max_depth': (3, 10),
            'colsample_bytree': (0.5, 1.0),
            'subsample': (0.5, 1.0)
        }
        xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        best_xgb = optimize_model(xgb_model, xgb_params, X_train, y_train)
        return best_xgb
    except Exception as e:
        print(f"XGBoost optimization failed: {e}")
        return None

# Updated function to optimize models sequentially
def optimize_models_sequentially(X_train, y_train):
    results = {}

    # Optimizing CatBoost
    print("Optimizing CatBoost...")
    try:
        catboost_params = {
            'depth': (4, 6),
            'learning_rate': (1e-3, 0.05, 'log-uniform'),
            'iterations': (50, 100)
        }
        results['catboost'] = optimize_model(
            CatBoostClassifier(verbose=0, task_type='CPU', thread_count=-1),
            catboost_params,
            X_train,
            y_train
        )
    except Exception as e:
        print(f"CatBoost optimization failed: {e}")

    # Optimizing XGBoost
    print("Optimizing XGBoost...")
    results['xgboost'] = optimize_xgboost(X_train, y_train)

    # Ensure at least one model succeeds
    successful_models = {k: v for k, v in results.items() if v is not None}
    if not successful_models:
        raise ValueError("Optimization incomplete: All models failed.")

    return successful_models

# Train and evaluate the ensemble model
def train_and_evaluate():
    print("Loading training features...")

    # Load features
    fake_features = load_features(FAKE_VALID_FEATURES_PATH)
    real_features = load_features(REAL_VALID_FEATURES_PATH)

    # Validate and extract feature vectors
    X_fake = validate_and_extract(fake_features)
    X_real = validate_and_extract(real_features)

    # Create labels
    y_fake = np.ones(len(X_fake))
    y_real = np.zeros(len(X_real))

    # Combine data and labels
    X_combined = np.vstack((X_fake, X_real))
    y_combined = np.hstack((y_fake, y_real))

    # Normalize features
    scaler = StandardScaler()
    X_combined = scaler.fit_transform(X_combined)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=52, stratify=y_combined
    )

    # Augment training data
    print("Augmenting training data...")
    X_train, y_train = augment_features(X_train, y_train, augment_factor=1)

    # Debugging shapes
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_val:", X_val.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_val:", y_val.shape)

    # Load and configure ResNet-50 model
    print("Training ResNet-50 model...")
    base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
    for layer in base_model.layers:
        layer.trainable = False

    resnet_model = tf.keras.Sequential([
        base_model,
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    resnet_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # Example: Adjust X_train for ResNet-50 (if image data is used)
    # X_train_resized = resize_images(X_train)
    # resnet_model.fit(X_train_resized, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

    # Sequential model optimization
    optimized_models = optimize_models_sequentially(X_train, y_train)
    catboost = optimized_models.get('catboost')
    xgboost = optimized_models.get('xgboost')

    # Combine models into ensemble
    print("Creating ensemble model...")
    ensemble_estimators = []
    if catboost:
        ensemble_estimators.append(('catboost', catboost))
    if xgboost:
        ensemble_estimators.append(('xgboost', xgboost))

    ensemble = VotingClassifier(estimators=ensemble_estimators, voting='soft')

    # Train ensemble
    print("Training ensemble model...")
    for _ in tqdm(range(1), desc="Training Loop"):
        ensemble.fit(X_train, y_train)

    # Save ensemble model
    with open(CHECKPOINT_PATH, 'wb') as f:
        pickle.dump(ensemble, f)

    # Evaluate on validation set
    print("Evaluating the ensemble model...")
    val_predictions = ensemble.predict(X_val)
    val_probabilities = ensemble.predict_proba(X_val)[:, 1]

    accuracy = np.mean(val_predictions == y_val)
    auc = roc_auc_score(y_val, val_probabilities)

    # Classification report
    print("Classification Report:")
    print(classification_report(y_val, val_predictions))

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

# Run training and evaluation
if __name__ == "__main__":
    train_and_evaluate()


Configured GPU with memory growth and 11 GB limit.
Loading training features...



Validating and extracting features:   0%|          | 0/1524 [00:00<?, ?it/s][A
Validating and extracting features:   0%|          | 2/1524 [00:00<01:19, 19.06it/s][A
Validating and extracting features:   0%|          | 4/1524 [00:00<02:24, 10.55it/s][A
Validating and extracting features: 100%|██████████| 1524/1524 [00:00<00:00, 3237.10it/s]

Validating and extracting features: 100%|██████████| 1548/1548 [00:00<00:00, 21309.47it/s]


Augmenting training data...



Augmenting features:   0%|          | 0/1 [00:00<?, ?it/s][A
Augmenting features: 100%|██████████| 1/1 [00:00<00:00,  7.25it/s]

Shape of X_train: (2457, 1280)
Shape of X_val: (615, 1280)
Shape of y_train: (2457,)
Shape of y_val: (615,)
Training ResNet-50 model...





Optimizing CatBoost...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Optimizing XGBoost...
Optimization failed for XGBClassifier: 'super' object has no attribute '__sklearn_tags__'
Creating ensemble model...
Training ensemble model...



Training Loop:   0%|          | 0/1 [00:00<?, ?it/s][A
Training Loop: 100%|██████████| 1/1 [00:27<00:00, 27.75s/it]

Evaluating the ensemble model...
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94       310
         1.0       0.96      0.91      0.93       305

    accuracy                           0.94       615
   macro avg       0.94      0.94      0.94       615
weighted avg       0.94      0.94      0.94       615

Accuracy: 0.9366
AUC-ROC: 0.9847





In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from skopt import BayesSearchCV
from imblearn.over_sampling import SMOTE
from joblib import Parallel, delayed
from multiprocessing import Pool, cpu_count

# GPU Configuration
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("Configured GPU with memory growth.")
    except Exception as e:
        print(f"Error configuring GPU: {e}")
else:
    print("No GPU detected, running on CPU.")

# File paths
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/SP_cup/features/standardized_real_train.pkl'
CHECKPOINT_PATH = "drive/MyDrive/SP_cup/checkpoints/ensemble_model.pkl"
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Function to load features
def load_features(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Extract features with validation
def validate_and_extract(features):
    with Pool(cpu_count()) as pool:
        valid_features = list(tqdm(pool.imap(lambda f: f.get('features', []) if isinstance(f, dict) else [], features),
                                   total=len(features), desc="Validating and extracting features"))
    return np.array([item for sublist in valid_features for item in sublist], dtype=np.float32)

# Augment features
def augment_features(X, y, augment_factor=1):
    augmented_X, augmented_y = [], []
    for _ in tqdm(range(augment_factor), desc="Augmenting features"):
        noise = np.random.normal(0, 0.01, X.shape)
        scale = np.random.uniform(0.9, 1.1, X.shape)
        X_augmented = X + noise
        X_augmented *= scale
        augmented_X.append(X_augmented)
        augmented_y.append(y)
    return np.vstack(augmented_X), np.hstack(augmented_y)

# Optimize models
def optimize_model(model, param_grid, X_train, y_train, n_iter=2):
    try:
        search = BayesSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=3,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )
        search.fit(X_train, y_train)
        return search.best_estimator_
    except Exception as e:
        print(f"Optimization failed for {model.__class__.__name__}: {e}")
        return None

# Restart session
def restart_session():
    print("Restarting session to avoid crashes...")
    os.kill(os.getpid(), 9)

# Manage memory
def manage_memory():
    import gc
    gc.collect()
    tf.keras.backend.clear_session()

# Train and evaluate ensemble model with session restart
def train_and_evaluate_with_restarts():
    try:
        print("Loading training features...")

        # Load features
        X_fake = validate_and_extract(load_features(FAKE_TRAIN_FEATURES_PATH))
        X_real = validate_and_extract(load_features(REAL_TRAIN_FEATURES_PATH))

        # Create labels
        y_fake = np.ones(len(X_fake))
        y_real = np.zeros(len(X_real))

        # Combine data and labels
        X_combined = np.vstack((X_fake, X_real))
        y_combined = np.hstack((y_fake, y_real))

        # Normalize features
        scaler = StandardScaler()
        X_combined = scaler.fit_transform(X_combined)

        # Split into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            X_combined, y_combined, test_size=0.2, random_state=52, stratify=y_combined
        )

        # Apply SMOTE
        print("Applying SMOTE...")
        smote = SMOTE(random_state=52)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # Augment training data
        print("Augmenting training data...")
        X_train, y_train = augment_features(X_train, y_train, augment_factor=1)

        # Debug shapes
        print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")

        # Optimizing models
        print("Optimizing models...")
        catboost_params = {'depth': (4, 6), 'learning_rate': (1e-3, 0.05, 'log-uniform'), 'iterations': (50, 100)}
        xgboost_params = {'n_estimators': (50, 100), 'learning_rate': (0.01, 0.2, 'log-uniform'), 'max_depth': (3, 10)}

        catboost = optimize_model(CatBoostClassifier(verbose=0), catboost_params, X_train, y_train)
        xgboost = optimize_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgboost_params, X_train, y_train)

        # Create and train ensemble
        print("Training ensemble model...")
        ensemble_estimators = []
        if catboost: ensemble_estimators.append(('catboost', catboost))
        if xgboost: ensemble_estimators.append(('xgboost', xgboost))

        if not ensemble_estimators:
            raise ValueError("No models were successfully optimized.")

        ensemble = VotingClassifier(estimators=ensemble_estimators, voting='soft')
        ensemble.fit(X_train, y_train)

        # Save model
        with open(CHECKPOINT_PATH, 'wb') as f:
            pickle.dump(ensemble, f)

        # Evaluate model
        val_preds = ensemble.predict(X_val)
        val_probs = ensemble.predict_proba(X_val)[:, 1]
        print(classification_report(y_val, val_preds))
        print(f"AUC-ROC: {roc_auc_score(y_val, val_probs):.4f}")

    except MemoryError:
        print("MemoryError detected! Restarting session...")
        restart_session()
    except Exception as e:
        print(f"Error occurred: {e}")
        restart_session()

# Main execution
if __name__ == "__main__":
    train_and_evaluate_with_restarts()


Configured GPU with memory growth.
Loading training features...


Validating and extracting features:   0%|          | 0/16 [00:00<?, ?it/s]