In [None]:
# Mount Google Drive at /content/drive; the feature pickles and the checkpoint
# directory used later in this notebook live under its MyDrive folder.
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: 

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [None]:
pip install --upgrade scikit-learn xgboost

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.6.1


In [None]:
import numpy as np
import pickle
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from skopt import BayesSearchCV
from imblearn.over_sampling import SMOTE
from joblib import Parallel, delayed
from multiprocessing import Pool, cpu_count

# GPU configuration: ask TensorFlow to grow GPU memory on demand on every
# visible GPU; fall back to a CPU notice when none is listed.
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("No GPU detected, running on CPU.")
else:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
        print("Configured GPU with memory growth.")
    except Exception as e:
        # Best effort: report the problem and continue with default settings.
        print(f"Error configuring GPU: {e}")

# File paths. These are relative, so they resolve against the current working
# directory. NOTE(review): Drive is mounted at '/content/drive' earlier in the
# notebook, so this presumably relies on the working directory being /content — confirm.
FAKE_TRAIN_FEATURES_PATH = 'drive/MyDrive/features/standardized_fake_train.pkl'
REAL_TRAIN_FEATURES_PATH = 'drive/MyDrive/features/standardized_real_train.pkl'
CHECKPOINT_PATH = "drive/MyDrive/checkpoints/ensemble_model.pkl"
# Create the checkpoint directory up front so saving the model cannot fail on a missing folder.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Load a pickled feature collection from disk
def load_features(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on files
        from a trusted source.
    """
    with open(file_path, 'rb') as stream:
        payload = pickle.load(stream)
    return payload

# Extract features with validation
def validate_and_extract(features):
    """Flatten the per-entry ``'features'`` lists into a single float32 array.

    Each element of ``features`` that is a dict contributes every item of its
    ``'features'`` value (nothing if the key is absent); non-dict elements are
    skipped.

    The extraction runs in-process. A lambda cannot be pickled and sent to
    ``multiprocessing.Pool`` workers, and the per-entry work is only a dict
    lookup, so a worker pool would add overhead without any benefit.

    Args:
        features: Sized iterable of entries, typically the unpickled feature file.

    Returns:
        ``np.ndarray`` of dtype float32 holding all collected items, in input
        order. Its shape is ``(0,)`` when nothing was collected.
    """
    collected = []
    for entry in tqdm(features, total=len(features),
                      desc="Validating and extracting features"):
        if isinstance(entry, dict):
            collected.extend(entry.get('features', []))
    return np.array(collected, dtype=np.float32)

# Build noisy, rescaled copies of the feature matrix
def augment_features(X, y, augment_factor=1):
    """Return ``augment_factor`` perturbed copies of ``X`` with matching labels.

    Each copy is ``(X + noise) * scale``. ``noise`` is drawn element-wise from
    a normal distribution (mean 0, std 0.01) and ``scale`` from a uniform
    distribution on [0.9, 1.1). The unperturbed ``X`` is not part of the result.

    Args:
        X: Feature array (anything exposing ``.shape`` that supports arithmetic).
        y: Labels for ``X``; repeated once per generated copy.
        augment_factor: Number of perturbed copies to generate.

    Returns:
        Tuple ``(X_aug, y_aug)``: the copies stacked vertically and the labels
        concatenated in the same order.
    """
    def _perturbed_copy():
        # Draw order (noise first, then scale) matters for seeded reproducibility.
        jitter = np.random.normal(0, 0.01, X.shape)
        gain = np.random.uniform(0.9, 1.1, X.shape)
        return (X + jitter) * gain

    copies = [
        _perturbed_copy()
        for _ in tqdm(range(augment_factor), desc="Augmenting features")
    ]
    labels = [y] * augment_factor
    return np.vstack(copies), np.hstack(labels)

# Hyperparameter search for a single estimator
def optimize_model(model, param_grid, X_train, y_train, n_iter=2):
    """Tune ``model`` with ``BayesSearchCV`` and return the best estimator.

    The search uses 3-fold cross-validation, ROC AUC scoring, all available
    cores (``n_jobs=-1``) and ``n_iter`` sampled parameter settings.

    Args:
        model: Estimator to tune.
        param_grid: Search space passed to ``BayesSearchCV``.
        X_train: Training features.
        y_train: Training labels.
        n_iter: Number of parameter settings to evaluate.

    Returns:
        The refitted best estimator, or ``None`` if the search raised. The
        failure is printed rather than propagated.
    """
    search_settings = {
        'n_iter': n_iter,
        'cv': 3,
        'scoring': 'roc_auc',
        'n_jobs': -1,
        'verbose': 1,
    }
    try:
        tuner = BayesSearchCV(model, param_grid, **search_settings)
        tuner.fit(X_train, y_train)
        best_model = tuner.best_estimator_
    except Exception as err:
        # Best effort: the caller builds the ensemble from whichever models succeeded.
        print(f"Optimization failed for {model.__class__.__name__}: {err}")
        best_model = None
    return best_model

# Force-restart the runtime by killing the current process
def restart_session():
    """Print a notice, then send signal 9 to this process.

    When the signal is delivered the process terminates immediately, so no
    cleanup handlers run and in-memory state is lost.
    """
    print("Restarting session to avoid crashes...")
    current_pid = os.getpid()
    os.kill(current_pid, 9)

# Release memory held by Python garbage and the Keras backend
def manage_memory():
    """Run a garbage-collection pass, then clear the Keras backend session."""
    import gc as garbage_collector

    garbage_collector.collect()
    keras_backend = tf.keras.backend
    keras_backend.clear_session()

# Train and evaluate ensemble model with session restart
def train_and_evaluate_with_restarts():
    """Train, checkpoint and evaluate a CatBoost/XGBoost soft-voting ensemble.

    Steps:
        1. Load the fake (label 1) and real (label 0) training features.
        2. Make a stratified 80/20 train/validation split.
        3. Standardize using statistics from the training split only.
        4. Oversample the training split with SMOTE, then perturb it with
           ``augment_features``.
        5. Tune each base model with ``optimize_model`` and fit a soft-voting
           ensemble from the models that tuned successfully.
        6. Pickle the ensemble to ``CHECKPOINT_PATH`` and print the
           classification report and ROC AUC on the validation split.

    Returns:
        None. Any exception, including ``MemoryError``, is printed and
        followed by ``restart_session()``, which kills the current process.
    """
    try:
        print("Loading training features...")

        # Load features
        X_fake = validate_and_extract(load_features(FAKE_TRAIN_FEATURES_PATH))
        X_real = validate_and_extract(load_features(REAL_TRAIN_FEATURES_PATH))

        # Create labels: fake -> 1, real -> 0
        y_fake = np.ones(len(X_fake))
        y_real = np.zeros(len(X_real))

        # Combine data and labels
        X_combined = np.vstack((X_fake, X_real))
        y_combined = np.hstack((y_fake, y_real))

        # Split BEFORE scaling so validation rows never influence the scaler.
        X_train, X_val, y_train, y_val = train_test_split(
            X_combined, y_combined, test_size=0.2, random_state=52, stratify=y_combined
        )

        # Normalize features: fit on the training split only, then apply the
        # same transform to the validation split (avoids data leakage).
        # NOTE(review): the fitted scaler is not stored with the checkpoint;
        # inference code must apply an equivalent transform — confirm.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        # Apply SMOTE (training split only, so validation stays untouched)
        print("Applying SMOTE...")
        smote = SMOTE(random_state=52)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # Augment training data
        print("Augmenting training data...")
        X_train, y_train = augment_features(X_train, y_train, augment_factor=1)

        # Debug shapes
        print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")

        # Optimizing models
        print("Optimizing models...")
        catboost_params = {'depth': (4, 6), 'learning_rate': (1e-3, 0.05, 'log-uniform'), 'iterations': (50, 100)}
        xgboost_params = {'n_estimators': (50, 100), 'learning_rate': (0.01, 0.2, 'log-uniform'), 'max_depth': (3, 10)}

        catboost_model = optimize_model(CatBoostClassifier(verbose=0), catboost_params, X_train, y_train)
        xgboost_model = optimize_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgboost_params, X_train, y_train)

        # Create and train ensemble
        print("Training ensemble model...")
        # optimize_model signals failure with None; test for that explicitly
        # rather than relying on the truthiness of an estimator object.
        ensemble_estimators = [
            (name, tuned)
            for name, tuned in (('catboost', catboost_model), ('xgboost', xgboost_model))
            if tuned is not None
        ]

        if not ensemble_estimators:
            raise ValueError("No models were successfully optimized.")

        ensemble = VotingClassifier(estimators=ensemble_estimators, voting='soft')
        ensemble.fit(X_train, y_train)

        # Save model
        with open(CHECKPOINT_PATH, 'wb') as f:
            pickle.dump(ensemble, f)

        # Evaluate model
        val_preds = ensemble.predict(X_val)
        val_probs = ensemble.predict_proba(X_val)[:, 1]
        print(classification_report(y_val, val_preds))
        print(f"AUC-ROC: {roc_auc_score(y_val, val_probs):.4f}")

    except MemoryError:
        print("MemoryError detected! Restarting session...")
        restart_session()
    except Exception as e:
        # Any other failure is reported, then the process is killed as well.
        print(f"Error occurred: {e}")
        restart_session()

# Main execution: run the full training pipeline when this code is executed
# as the top-level program (the guard is skipped when the code is imported as a module).
if __name__ == "__main__":
    train_and_evaluate_with_restarts()


No GPU detected, running on CPU.
Loading training features...


Validating and extracting features:   0%|          | 0/16 [00:00<?, ?it/s]