In [1]:
# ===== Basic Python Libraries =====
import pandas as pd
import numpy as np
import random
import time
import joblib
import pickle

# ===== Scikit-learn: Semi-Supervised Models =====
from sklearn.semi_supervised import LabelPropagation, LabelSpreading, SelfTrainingClassifier

# ===== Scikit-learn: Base Classifiers =====
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ===== Scikit-learn: Preprocessing =====
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ===== Scikit-learn: Data Splitting =====
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, train_test_split

# ===== Scikit-learn: Model Evaluation =====
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix
)

# ===== Scikit-learn: Utilities =====
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import clone

# ===== Visualization Libraries =====
import matplotlib.pyplot as plt
import seaborn as sns

# ===== Scikit-learn: Hyperparameter Tuning =====
from sklearn.model_selection import GridSearchCV, ParameterGrid

In [2]:
TRAIN_DATASET_PATH = 'Dataset/train_data.csv'
TEST_DATASET_PATH = 'Dataset/test_data.csv'
TARGET_COLUMN = 'Label'

Load Data

In [3]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_test = pd.read_csv(TEST_DATASET_PATH)

Preprocessing Data

In [4]:
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

def preprocess_dataframe(df, columns_to_drop, target_column, desired_sample_size):
    df_processed = df.drop(columns=columns_to_drop, errors='ignore').copy()

    # Handle infinite and missing values
    df_processed.replace([float('inf'), float('-inf')], pd.NA, inplace=True)
    df_processed.fillna(0, inplace=True)

    # Stratified downsampling (unless using full data)
    if isinstance(desired_sample_size, str) and desired_sample_size.lower() == 'all':
        df_sampled = df_processed
    else:
        sample_frac = desired_sample_size / len(df_processed)
        df_sampled, _ = train_test_split(
            df_processed,
            test_size=1 - sample_frac,
            stratify=df_processed[target_column],
            random_state=42
        )

    # Separate features (X) and target (y)
    X = df_sampled.drop(columns=[target_column])
    y = df_sampled[target_column]

    return X, y

In [5]:
X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN, 30000)
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN, 'all')

Encode labels (1 for BENIGN, 0 for ATTACK)

In [6]:
def simplify_labels(y):
    return y.apply(lambda x: 'BENIGN' if x.upper() == 'BENIGN' else 'ATTACK')

y_train = simplify_labels(y_train)
y_test = simplify_labels(y_test)

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)


In [7]:
# check mapping
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'ATTACK': 0, 'BENIGN': 1}

Scale features

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# save the standard scaler
#joblib.dump(scaler, 'scaler_semi_supervised.pkl')

# save scaler using pickle
with open('scaler_semi_supervised.pkl', 'wb') as f:
    pickle.dump(scaler, f)

Unlabel data

In [9]:
# First convert y_train to a numpy array if it's not already
y_train_array = y_train.values if hasattr(y_train, 'values') else np.array(y_train)

# Create a copy of y_train with continuous indices
indices = np.arange(len(y_train_array))

# Keep 10% labeled randomly, stratified by class
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)
labeled_idx, unlabeled_idx = next(sss.split(indices, y_train_array))

# Create the semi-supervised target array
y_train_semi = np.full(len(y_train_array), -1)
y_train_semi[labeled_idx] = y_train_array[labeled_idx]

In [13]:
def evaluate_semi_supervised_models(X_train_scaled, y_train, y_train_semi, n_splits=5, random_state=42):
    # Ensure y_train is a NumPy array
    y_train_array = y_train.values if hasattr(y_train, 'values') else np.array(y_train)

    # Get labeled subset
    labeled_indices = np.where(y_train_semi != -1)[0]
    X_labeled = X_train_scaled[labeled_indices]
    y_labeled = y_train_array[labeled_indices]

    # Print class distribution
    unique_classes, counts = np.unique(y_labeled, return_counts=True)
    print("Class distribution in labeled data:")
    for cls, count in zip(unique_classes, counts):
        print(f"Class {cls}: {count} samples")

    # Define model factories
    model_factories = {
        'LabelSpreading': lambda: LabelSpreading(kernel='knn', n_neighbors=10, max_iter=200),
        'LabelPropagation': lambda: LabelPropagation(kernel='knn', n_neighbors=10, max_iter=200),
        'SelfTraining_LR': lambda: SelfTrainingClassifier(LogisticRegression(max_iter=1000, class_weight='balanced')),
        'SelfTraining_RF': lambda: SelfTrainingClassifier(RandomForestClassifier(n_estimators=100, class_weight='balanced'))
    }

    results = {
        'model': [], 'fold': [], 'accuracy': [], 'balanced_accuracy': [],
        'weighted_f1': [], 'macro_f1': [],
        'precision_weighted': [], 'precision_macro': [],
        'recall_weighted': [], 'recall_macro': [],
        'train_time': [], 'predict_time': []
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Now outer loop over models
    for model_name, model_factory in model_factories.items():
        print(f"\n====================")
        print(f"Evaluating {model_name}")
        print("====================")

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_labeled, y_labeled)):
            print(f"  Fold {fold+1}/{n_splits}")

            # Validation fold
            X_val_fold = X_labeled[val_idx]
            y_val_fold = y_labeled[val_idx]

            # Mask labels for semi-supervised training
            y_train_semi_fold = np.full(len(y_train_array), -1)
            labeled_train_indices = labeled_indices[train_idx]
            y_train_semi_fold[labeled_train_indices] = y_train_array[labeled_train_indices]

            # Set class weights for base estimator if supported
            y_train_fold = y_train_array[labeled_train_indices]
            classes = np.unique(y_train_fold)
            class_weights = compute_class_weight('balanced', classes=classes, y=y_train_fold)
            class_weight_dict = dict(zip(classes, class_weights))

            model = model_factory()
            if hasattr(model, "base_estimator") and hasattr(model.base_estimator, 'class_weight'):
                model.base_estimator.class_weight = class_weight_dict

            # Fit on full dataset using semi-supervised labels
            train_start = time.time()
            model.fit(X_train_scaled, y_train_semi_fold)
            train_end = time.time()

            # Predict on validation fold
            predict_start = time.time()
            y_val_pred = model.predict(X_val_fold)
            predict_end = time.time()

            # Evaluation metrics
            acc = accuracy_score(y_val_fold, y_val_pred)
            balanced_acc = balanced_accuracy_score(y_val_fold, y_val_pred)
            f1_weighted = f1_score(y_val_fold, y_val_pred, average='weighted')
            f1_macro = f1_score(y_val_fold, y_val_pred, average='macro')
            precision_weighted = precision_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
            precision_macro = precision_score(y_val_fold, y_val_pred, average='macro', zero_division=0)
            recall_weighted = recall_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
            recall_macro = recall_score(y_val_fold, y_val_pred, average='macro', zero_division=0)

            # Store results
            results['model'].append(model_name)
            results['fold'].append(fold+1)
            results['accuracy'].append(acc)
            results['balanced_accuracy'].append(balanced_acc)
            results['weighted_f1'].append(f1_weighted)
            results['macro_f1'].append(f1_macro)
            results['precision_weighted'].append(precision_weighted)
            results['precision_macro'].append(precision_macro)
            results['recall_weighted'].append(recall_weighted)
            results['recall_macro'].append(recall_macro)
            results['train_time'].append(train_end - train_start)
            results['predict_time'].append(predict_end - predict_start)

            print(f"    Balanced Accuracy: {balanced_acc:.4f}, Macro F1: {f1_macro:.4f}, Macro Precision: {precision_macro:.4f}, Macro Recall: {recall_macro:.4f}")

    return pd.DataFrame(results)


In [14]:
results = evaluate_semi_supervised_models(X_train_scaled, y_train, y_train_semi)

Class distribution in labeled data:
Class 0: 680 samples
Class 1: 2320 samples

Evaluating LabelSpreading
  Fold 1/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9512, Macro F1: 0.9397, Macro Precision: 0.9295, Macro Recall: 0.9512
  Fold 2/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9566, Macro F1: 0.9507, Macro Precision: 0.9452, Macro Recall: 0.9566
  Fold 3/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9622, Macro F1: 0.9471, Macro Precision: 0.9341, Macro Recall: 0.9622
  Fold 4/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9510, Macro F1: 0.9297, Macro Precision: 0.9126, Macro Recall: 0.9510
  Fold 5/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9415, Macro F1: 0.9206, Macro Precision: 0.9039, Macro Recall: 0.9415

Evaluating LabelPropagation
  Fold 1/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9527, Macro F1: 0.9379, Macro Precision: 0.9251, Macro Recall: 0.9527
  Fold 2/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9687, Macro F1: 0.9603, Macro Precision: 0.9526, Macro Recall: 0.9687
  Fold 3/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9653, Macro F1: 0.9433, Macro Precision: 0.9257, Macro Recall: 0.9653
  Fold 4/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9551, Macro F1: 0.9282, Macro Precision: 0.9080, Macro Recall: 0.9551
  Fold 5/5


  probabilities /= normalizer


    Balanced Accuracy: 0.9436, Macro F1: 0.9248, Macro Precision: 0.9094, Macro Recall: 0.9436

Evaluating SelfTraining_LR
  Fold 1/5
    Balanced Accuracy: 0.9353, Macro F1: 0.8956, Macro Precision: 0.8706, Macro Recall: 0.9353
  Fold 2/5
    Balanced Accuracy: 0.9192, Macro F1: 0.8964, Macro Precision: 0.8788, Macro Recall: 0.9192
  Fold 3/5
    Balanced Accuracy: 0.9318, Macro F1: 0.9022, Macro Precision: 0.8811, Macro Recall: 0.9318
  Fold 4/5
    Balanced Accuracy: 0.9271, Macro F1: 0.8769, Macro Precision: 0.8500, Macro Recall: 0.9271
  Fold 5/5
    Balanced Accuracy: 0.9282, Macro F1: 0.8788, Macro Precision: 0.8519, Macro Recall: 0.9282

Evaluating SelfTraining_RF
  Fold 1/5
    Balanced Accuracy: 0.9989, Macro F1: 0.9976, Macro Precision: 0.9964, Macro Recall: 0.9989
  Fold 2/5
    Balanced Accuracy: 0.9926, Macro F1: 0.9952, Macro Precision: 0.9979, Macro Recall: 0.9926
  Fold 3/5
    Balanced Accuracy: 0.9952, Macro F1: 0.9952, Macro Precision: 0.9952, Macro Recall: 0.9952
 

In [16]:
results_agg = results.groupby('model').agg({
    'accuracy': ['mean', 'std'],
    'balanced_accuracy': ['mean', 'std'],
    'weighted_f1': ['mean', 'std'],
    'macro_f1': ['mean', 'std'],
    'train_time': ['mean', 'std'],
    'predict_time': ['mean', 'std'],
    'precision_weighted': ['mean', 'std'],
    'precision_macro': ['mean', 'std'],
    'recall_weighted': ['mean', 'std'],
    'recall_macro': ['mean', 'std']
}).reset_index()
results_agg

Unnamed: 0_level_0,model,accuracy,accuracy,balanced_accuracy,balanced_accuracy,weighted_f1,weighted_f1,macro_f1,macro_f1,train_time,...,predict_time,predict_time,precision_weighted,precision_weighted,precision_macro,precision_macro,recall_weighted,recall_weighted,recall_macro,recall_macro
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,LabelPropagation,0.955333,0.010698,0.957087,0.010062,0.956221,0.010292,0.938903,0.014062,1.313264,...,0.030232,0.002781,0.959026,0.009053,0.924133,0.017974,0.955333,0.010698,0.957087,0.010062
1,LabelSpreading,0.954667,0.009603,0.952497,0.007685,0.955426,0.009174,0.937567,0.012428,0.948976,...,0.030493,0.00973,0.957544,0.007879,0.925063,0.016644,0.954667,0.009603,0.952497,0.007685
2,SelfTraining_LR,0.915667,0.010775,0.928322,0.006008,0.919021,0.00953,0.889984,0.011387,3.952677,...,0.00044,0.000607,0.931104,0.004218,0.866483,0.014705,0.915667,0.010775,0.928322,0.006008
3,SelfTraining_RF,0.995333,0.003206,0.991785,0.006049,0.995323,0.003217,0.993314,0.004604,42.462959,...,0.007232,0.000213,0.995341,0.003218,0.994897,0.003834,0.995333,0.003206,0.991785,0.006049


In [17]:
def get_best_semi_supervised_model(
    results_df,
    X_train_scaled,
    y_train_semi,
    metric='balanced_accuracy'
):
    """
    Selects and trains the best semi-supervised model based on cross-validation results.

    Parameters:
    -----------
    results_df : pd.DataFrame
        Output from evaluate_semi_supervised_models.
    X_train_scaled : np.ndarray
        Scaled feature matrix for training.
    y_train_semi : np.ndarray
        Semi-supervised label array (-1 for unlabeled samples).
    metric : str, default='balanced_accuracy'
        Metric to use for selecting the best model.
    save_path : str or None
        Directory path to save the trained model as a pickle file (optional).

    Returns:
    --------
    tuple: (best_model_name, trained_model, average_score, training_time)
    """
    # Step 1: Validate metric
    valid_metrics = ['accuracy', 'balanced_accuracy', 'weighted_f1', 'macro_f1']
    if metric not in valid_metrics:
        raise ValueError(f"Metric must be one of {valid_metrics}")

    # Step 2: Aggregate performance across folds
    results_agg = results_df.groupby('model').agg({metric: ['mean', 'std']}).reset_index()
    results_agg.columns = ['model', f'{metric}_mean', f'{metric}_std']

    # Step 3: Identify best model
    best_row = results_agg.loc[results_agg[f'{metric}_mean'].idxmax()]
    best_model_name = best_row['model']
    best_model_score = best_row[f'{metric}_mean']
    best_model_std = best_row[f'{metric}_std']

    print(f"\n✅ Best model based on {metric}: {best_model_name}")
    print(f"   Avg {metric}: {best_model_score:.4f} ± {best_model_std:.4f}")

    # Step 4: Instantiate best model
    if best_model_name == 'LabelSpreading':
        best_model = LabelSpreading(kernel='knn', n_neighbors=10, max_iter=200)
    elif best_model_name == 'LabelPropagation':
        best_model = LabelPropagation(kernel='knn', n_neighbors=10, max_iter=200)
    elif best_model_name == 'SelfTraining_LR':
        best_model = SelfTrainingClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
    elif best_model_name == 'SelfTraining_RF':
        best_model = SelfTrainingClassifier(RandomForestClassifier(n_estimators=100, class_weight='balanced'))
    else:
        raise ValueError(f"Unknown model name: {best_model_name}")

    # Step 5: Train on full dataset
    print(f"\n🚀 Training {best_model_name} on full dataset ({len(X_train_scaled)} samples)...")
    start_time = time.time()
    best_model.fit(X_train_scaled, y_train_semi)
    training_time = time.time() - start_time
    print(f"   ✅ Training complete in {training_time:.2f} seconds")

    return best_model_name, best_model, best_model_score, training_time


In [12]:
ls_model = SelfTrainingClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
ls_model.fit(X_train_scaled, y_train_semi)
y_test_pred = ls_model.predict(X_test_scaled)

# get test performance
test_acc = accuracy_score(y_test, y_test_pred)
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_recall_weighted = recall_score(y_test, y_test_pred, average='weighted')
test_precision_weighted = precision_score(y_test, y_test_pred, average='weighted')
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Weighted F1: {test_f1_weighted:.4f}")
print(f"Test Weighted Recall: {test_recall_weighted:.4f}")
print(f"Test Weighted Precision: {test_precision_weighted:.4f}")


Test Accuracy: 0.9335
Test Weighted F1: 0.9354
Test Weighted Recall: 0.9335
Test Weighted Precision: 0.9419


In [18]:
# Get the best model
best_model_name, best_model, best_model_score, train_time = get_best_semi_supervised_model(results, X_train_scaled, y_train_semi, 'weighted_f1')

# Save best model with name
joblib.dump(best_model, f'{best_model_name}_semi_supervised.pkl')


✅ Best model based on weighted_f1: SelfTraining_RF
   Avg weighted_f1: 0.9953 ± 0.0032

🚀 Training SelfTraining_RF on full dataset (30000 samples)...
   ✅ Training complete in 49.07 seconds


['SelfTraining_RF_semi_supervised.pkl']

In [96]:
def evaluate_on_test_data(best_model, best_model_name, X_test_scaled, y_test_encoded, label_encoder=None):
    """
    Evaluates a trained semi-supervised model on test data and prints metrics in a detailed format.

    Parameters:
    -----------
    best_model : sklearn estimator
        Trained semi-supervised model (e.g., SelfTrainingClassifier).
    X_test_scaled : np.ndarray
        Scaled test features.
    y_test_encoded : np.ndarray
        Encoded true labels for the test set.
    label_encoder : sklearn.preprocessing.LabelEncoder, optional
        Used to decode class labels for readable classification report.
    
    Returns:
    --------
    dict : Dictionary of evaluation results including accuracy, precision, recall, F1, classification report, and confusion matrix.
    """
    print(f"\nEvaluating {best_model_name} on test data...")
    
    start_time = time.time()
    y_pred_encoded = best_model.predict(X_test_scaled)
    prediction_time = time.time() - start_time

    # Decode predictions for readable report (optional)
    if label_encoder:
        try:
            target_names = label_encoder.classes_
        except AttributeError:
            target_names = None
    else:
        target_names = None

    # Compute metrics
    accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
    precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)

    # Generate classification report
    try:
        report = classification_report(y_test_encoded, y_pred_encoded, target_names=target_names, zero_division=0)
        conf_matrix = confusion_matrix(y_test_encoded, y_pred_encoded)

        print(f"\nAccuracy: {accuracy:.4f}")
        print(f"Precision (Weighted): {precision:.4f}")
        print(f"Recall (Weighted): {recall:.4f}")
        print(f"F1-Score (Weighted): {f1:.4f}")
        print(f"Prediction Time (s): {prediction_time:.4f}")
        print("\nClassification Report:")
        print(report)
        print("\nConfusion Matrix:")
        print(conf_matrix)
    except ValueError as e:
        print("Could not generate full classification report due to missing classes in test set.")
        report = "Unavailable"
        conf_matrix = "Unavailable"
        print(f"Error details: {e}")
        print(f"\nAccuracy: {accuracy:.4f}")

    return {
        "Accuracy": accuracy,
        "Precision (Weighted)": precision,
        "Recall (Weighted)": recall,
        "F1-Score (Weighted)": f1,
        "Classification Report": report,
        "Confusion Matrix": conf_matrix,
        "Prediction Time (s)": prediction_time
    }


In [97]:
# Evaluate the best model on test data
test_results = evaluate_on_test_data(best_model, best_model_name, X_test_scaled, y_test, label_encoder)
# Save test results to a file
test_results_df = pd.DataFrame([test_results])
test_results_df.to_csv(f'{best_model_name}_test_results.csv', index=False)


Evaluating SelfTraining_RF on test data...

Accuracy: 0.9932
Precision (Weighted): 0.9933
Recall (Weighted): 0.9932
F1-Score (Weighted): 0.9932
Prediction Time (s): 4.2466

Classification Report:
              precision    recall  f1-score   support

      ATTACK       1.00      0.97      0.98    145426
      BENIGN       0.99      1.00      1.00    494269

    accuracy                           0.99    639695
   macro avg       0.99      0.99      0.99    639695
weighted avg       0.99      0.99      0.99    639695


Confusion Matrix:
[[141421   4005]
 [   316 493953]]


In [80]:
def optimize_self_training_rf(X_train_scaled, y_train_semi, cv=5):
    """
    Optimize SelfTraining RandomForest using grid search with cross-validation.
    
    Parameters:
    -----------
    X_train_scaled : numpy.ndarray
        Scaled training features
    y_train_semi : numpy.ndarray
        Semi-supervised labels for training (-1 for unlabeled samples)
    cv : int, default=5
        Number of cross-validation folds
        
    Returns:
    --------
    tuple: (best_model, best_params, cv_results)
        Best model, best parameters, and cross-validation results
    """
    
    
    # Get labeled data indices for stratified CV
    labeled_indices = np.where(y_train_semi != -1)[0]
    y_labeled = y_train_semi[labeled_indices]
    
    # Create stratified k-fold for semi-supervised scenario
    cv_splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    
    # Define the parameter grid
    param_grid = {
        'base_estimator__n_estimators': [100, 200],
        'base_estimator__max_depth': [None, 20],
        'base_estimator__min_samples_split': [2, 5],
        'base_estimator__min_samples_leaf': [1, 2],
        'threshold': [0.7, 0.9]  # Self-training confidence threshold
    }
    
    # Create base RF classifier
    base_rf = RandomForestClassifier(
        class_weight='balanced', 
        random_state=42
    )
    
    # Create self-training classifier
    self_training_model = SelfTrainingClassifier(
        base_rf,
        verbose=False
    )
    
    # Define the grid search
    grid_search = GridSearchCV(
        estimator=self_training_model,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=cv_splitter.split(X_train_scaled[labeled_indices], y_labeled),
        verbose=1,
        n_jobs=-1  # Use all available cores
    )
    
    print("Starting grid search for SelfTraining_RF...")
    print("This may take some time...")
    
    # Prepare a mask for CV that only evaluates on labeled data
    # but still allows the model to use unlabeled data for self-training
    # Create train/test splits based on labeled data only, but fit on all data
    
    # Custom fit for semi-supervised grid search
    best_score = -1
    best_model = None
    best_params = None
    cv_results = []
    
    # Manual grid search approach tailored for semi-supervised learning
    for params in ParameterGrid(param_grid):
        fold_scores = []
        
        for train_idx, val_idx in cv_splitter.split(X_train_scaled[labeled_indices], y_labeled):
            # Map indices back to original dataset indices
            labeled_train_indices = labeled_indices[train_idx]
            labeled_val_indices = labeled_indices[val_idx]
            
            # Create semi-supervised labels for this fold
            y_fold = np.full_like(y_train_semi, -1)
            y_fold[labeled_train_indices] = y_train_semi[labeled_train_indices]
            
            # Configure model with current parameters
            base_est = RandomForestClassifier(
                n_estimators=params['base_estimator__n_estimators'],
                max_depth=params['base_estimator__max_depth'],
                min_samples_split=params['base_estimator__min_samples_split'],
                min_samples_leaf=params['base_estimator__min_samples_leaf'],
                class_weight='balanced',
                random_state=42
            )
            
            model = SelfTrainingClassifier(
                base_est,
                threshold=params['threshold'],
                verbose=False
            )
            
            # Fit model on the fold's semi-supervised data
            model.fit(X_train_scaled, y_fold)
            
            # Evaluate on the validation set (labeled data only)
            y_val_pred = model.predict(X_train_scaled[labeled_val_indices])
            score = balanced_accuracy_score(
                y_train_semi[labeled_val_indices], 
                y_val_pred
            )
            
            fold_scores.append(score)
        
        # Calculate average score across folds
        mean_score = np.mean(fold_scores)
        std_score = np.std(fold_scores)
        
        # Save result
        cv_results.append({
            'params': params,
            'mean_score': mean_score,
            'std_score': std_score
        })
        
        print(f"Params: {params}")
        print(f"Score: {mean_score:.4f} ± {std_score:.4f}")
        
        # Update best if improved
        if mean_score > best_score:
            best_score = mean_score
            best_params = params
            
    # Sort results by mean score
    cv_results = sorted(cv_results, key=lambda x: x['mean_score'], reverse=True)
    
    # Create the best model with optimal parameters
    best_base_est = RandomForestClassifier(
        n_estimators=best_params['base_estimator__n_estimators'],
        max_depth=best_params['base_estimator__max_depth'],
        min_samples_split=best_params['base_estimator__min_samples_split'],
        min_samples_leaf=best_params['base_estimator__min_samples_leaf'],
        class_weight='balanced',
        random_state=42
    )
    
    best_model = SelfTrainingClassifier(
        best_base_est,
        threshold=best_params['threshold'],
        verbose=False
    )
    
    # Fit on the full training dataset
    best_model.fit(X_train_scaled, y_train_semi)
    
    print("\nBest parameters:")
    for param, value in best_params.items():
        print(f"{param}: {value}")
    
    print(f"\nBest cross-validation score: {best_score:.4f}")
    
    return best_model, best_params, cv_results

In [83]:
# Get best model using optimization
best_model_optimized, best_params_optimized, cv_results_optimized = optimize_self_training_rf(X_train_scaled, y_train_semi)
# save best model optimized
joblib.dump(best_model_optimized, 'best_semi_supervised_model_optimized.pkl')

['best_semi_supervised_model_optimized.pkl']