Setup

In [21]:
import gc
import json
import multiprocessing
import os
import pickle
import warnings
from datetime import datetime
from pathlib import Path

# Installed before the third-party imports so import-time warnings stay suppressed.
warnings.filterwarnings('ignore')

# Wildcard kept first among third-party imports so it cannot shadow the explicit names below.
from sklearn.metrics import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, OneClassSVM
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

In [2]:
SEED = 42
np.random.seed(SEED)
# NOTE(review): N_JOBS is only reported here; no other cell visible in this
# notebook reads it (the estimators and grid searches set their own n_jobs).
N_JOBS = multiprocessing.cpu_count()
print(f"Using {N_JOBS} CPU cores for parallel processing")

# The project root can be overridden with the LOG_ANOMALY_ROOT environment
# variable; when it is unset the original hard-coded location is used.
ROOT = Path(os.environ.get("LOG_ANOMALY_ROOT", r"C:\Computer Science\AIMLDL\log-anomaly-detection"))
FEATURES_PATH = ROOT / "features"
RESULTS_PATH = ROOT / "results" / "cross_source_transfer" / "ml_models"
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only point FEATURES_PATH at files produced by this project's own pipeline.
with open(FEATURES_PATH / "hybrid_features.pkl", 'rb') as f:
    data = pickle.load(f)['hybrid_features_data']
with open(FEATURES_PATH / "cross_source_splits.pkl", 'rb') as f:
    splits = pickle.load(f)['splits']

print(f"Loaded {len(data)} sources, {len(splits)} experiments")

Using 20 CPU cores for parallel processing
Loaded 6 sources, 6 experiments


Model Definitions

In [3]:
# Supervised classifiers to evaluate. Each entry maps a short model key to:
#   'model'  - an unfitted estimator instance used as the template, and
#   'params' - the hyperparameter grid handed to GridSearchCV.
# The grids are deliberately small (at most four combinations per model).
models_config = {
    'nb': {
        'model': GaussianNB(),
        'params': {'var_smoothing': [1e-9, 1e-7]}
    },
    'lr': {
        'model': LogisticRegression(random_state=SEED, max_iter=1000),
        'params': {'C': [0.1, 1.0], 'penalty': ['l2'], 'solver': ['lbfgs']}
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
    },
    'svm': {
        # probability=True so predict_proba is available when scoring.
        'model': SVC(random_state=SEED, probability=True),
        'params': {'C': [0.1, 1], 'kernel': ['rbf'], 'gamma': ['scale']}
    },
    'dt': {
        'model': DecisionTreeClassifier(random_state=SEED),
        'params': {'max_depth': [5, 10], 'min_samples_split': [2, 5]}
    },
    'rf': {
        'model': RandomForestClassifier(random_state=SEED, n_jobs=2),
        'params': {'n_estimators': [50, 100], 'max_depth': [10], 'min_samples_split': [2]}
    },
    'gb': {
        'model': GradientBoostingClassifier(random_state=SEED),
        'params': {'n_estimators': [50], 'learning_rate': [0.1], 'max_depth': [3, 5]}
    },
    'xgb': {
        'model': XGBClassifier(random_state=SEED, eval_metric='logloss', n_jobs=2, tree_method='hist'),
        'params': {'n_estimators': [50], 'learning_rate': [0.1], 'max_depth': [3, 5]}
    }
}

# Unsupervised anomaly detectors. Same entry layout as models_config, but every
# 'params' grid is empty, so these models are fitted with their constructor
# arguments and no hyperparameter search.
unsupervised_models_config = {
    'iso': {
        'model': IsolationForest(random_state=SEED, contamination='auto', n_jobs=2),
        'params': {}
    },
    'ocsvm': {
        'model': OneClassSVM(kernel='rbf', gamma='scale'),
        'params': {}
    },
    'lof': {
        # novelty=True: the model is later asked to predict on held-out test data.
        'model': LocalOutlierFactor(contamination='auto', novelty=True, n_jobs=2),
        'params': {}
    }
}

In [4]:
def calc_metrics(y_true, y_pred, y_proba=None):
    """Compute binary-classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels, with 1 as the positive class and 0 as the
            negative class.
        y_pred: Predicted labels, same length as ``y_true``.
        y_proba: Optional continuous scores where a higher value means
            "more likely positive". Used only for the ranking metrics.

    Returns:
        dict with keys 'f1', 'mcc', 'acc' and 'bal_acc'. 'roc' and 'pr' are
        added when ``y_proba`` is given, ``y_true`` contains both classes and
        the ranking metrics can be computed. If the core metrics cannot be
        computed at all, the failure is printed and all four core metrics are
        returned as 0.
    """
    try:
        f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        mcc = matthews_corrcoef(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)

        # Pin the label set so the matrix is always 2x2. Without it, inputs
        # that contain a single class yield a 1x1 matrix and every count was
        # previously forced to 0, reporting bal_acc = 0 for perfect predictions.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        # Balanced accuracy is the mean recall over the classes that actually
        # occur in y_true; a class with no true samples has no defined recall.
        recalls = []
        if (tp + fn) > 0:
            recalls.append(tp / (tp + fn))  # sensitivity
        if (tn + fp) > 0:
            recalls.append(tn / (tn + fp))  # specificity
        bal_acc = sum(recalls) / len(recalls) if recalls else 0

        metrics = {'f1': f1, 'mcc': mcc, 'acc': acc, 'bal_acc': bal_acc}

        # Ranking metrics need scores and both classes in the ground truth.
        if y_proba is not None and len(np.unique(y_true)) > 1:
            try:
                roc = roc_auc_score(y_true, y_proba)
                p, r, _ = precision_recall_curve(y_true, y_proba)
                pr = auc(r, p)
                metrics.update({'roc': roc, 'pr': pr})
            except ValueError:
                # Scores unusable (e.g. NaN): leave 'roc'/'pr' out of the result.
                pass

        return metrics
    except Exception as e:
        # Best-effort fallback kept for callers, but report the cause instead of
        # hiding it. print is used because warnings are globally suppressed.
        print(f"calc_metrics failed ({type(e).__name__}: {e}); returning zero metrics")
        return {'f1': 0, 'mcc': 0, 'acc': 0, 'bal_acc': 0}

Training and Hyperparameter Tuning

In [5]:
def train_supervised_models(X_train, y_train, X_test, y_test, models_config):
    """Train every supervised model with a small grid search and score it.

    Features are standardised with a scaler fitted on the training data only.
    Each model is tuned with stratified cross-validation (scoring='f1') and
    then evaluated on the scaled test data with ``calc_metrics``.

    Args:
        X_train: 2-D training feature array.
        y_train: Training labels; must contain at least two classes.
        X_test: 2-D test feature array with the same number of columns.
        y_test: Test labels.
        models_config: Mapping of model key to {'model': estimator,
            'params': grid dict}. An empty grid means "fit as configured".

    Returns:
        (results, scaler). ``results`` maps each model key to either
        {'metrics', 'best_params', 'best_score'} or {'error': message} when
        that model failed. ``scaler`` is the fitted StandardScaler.

    Raises:
        ValueError: on empty training or test data, mismatched feature
            dimensions, or single-class training labels.
    """
    if len(X_train) == 0 or len(y_train) == 0:
        raise ValueError("Empty training data")

    # Fail early with a clear message instead of an error from inside the scaler.
    if len(X_test) == 0 or len(y_test) == 0:
        raise ValueError("Empty test data")

    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError(f"Feature dimension mismatch: train={X_train.shape[1]}, test={X_test.shape[1]}")

    if len(np.unique(y_train)) < 2:
        raise ValueError("Training data must have at least 2 classes")

    # Scale features (statistics come from the training data only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # At most 3 folds, but never fewer than the 2 that StratifiedKFold needs;
    # previously fewer than 20 training samples made every model fail.
    n_splits = max(2, min(3, len(y_train) // 10))

    results = {}

    for name, config in models_config.items():
        try:
            if len(config['params']) > 0:
                cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

                grid = GridSearchCV(
                    config['model'], config['params'],
                    cv=cv, scoring='f1', n_jobs=1, verbose=0
                )
                grid.fit(X_train_scaled, y_train)
                best_model = grid.best_estimator_
                best_params = grid.best_params_
                best_score = grid.best_score_
                # Drop the search object (and its CV bookkeeping) right away.
                del grid
            else:
                # No hyperparameters to tune. Fit a clone so the shared template
                # in models_config is never left in a fitted state.
                best_model = clone(config['model'])
                best_model.fit(X_train_scaled, y_train)
                best_params = {}
                best_score = 0  # no cross-validation score exists in this branch

            y_pred = best_model.predict(X_test_scaled)

            # Get probabilities, falling back to a min-max scaled decision function
            y_proba = None
            try:
                y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
            except AttributeError:
                try:
                    y_proba = best_model.decision_function(X_test_scaled)
                    # 1e-10 guards against division by zero for constant scores.
                    y_proba = (y_proba - y_proba.min()) / (y_proba.max() - y_proba.min() + 1e-10)
                except AttributeError:
                    pass

            metrics = calc_metrics(y_test, y_pred, y_proba)

            results[name] = {
                'metrics': metrics,
                'best_params': best_params,
                'best_score': best_score
            }

            # Free memory
            del best_model
            gc.collect()

        except Exception as e:
            # One failing model must not abort the others; record why it failed.
            results[name] = {'error': str(e)}

    return results, scaler

def train_unsupervised_models(X_train, X_test, y_test, unsupervised_models_config, scaler):
    """Fit every unsupervised detector on the training data and score it.

    Args:
        X_train: Training feature array. Labels are not used for fitting.
        X_test: Test feature array.
        y_test: Test labels, used only for scoring.
        unsupervised_models_config: Mapping of model key to {'model': estimator, ...}.
        scaler: Already-fitted transformer exposing ``transform``; applied to
            both feature arrays.

    Returns:
        dict mapping each model key to {'metrics': ...} or {'error': message}
        when that model failed.

    NOTE(review): the detectors are fitted on all of X_train. If that data
    contains anomalies, it may hurt the novelty detectors — confirm whether
    fitting on normal samples only was intended.
    """
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    for name, config in unsupervised_models_config.items():
        try:
            # Fit a fresh clone: fitting config['model'] directly would leave
            # fitted state inside the shared config dict and make the `del`
            # below a no-op, because the dict would still hold the object.
            model = clone(config['model'])
            model.fit(X_train_scaled)

            y_pred = model.predict(X_test_scaled)

            # Get anomaly scores; negated so that a higher score means more anomalous
            if hasattr(model, 'score_samples'):
                scores = -model.score_samples(X_test_scaled)
            elif hasattr(model, 'decision_function'):
                scores = -model.decision_function(X_test_scaled)
            else:
                scores = None

            # Predictions of -1 are mapped to label 1; everything else to 0.
            y_pred_binary = (y_pred == -1).astype(int)

            metrics = calc_metrics(y_test, y_pred_binary, scores)
            results[name] = {'metrics': metrics}

            # Free memory
            del model
            gc.collect()

        except Exception as e:
            # One failing detector must not abort the others; record why it failed.
            results[name] = {'error': str(e)}

    return results

In [6]:
def _extract_features(source_data, feature_type):
    """Return one source's feature matrix for ``feature_type``, or None if unavailable."""
    if feature_type == 'bert':
        return source_data.get('bert_only')
    if feature_type == 'hybrid':
        # Tolerate a missing or None 'hybrid_variants' entry instead of raising.
        variants = source_data.get('hybrid_variants') or {}
        return variants.get('bert_embedding_concat')
    return None


def run_single_experiment(split, feature_type):
    """Run one cross-source transfer experiment.

    Trains on the concatenated features of ``split['train_sources']`` and
    evaluates on ``split['test_source']``, using the module-level ``data``,
    ``models_config`` and ``unsupervised_models_config``.

    Args:
        split: dict with 'test_source' and 'train_sources' keys.
        feature_type: 'bert' or 'hybrid'; any other value skips the experiment.

    Returns:
        (exp_name, result), where exp_name is "<test_source>_<feature_type>".
        Returns (None, None) when the experiment cannot be run: missing labels
        or features, no usable training source, single-class training labels,
        or any exception (which is printed).
    """
    try:
        test_src = split['test_source']
        train_srcs = split['train_sources']

        # Get test data
        test_data = data[test_src]
        if test_data['labels'] is None:
            return None, None

        X_test = _extract_features(test_data, feature_type)
        if X_test is None:
            return None, None
        y_test = test_data['labels']

        # Collect training data from every source that has labels and features;
        # a source without them is skipped rather than aborting the experiment.
        X_train_list, y_train_list = [], []
        for source in train_srcs:
            source_data = data[source]
            if source_data['labels'] is None:
                continue
            features = _extract_features(source_data, feature_type)
            if features is None:
                continue
            X_train_list.append(features)
            y_train_list.append(source_data['labels'])

        if not X_train_list:
            return None, None

        # Combine training data
        X_train = np.vstack(X_train_list)
        y_train = np.concatenate(y_train_list)

        # Supervised training needs both classes present
        if len(np.unique(y_train)) < 2:
            return None, None

        # Train models; the unsupervised run reuses the scaler fitted on X_train
        sup_results, scaler = train_supervised_models(X_train, y_train, X_test, y_test, models_config)
        unsup_results = train_unsupervised_models(X_train, X_test, y_test, unsupervised_models_config, scaler)

        exp_name = f"{test_src}_{feature_type}"
        result = {
            'supervised': sup_results,
            'unsupervised': unsup_results,
            'test_samples': len(y_test),
            'train_samples': len(y_train),
            'anomaly_rate': float(np.mean(y_test)),
            'train_sources': train_srcs
        }

        # Free memory
        del X_train, y_train, X_test, y_test, scaler
        gc.collect()

        return exp_name, result

    except Exception as e:
        # Best-effort: report the failure and let the caller move on.
        print(f"Error in experiment: {str(e)}")
        return None, None

In [7]:
# Each split is evaluated once per feature representation, 'bert' first.
experiment_configs = [
    (split_cfg, feat)
    for split_cfg in splits
    for feat in ('bert', 'hybrid')
]

print(f"\nPrepared {len(experiment_configs)} experiment configurations")

# Maps "<test_source>_<feature_type>" to that experiment's result dict;
# experiments that could not be run are left out.
all_results = {}

progress = tqdm(experiment_configs, desc="Experiments")
for split, feature_type in progress:
    outcome = run_single_experiment(split, feature_type)
    exp_name, result = outcome
    if exp_name is not None:
        all_results[exp_name] = result

    # Release per-experiment arrays before starting the next run.
    gc.collect()

print(f"\nCompleted {len(all_results)} experiments successfully")



Prepared 12 experiment configurations


Experiments: 100%|██████████| 12/12 [2:40:32<00:00, 802.68s/it] 


Completed 12 experiments successfully





Analysis

In [8]:
# Flatten the nested experiment results into one row per (experiment, model).
result_columns = [
    'experiment', 'test_source', 'feature_type', 'model_type', 'model',
    'f1', 'mcc', 'roc', 'pr', 'bal_acc', 'acc',
    'best_params', 'train_samples', 'test_samples', 'anomaly_rate',
]
metric_names = ('f1', 'mcc', 'roc', 'pr', 'bal_acc', 'acc')

results_list = []

for exp_name, exp_data in all_results.items():
    # Experiment names are "<test_source>_<feature_type>". Split on the LAST
    # underscore so a source name that itself contains '_' stays intact.
    test_source, _sep, feature_type = exp_name.rpartition('_')

    for model_type in ('supervised', 'unsupervised'):
        for model_name, model_data in exp_data[model_type].items():
            # Models that failed carry only an 'error' entry and have no metrics.
            if 'error' in model_data:
                continue
            metrics = model_data['metrics']

            row = {
                'experiment': exp_name,
                'test_source': test_source,
                'feature_type': feature_type,
                'model_type': model_type,
                'model': model_name,
            }
            # NOTE: a metric that could not be computed (e.g. 'roc'/'pr' without
            # scores) is recorded as 0, which lowers the averages taken later.
            row.update({metric: metrics.get(metric, 0) for metric in metric_names})
            # Only supervised models go through a hyperparameter search.
            row['best_params'] = (
                str(model_data.get('best_params', {})) if model_type == 'supervised' else 'N/A'
            )
            row['train_samples'] = exp_data['train_samples']
            row['test_samples'] = exp_data['test_samples']
            row['anomaly_rate'] = exp_data['anomaly_rate']
            results_list.append(row)

# Explicit columns keep the frame's schema stable even when no model succeeded.
df = pd.DataFrame(results_list, columns=result_columns)

In [9]:
# Metrics shared by every summary table below.
core_metrics = ['f1', 'pr', 'mcc', 'bal_acc']

# Ten best individual (model, test source, feature type) rows by F1.
print("Top 10 Models by F1 Score:")
top_models = df.nlargest(10, 'f1')[['model', 'test_source', 'feature_type'] + core_metrics]
print(top_models.round(3).to_string(index=False))

# Mean over all experiments for each model, best mean F1 first.
print("Average Performance by Model:")
per_model_mean = df.groupby('model')[core_metrics + ['roc']].mean()
avg_perf = per_model_mean.sort_values('f1', ascending=False).round(3)
print(avg_perf)

# Mean per feature representation.
print("Average Performance by Feature Type:")
per_feature_mean = df.groupby('feature_type')[core_metrics].mean()
feat_perf = per_feature_mean.round(3)
print(feat_perf)

# Mean per learning paradigm (supervised vs unsupervised).
print("Average Performance by Model Type:")
per_type_mean = df.groupby('model_type')[core_metrics].mean()
type_perf = per_type_mean.round(3)
print(type_perf)

Top 10 Models by F1 Score:
model test_source feature_type    f1    pr    mcc  bal_acc
   lr      Apache         bert 0.990 0.990  0.986    0.990
  svm     OpenSSH         bert 0.989 0.997  0.949    0.980
  xgb     OpenSSH         bert 0.968 0.999  0.873    0.968
   lr     OpenSSH         bert 0.958 0.987  0.826    0.940
   gb     OpenSSH         bert 0.916 0.970  0.726    0.917
   rf     OpenSSH         bert 0.915 0.984  0.724    0.917
  lof         BGL       hybrid 0.862 0.757  0.185    0.531
  lof     OpenSSH       hybrid 0.861 0.671 -0.077    0.483
  lof         BGL         bert 0.857 0.789  0.026    0.501
ocsvm         BGL         bert 0.853 0.819  0.089    0.518
Average Performance by Model:
          f1     pr    mcc  bal_acc    roc
model                                     
lr     0.510  0.645  0.232    0.626  0.691
lof    0.496  0.441 -0.074    0.473  0.451
svm    0.468  0.665  0.166    0.595  0.692
ocsvm  0.435  0.389 -0.204    0.406  0.341
xgb    0.399  0.611  0.220    0.629 

In [10]:
# Flat per-model table as CSV.
csv_file = RESULTS_PATH / "ml_results.csv"
df.to_csv(csv_file, index=False)
print(f"Saved CSV to: {csv_file}")

# Raw nested results plus every summary table, bundled in one pickle.
pickle_file = RESULTS_PATH / "ml_results.pkl"
with open(pickle_file, 'wb') as out_file:
    results_bundle = {
        'results': all_results,
        'summary': df,
        'top_models': top_models,
        'avg_performance': avg_perf,
        'feature_performance': feat_perf,
        'type_performance': type_perf
    }
    pickle.dump(results_bundle, out_file)
print(f"Saved pickle to: {pickle_file}")

Saved CSV to: C:\Computer Science\AIMLDL\log-anomaly-detection\results\cross_source_transfer\ml_models\ml_results.csv
Saved pickle to: C:\Computer Science\AIMLDL\log-anomaly-detection\results\cross_source_transfer\ml_models\ml_results.pkl


Deployment

In [12]:
# Pick the single results row with the highest F1 score.
# NOTE(review): this ranks on one (model, test source) test score, not on an
# average across sources — confirm this is the intended selection rule.
best_idx = df['f1'].idxmax()
best_row = df.loc[best_idx]
best_model_name, best_feature_type, best_test_source = (
    best_row[key] for key in ('model', 'feature_type', 'test_source')
)

summary_lines = [
    "Best Model Configuration:",
    f"  Model: {best_model_name}",
    f"  Feature Type: {best_feature_type}",
    f"  Test Source: {best_test_source}",
    f"  F1 Score: {best_row['f1']:.3f}",
    f"  MCC: {best_row['mcc']:.3f}",
    f"  Balanced Accuracy: {best_row['bal_acc']:.3f}",
]
print("\n".join(summary_lines))

Best Model Configuration:
  Model: lr
  Feature Type: bert
  Test Source: Apache
  F1 Score: 0.990
  MCC: 0.986
  Balanced Accuracy: 0.990


In [None]:
# Every source that has labels contributes to the final training set.
all_train_sources = [name for name, entry in data.items() if entry['labels'] is not None]
print(f"Training sources: {', '.join(all_train_sources)}")

X_train_list, y_train_list = [], []
for source in all_train_sources:
    entry = data[source]
    if best_feature_type == 'bert':
        # Skip sources that have no BERT features.
        if 'bert_only' not in entry or entry['bert_only'] is None:
            continue
        features = entry['bert_only']
    else:  # hybrid
        # Skip sources that lack the concatenated hybrid variant.
        if 'hybrid_variants' not in entry or 'bert_embedding_concat' not in entry['hybrid_variants']:
            continue
        features = entry['hybrid_variants']['bert_embedding_concat']
    X_train_list.append(features)
    y_train_list.append(entry['labels'])

# Stack the per-source arrays into one training matrix and one label vector.
X_train_all = np.vstack(X_train_list)
y_train_all = np.concatenate(y_train_list)

print(f"Total training samples: {len(y_train_all):,}")
print(f"Feature dimensions: {X_train_all.shape[1]}")
print(f"Anomaly rate: {np.mean(y_train_all)*100:.2f}%")

Training sources: Apache, BGL, HPC, OpenSSH, Proxifier, Zookeeper
Total training samples: 12,000
Feature dimensions: 768
Anomaly rate: 40.73%


In [16]:
# Refit the selected model type on the full training set assembled above.
# clone, GridSearchCV and StratifiedKFold come from the notebook's top import
# cell rather than being imported again in the middle of this cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_all)

print(f"Training final {best_model_name} model")
# A model key found in models_config is supervised; otherwise it must be one of
# the unsupervised detectors.
is_supervised = best_model_name in models_config
if is_supervised:
    model_config = models_config[best_model_name]
else:
    model_config = unsupervised_models_config[best_model_name]

best_params = {}
if is_supervised and len(model_config['params']) > 0:
    # Supervised model with a grid: tune with 5-fold stratified CV on all data.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    grid = GridSearchCV(
        model_config['model'],
        model_config['params'],
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train_scaled, y_train_all)
    final_model = grid.best_estimator_
    best_params = grid.best_params_
    print(f"Best parameters: {best_params}")
else:
    # Nothing to tune: fit a clone so the shared config template stays unfitted.
    final_model = clone(model_config['model'])
    if is_supervised:
        final_model.fit(X_train_scaled, y_train_all)
    else:
        final_model.fit(X_train_scaled)

Training final lr model
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}


In [17]:
# Sanity-check the final model on the source where the selected configuration
# scored best. The final model was trained on every labelled source, this one
# included, so these scores are in-sample and do NOT estimate generalisation.
validation_source = best_test_source
if validation_source in data and data[validation_source]['labels'] is not None:
    if best_feature_type == 'bert':
        X_val = data[validation_source]['bert_only']
    else:
        X_val = data[validation_source]['hybrid_variants']['bert_embedding_concat']

    y_val = data[validation_source]['labels']
    X_val_scaled = scaler.transform(X_val)

    y_pred = final_model.predict(X_val_scaled)
    if not is_supervised:
        # Detector predictions of -1 are mapped to label 1; everything else to 0.
        y_pred = (y_pred == -1).astype(int)

    val_metrics = calc_metrics(y_val, y_pred)
    # Worded to make clear this is not a held-out validation score.
    print(f"In-sample check on {validation_source} (this source was part of the training data):")
    print(f"  F1: {val_metrics['f1']:.3f}")
    print(f"  MCC: {val_metrics['mcc']:.3f}")
    print(f"  Balanced Accuracy: {val_metrics['bal_acc']:.3f}")
else:
    # Say so explicitly instead of silently printing nothing.
    print(f"Skipping in-sample check: no labelled data for {validation_source}")

Validation on Apache:
  F1: 1.000
  MCC: 1.000
  Balanced Accuracy: 1.000


In [22]:
deployment_path = RESULTS_PATH / "deployment"
deployment_path.mkdir(exist_ok=True)
model_file = deployment_path / "best_classifier.pkl"

# Scores of the selected configuration, taken from its results row.
# 'roc' and 'pr' fall back to 0 when the row does not carry them.
selected_metrics = {
    **{key: float(best_row[key]) for key in ('f1', 'mcc', 'bal_acc', 'acc')},
    **{key: float(best_row.get(key, 0)) for key in ('roc', 'pr')},
}

# Summary of the data the final model was fitted on.
training_summary = {
    'n_samples': len(y_train_all),
    'n_features': X_train_all.shape[1],
    'n_sources': len(all_train_sources),
    'sources': all_train_sources,
    'anomaly_rate': float(np.mean(y_train_all))
}

# Bundle the fitted model with the scaler and settings needed to apply it.
deployment_data = {
    'model': final_model,
    'scaler': scaler,
    'feature_type': best_feature_type,
    'model_name': best_model_name,
    'is_supervised': is_supervised,
    'best_params': best_params,
    'metrics': selected_metrics,
    'training_info': training_summary,
    'timestamp': datetime.now().isoformat()
}

with open(model_file, 'wb') as out_file:
    pickle.dump(deployment_data, out_file)

size_mb = model_file.stat().st_size / (1024*1024)
print(f"Deployment model saved to: {model_file}")
print(f"File size: {size_mb:.2f} MB")

Deployment model saved to: C:\Computer Science\AIMLDL\log-anomaly-detection\results\cross_source_transfer\ml_models\deployment\best_classifier.pkl
File size: 0.02 MB


In [23]:
# Human-readable companion to the pickled model. json is imported in the
# notebook's top import cell rather than in the middle of the notebook.
metadata = {
    'model_name': best_model_name,
    'feature_type': best_feature_type,
    'is_supervised': is_supervised,
    # Stored as a string so the value is always JSON-serialisable.
    'best_params': str(best_params),
    'metrics': deployment_data['metrics'],
    'training_info': deployment_data['training_info'],
    'timestamp': deployment_data['timestamp']
}

metadata_file = deployment_path / "model_metadata.json"
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to: {metadata_file}")

Metadata saved to: C:\Computer Science\AIMLDL\log-anomaly-detection\results\cross_source_transfer\ml_models\deployment\model_metadata.json
