# Reproducibility & Evaluation Check
This notebook demonstrates the ML pipeline used in the Endpoint Sentinel system.
It covers data generation, feature extraction, training, and evaluation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import sys
import os

# Add project root to path
sys.path.append(os.path.abspath('..'))
from ml.train import generate_synthetic_data
from ml.models import RiskClassifier

## 1. Data Generation
We generate 10,000 samples of synthetic endpoint telemetry.

In [None]:
# Build the synthetic endpoint-telemetry frame that later cells reuse as `df`.
df = generate_synthetic_data(n_samples=10_000)

# Report the frame's dimensions and how many rows fall under each label.
print(f"Dataset shape: {df.shape}")
label_counts = df["label"].value_counts()
print(label_counts)

# Keep the preview as the final expression so the notebook renders it.
df.head()

## 2. Model Training (XGBoost)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining columns, which act as features.
y = df["label"]
X = df.drop(columns=["label"])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

clf = RiskClassifier()
# Note: the real class saves to file; for this demo the wrapped estimator
# (clf.model) is fitted directly instead.
clf.model.fit(X_train, y_train)
print("Training complete.")

## 3. Evaluation

In [None]:
# Predict labels for the held-out rows with the fitted estimator.
y_pred = clf.model.predict(X_test)

# Build the text report first, then print it so it renders with line breaks.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Feature Importance
# `xgb` is not bound anywhere else in this notebook, so import it here before
# use; without this the cell raises NameError on a fresh kernel.
import xgboost as xgb

# Assumes clf.model is an XGBoost estimator or Booster, as the
# "Model Training (XGBoost)" section suggests — TODO confirm against ml.models.
xgb.plot_importance(clf.model)
plt.show()