In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution so edits to imported .py files are picked up automatically.
%load_ext autoreload
%autoreload 2

# Default (width, height) in inches shared by the wide figures in this notebook.
figsize=(14, 4)

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.datasets import make_classification
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, plot_importance

### Set up synthetic data

In [None]:
# Build a reproducible synthetic binary-classification dataset:
# 1000 rows x 10 features (6 informative, 2 redundant, none repeated),
# with 3% of labels randomly flipped as noise.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=6,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    flip_y=0.03,
    class_sep=1.0,
    random_state=42,
)

# Name the columns feature_0 ... feature_{n-1} and append the label as "target".
feature_names = [f"feature_{col_idx}" for col_idx in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

df.head()

### Visualize pairplot

In [None]:
# Feature-only view of the data (label column removed); reused by the EDA cells below.
features = df.drop(columns=["target"])

# Pairwise scatter plots with per-feature distributions on the diagonal.
sns.pairplot(data=features)
plt.show()

### Visualize correlations

In [None]:
# Pairwise Pearson correlation between features, drawn as an annotated heatmap.
corr = features.corr(method="pearson")

corr_fig, corr_ax = plt.subplots(figsize=figsize)
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=corr_ax)
plt.show()

### Visualize histograms

In [None]:
# One 20-bin histogram per feature, arranged on a 3x4 grid of subplots.
features.hist(
    layout=(3, 4),
    figsize=figsize,
    bins=20,
)
# Tighten spacing on the figure that DataFrame.hist just created.
plt.gcf().tight_layout()
plt.show()

### Boxplot

In [None]:
# Side-by-side boxplots, one per feature column, to compare spread and outliers.
box_fig, box_ax = plt.subplots(figsize=figsize)
sns.boxplot(data=features, ax=box_ax)
# Tilt the feature names so neighbouring tick labels do not overlap.
plt.xticks(rotation=45)
plt.show()

### t-SNE Visualization

In [None]:
# Project the features to 2-D with t-SNE to eyeball how separable the classes are.
# TSNE and StandardScaler are imported in the notebook's top import cell.

# Standardize first so every feature contributes on the same scale to the
# pairwise distances t-SNE is built on.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
features_tsne = tsne.fit_transform(features_scaled)

plt.figure(figsize=figsize)
# `y` is in the same row order as `features` (both come from the same
# make_classification call), so it can colour the embedded points directly.
scatter = plt.scatter(features_tsne[:, 0], features_tsne[:, 1], c=y, cmap="viridis", alpha=0.7)
plt.colorbar(scatter, label="Target")
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.title("t-SNE projection of standardized features")
plt.show()

### Split into train and test data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["target"]),
    df["target"],
    test_size=0.2,
    random_state=42,
)

### Train model

In [None]:
# Gradient-boosted tree classifier: 100 trees of depth <= 4, learning rate 0.1,
# seeded for reproducibility and evaluated with log-loss.
#
# `use_label_encoder` is intentionally not passed: the parameter is deprecated
# in XGBoost, and current releases warn that it is "not used". The labels here
# are already the integers 0/1, so no label encoding is needed.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)
model.fit(X_train, y_train)

### Test model

In [None]:
# Hard class predictions on the held-out set, scored by plain accuracy.
preds = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=preds)
print(f"Accuracy: {acc:.3f}")


### Inspect predictions

In [None]:
# Line up true and predicted labels per test row (indexed like y_test) and
# flag the rows where the model got it wrong.
results = pd.DataFrame({"y_true": y_test, "y_predicted": preds}).assign(
    mismatch=lambda frame: frame["y_true"].ne(frame["y_predicted"])
)

results.head()

### Confusion matrix

In [None]:
# Confusion matrix on the test set: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, preds)

cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # counts are integers
    cmap="Blues",
    xticklabels=["Pred 0", "Pred 1"],
    yticklabels=["Actual 0", "Actual 1"],
)
cm_ax.set(xlabel="Predicted", ylabel="Actual")
plt.show()

### ROC curve

In [None]:
# Column 1 of predict_proba: the predicted probability of the positive class.
y_probs = model.predict_proba(X_test)[:, 1]

# Sweep the decision threshold to trace the ROC curve, then integrate for AUC.
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
# Diagonal reference line: the expected curve of a random classifier.
roc_ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Classifier')

roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)

plt.show()

### Feature importance

In [None]:
# Bar chart of XGBoost's built-in feature importance for the fitted model
# (plot_importance defaults to importance_type="weight", i.e. how often a
# feature is used to split).
plot_importance(model)
# Render the figure explicitly, like the other plotting cells, so the cell
# does not echo the returned Axes repr as its output.
plt.show()

### SHAP summary plot

In [None]:
# Build a SHAP explainer for the fitted model and compute per-row, per-feature
# attributions on the test set (returned as a shap Explanation object).
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Summary plot: one point per (row, feature) SHAP value, coloured by the feature's value.
shap.summary_plot(shap_values, features=X_test)

### SHAP bar plot

In [None]:
# Same attributions as the summary plot, drawn as a bar chart of mean |SHAP value| per feature.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)

### SHAP dependence plot

In [None]:
# How feature_7's value relates to its SHAP contribution across the test rows.
# dependence_plot expects the raw attribution array, hence `.values` on the Explanation.
shap.dependence_plot(
    "feature_7",
    shap_values=shap_values.values,
    features=X_test,
)