In [None]:
%load_ext autoreload
%autoreload 2

# Shared matplotlib figure sizes as (width, height): a compact default and a
# larger canvas for multi-panel figures.
figsize, figsize_large = (14, 4), (14, 10)

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb

### Set up synthetic data

In [None]:
# Synthetic regression problem: 1000 samples with 20 features, of which 15 are
# informative, a single target with no bias term, and Gaussian noise (std 20).
# The fixed random_state makes the generated data reproducible.
X, y = make_regression(
    n_samples=1000, n_features=20, n_informative=15,
    n_targets=1, bias=0.0, noise=20,
    random_state=42,
)

# Quick sanity check of the target range.
print(y.min(), y.max())

### Split into train/test set

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train model

In [None]:
# Gradient-boosted regressor: 100 trees, max depth 5, learning rate 0.1, fixed seed.
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Score the fitted model on the held-out split and report one metric per line.
preds = model.predict(X_test)
print(
    f"R² Score: {r2_score(y_test, preds):.4f}",
    f"RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.4f}",
    f"MAE: {mean_absolute_error(y_test, preds):.4f}",
    sep="\n",
)

### Visualizations
1. Predictions vs. actual
2. Residuals
3. Feature importances

In [None]:
# Diagnostic 2x2 figure for the fitted model:
#   [0,0] predictions vs. actual   [0,1] residuals vs. predictions
#   [1,0] feature importances      [1,1] learning curve (needs evaluation history)
fig, axes = plt.subplots(2,2, figsize=figsize_large)

# --- Predictions vs. actual: points on the dashed diagonal are predicted exactly.
axes[0,0].scatter(y_test, preds, alpha=0.6, edgecolors='k')
axes[0,0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],'r--', lw=2, label='Perfect Prediction')
axes[0,0].set_xlabel("Actual values", fontsize=11)
axes[0,0].set_ylabel("Predicted values", fontsize=11)
axes[0,0].set_title("Predictions vs. Actual", fontsize=12, fontweight='bold')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# --- Residuals (actual - predicted) against the prediction; dashed line marks zero error.
residuals = y_test - preds
axes[0,1].scatter(preds, residuals, alpha=0.6, edgecolors='k')
axes[0,1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[0,1].set_xlabel("Predicted values", fontsize=11)
axes[0,1].set_ylabel("Residuals", fontsize=11)
axes[0,1].set_title("Residual plot", fontsize=12, fontweight='bold')
axes[0,1].grid(True, alpha=0.3)

# --- Feature importances: argsort is ascending, so the largest bar ends up at the top.
feature_importances = model.feature_importances_
# Clamp to the number of features so bar positions and labels always line up
# (a fixed 15 would make barh fail on models with fewer features).
top_n = min(15, len(feature_importances))
sorted_idx = np.argsort(feature_importances)[-top_n:]
axes[1,0].barh(range(top_n), feature_importances[sorted_idx])
axes[1,0].set_yticks(range(top_n))
axes[1,0].set_yticklabels([f"Feature_{i}" for i in sorted_idx])
axes[1,0].set_xlabel("Importance", fontsize=11)
axes[1,0].set_title("Feature importances", fontsize=12, fontweight='bold')
axes[1,0].grid(True, alpha=0.3)

# --- Learning curve. Evaluation history only exists when the model was fitted
# with an eval_set, e.g. model.fit(X_train, y_train, eval_set=[(X_test, y_test)]).
try:
    eval_history = model.evals_result()
except (xgb.core.XGBoostError, AttributeError):
    # fit() was called without an eval_set, so nothing was recorded.
    eval_history = {}
rmse_per_round = eval_history.get('validation_0', {}).get('rmse')

axes[1,1].set_title("Learning curve", fontsize=12, fontweight='bold')
if rmse_per_round:
    axes[1,1].plot(range(len(rmse_per_round)), rmse_per_round, label="Validation")
    axes[1,1].set_xlabel("Boosting round", fontsize=11)
    axes[1,1].set_ylabel("RMSE", fontsize=11)
    axes[1,1].legend()
    axes[1,1].grid(True, alpha=0.3)
else:
    # Show an explanatory note instead of leaving an empty frame in the grid.
    axes[1,1].text(
        0.5, 0.5,
        "No evaluation history available.\nFit the model with an eval_set to plot the learning curve.",
        ha='center', va='center', transform=axes[1,1].transAxes,
    )
    axes[1,1].set_axis_off()

fig.tight_layout()
plt.show()
