# Linear Functions

In [1]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import numpy as np

# Synthetic binary-classification problem (1000 samples, 20 features) with a fixed seed
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit the random forest in one step (fit() returns the estimator itself)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Keep only column 1 of predict_proba: the probability of the positive class
y_probs = model.predict_proba(X_test)[:, 1]

# Point estimate of the ROC AUC on the held-out split
auc_score = roc_auc_score(y_test, y_probs)

# Bootstrap the AUC: returns the percentile confidence bounds together with
# the individual resampled scores so the caller can visualize them.
def bootstrap_auc(y_true, y_probs, n_bootstraps=1000, random_state=42):
    """Estimate a 95% percentile-bootstrap confidence interval for the ROC AUC.

    The (label, score) pairs are resampled with replacement ``n_bootstraps``
    times; the AUC is computed on each resample and the 2.5th / 97.5th
    percentiles of those scores form the interval.

    Parameters
    ----------
    y_true : array-like
        True labels. Lists, NumPy arrays and pandas Series are accepted.
    y_probs : array-like
        Predicted scores for the positive class, same length as ``y_true``.
    n_bootstraps : int, default=1000
        Number of resamples to draw. Resamples containing a single class are
        skipped, so fewer scores than this may be returned.
    random_state : int, default=42
        Seed for the ``numpy.random.RandomState`` used to draw the resamples.

    Returns
    -------
    confidence_lower : float
        2.5th percentile of the bootstrapped AUC scores.
    confidence_upper : float
        97.5th percentile of the bootstrapped AUC scores.
    bootstrapped_scores : list of float
        AUC of every resample that contained both classes.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or no resample contained
        both classes (so no AUC could be computed).
    """
    # Coerce to ndarrays so the fancy indexing below is positional: a plain
    # list would raise TypeError and a pandas Series would index by label.
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)

    n_samples = len(y_probs)
    if len(y_true) != n_samples:
        raise ValueError(
            f"y_true and y_probs must have the same length "
            f"(got {len(y_true)} and {n_samples})"
        )
    if n_samples == 0:
        raise ValueError("cannot bootstrap an empty sample")

    rng = np.random.RandomState(random_state)
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # Draw n_samples row positions with replacement
        indices = rng.randint(0, n_samples, n_samples)
        resampled_true = y_true[indices]
        # AUC is undefined when a resample contains only one class; skip it
        if len(np.unique(resampled_true)) < 2:
            continue
        bootstrapped_scores.append(
            roc_auc_score(resampled_true, y_probs[indices])
        )

    # Without at least one valid resample there is nothing to take
    # percentiles of; fail with an explicit message instead.
    if not bootstrapped_scores:
        raise ValueError(
            "no bootstrap resample contained both classes; "
            "cannot compute a confidence interval"
        )

    # Percentile method: the central 95% of the bootstrap distribution
    scores = np.asarray(bootstrapped_scores)
    confidence_lower = np.percentile(scores, 2.5)
    confidence_upper = np.percentile(scores, 97.5)

    return confidence_lower, confidence_upper, bootstrapped_scores

# Bootstrap the held-out predictions: interval bounds plus the individual resampled AUCs
conf_int_lower, conf_int_upper, bootstrapped_scores = bootstrap_auc(
    y_test, y_probs
)

# Report the point estimate alongside its interval (one line each)
print(
    f"AUC score: {auc_score:.3f}",
    f"95% Confidence interval for the AUC score: [{conf_int_lower:.3f}, {conf_int_upper:.3f}]",
    sep="\n",
)

# Histogram of the bootstrap distribution, with its title and axis labels
plt.hist(bootstrapped_scores, bins=50, alpha=0.75)
plt.title('Distribution of Bootstrapped AUC Scores')
plt.xlabel('AUC Score')
plt.ylabel('Frequency')

# Overlay the point estimate (red) and the two interval bounds (green)
plt.axvline(auc_score, color='red', linestyle='--', label='AUC Score')
plt.axvline(
    conf_int_lower, color='green', linestyle='--', label='2.5th percentile'
)
plt.axvline(
    conf_int_upper, color='green', linestyle='--', label='97.5th percentile'
)

plt.legend(loc='upper left')
plt.show()
