In [3]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 660.6 kB/s eta 0:03:47
   ---------------------------------------- 0.1/150.0 MB 1.2 MB/s eta 0:02:07
   ---------------------------------------- 0.5/150.0 MB 3.8 MB/s eta 0:00:40
   ---------------------------------------- 1.1/150.0 MB 5.7 MB/s eta 0:00:26
   ---------------------------------------- 1.6/150.0 MB 6.9 MB/s eta 0:00:22
    --------------------------------------- 2.2/150.0 MB 7.7 MB/s eta 0:00:20
    --------------------------------------- 2.7/150.0 MB 8.2 MB/s eta 0:00:18
    --------------------------------------- 3.3/150.0 MB 8.8 MB/s eta 0:00:17
   - -------------------------------------- 3.9/150.0 MB 9.1 MB/s eta 0:00:17
   - -------------------------------------- 4.4/150.0 MB 9.4 MB/s eta 0:00:1

In [21]:
# XGBoost Comparison in Python
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Load and prepare the Pima Indians Diabetes dataset
# This is equivalent to the R code:
# data("PimaIndiansDiabetes2")
# ds <- as.data.frame(na.omit(PimaIndiansDiabetes2))
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
column_names = ['pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass', 'pedigree', 'age', 'diabetes']
ds = pd.read_csv(url, names=column_names)
# Convert the outcome to binary (0,1 for negative/positive)
ds['diabetes'] = ds['diabetes'].astype(int)

# Extract predictor names
prednames = ds.columns.tolist()[:-1]  # all columns except 'diabetes'

# Fit a logistic regression model to obtain a parametric equation
# This is equivalent to the R code:
# logmodel <- glm(diabetes ~ ., data = ds, family = "binomial")
X = ds[prednames]
y = ds['diabetes']
logmodel = LogisticRegression(max_iter=10000)
logmodel.fit(X, y)

# Extract coefficients
# In R: cfs <- coefficients(logmodel)
coef = np.append(logmodel.intercept_, logmodel.coef_[0])

# Function to generate bootstrapped data of specific size
def generate_bootstrapped_data(size):
    """Generate bootstrapped data using the same approach as in the Week 11 R code"""
    # Create dataframe to store sampled predictors
    dfdata = pd.DataFrame()
    
    # Sample each predictor with replacement
    # This is equivalent to the R code:
    # dfdata <- map_dfc(prednames, function(nm){ sample(ds$nm, size = sz, replace = T) })
    for col in prednames:
        dfdata[col] = np.random.choice(ds[col], size=size, replace=True)
    
    # Compute the logit values
    # In R: pvec <- map((1:8), function(pnum){ cfs[pnum+1] * eval(parse(text = paste0("dfdata$", prednames[pnum]))) }) %>% reduce(`+`) + cfs[1]
    logit = coef[0]  # intercept
    for i, col in enumerate(prednames):
        logit += coef[i+1] * dfdata[col]
    
    # Calculate probabilities and convert to binary outcome
    # In R: dfdata['outcome'] <- ifelse(1/(1 + exp(-(pvec))) > 0.5, 1, 0)
    prob = 1 / (1 + np.exp(-logit))
    dfdata['outcome'] = (prob > 0.5).astype(int)
    
    return dfdata[prednames], dfdata['outcome']

# Function to evaluate XGBoost with scikit-learn and 5-fold CV
def evaluate_xgboost_sklearn(size):
    print(f"Processing dataset size: {size}")
    # Generate bootstrapped data
    X, y = generate_bootstrapped_data(size)
    
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create XGBoost classifier
    model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    
    # Measure time for fitting the model with 5-fold CV
    start_time = time.time()
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    
    # Fit the final model on the training data
    model.fit(X_train, y_train)
    
    # Calculate time taken
    time_taken = time.time() - start_time
    
    # Evaluate on test data
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"  Completed: Accuracy = {accuracy:.4f}, Time = {time_taken:.4f} seconds")
    return accuracy, time_taken

# Execute the comparison across different dataset sizes
sizes = [100, 1000, 10000, 100000, 1000000, 10000000]
results = []

for size in sizes:
    try:
        accuracy, time_taken = evaluate_xgboost_sklearn(size)
        results.append({
            'Method': 'XGBoost in Python via scikit-learn and 5-fold CV',
            'Dataset size': size,
            'Testing-set predictive performance': round(accuracy, 4),
            'Time taken for the model to be fit': round(time_taken, 4)
        })
    except Exception as e:
        print(f"Error with size {size}: {str(e)}")
        results.append({
            'Method': 'XGBoost in Python via scikit-learn and 5-fold CV',
            'Dataset size': size,
            'Testing-set predictive performance': 'Error',
            'Time taken for the model to be fit': 'Error'
        })

# Display results as a table
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df.to_string(index=False))

# Save results to CSV
results_df.to_csv("xgboost_python_results.csv", index=False)

Processing dataset size: 100
  Completed: Accuracy = 0.8000, Time = 3.2738 seconds
Processing dataset size: 1000
  Completed: Accuracy = 0.9100, Time = 1.2084 seconds
Processing dataset size: 10000
  Completed: Accuracy = 0.9630, Time = 1.5408 seconds
Processing dataset size: 100000
  Completed: Accuracy = 0.9786, Time = 3.0505 seconds
Processing dataset size: 1000000
  Completed: Accuracy = 0.9856, Time = 17.0820 seconds
Processing dataset size: 10000000
Error with size 10000000: Unable to allocate 76.3 MiB for an array with shape (10000000,) and data type float64

Final Results:
                                          Method  Dataset size Testing-set predictive performance Time taken for the model to be fit
XGBoost in Python via scikit-learn and 5-fold CV           100                                0.8                             3.2738
XGBoost in Python via scikit-learn and 5-fold CV          1000                               0.91                             1.2084
XGBoost in Py