In [3]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 660.6 kB/s eta 0:03:47
   ---------------------------------------- 0.1/150.0 MB 1.2 MB/s eta 0:02:07
   ---------------------------------------- 0.5/150.0 MB 3.8 MB/s eta 0:00:40
   ---------------------------------------- 1.1/150.0 MB 5.7 MB/s eta 0:00:26
   ---------------------------------------- 1.6/150.0 MB 6.9 MB/s eta 0:00:22
    --------------------------------------- 2.2/150.0 MB 7.7 MB/s eta 0:00:20
    --------------------------------------- 2.7/150.0 MB 8.2 MB/s eta 0:00:18
    --------------------------------------- 3.3/150.0 MB 8.8 MB/s eta 0:00:17
   - -------------------------------------- 3.9/150.0 MB 9.1 MB/s eta 0:00:17
   - -------------------------------------- 4.4/150.0 MB 9.4 MB/s eta 0:00:1

In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
import time


# Read the patient records and preview the first five rows.
dfdata = pd.read_csv("patientdata.csv")
dfdata.head(n=5)

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,outcome
0,11,136,52,18,215,34.7,2.288,25.0,1.0
1,5,79,78,34,130,32.7,0.654,34.0,0.0
2,3,186,86,33,50,19.4,0.661,26.0,1.0
3,0,102,58,30,75,33.3,0.439,24.0,0.0
4,0,128,100,33,185,39.3,0.115,55.0,0.0


In [26]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import time

# Load the dataset, discard every row that has a missing value, and store the
# outcome label as an integer -- all in one chain so the cell is idempotent.
# NOTE(review): if the CSV carries a saved row-index column it will be used as
# a feature downstream -- confirm against the file's actual columns.
dfdata = (
    pd.read_csv("patientdata.csv")
    .dropna()
    .astype({"outcome": int})
)

# Define function to generate bootstrapped dataset
def generate_bootstrapped_data(original_data, size, random_state=None):
    """
    Generate a bootstrapped dataset of the requested size.

    Steps:
    1. Fit an XGBoost classifier on the full original data.
    2. Resample the feature rows with replacement.
    3. Label every resampled row with the fitted model's prediction
       (class 1 when the predicted probability is above 0.5).

    Parameters
    ----------
    original_data : pandas.DataFrame
        Must contain an 'outcome' column; every other column is used as a
        feature.
    size : int
        Number of rows to draw.
    random_state : int or None, optional
        Seed for the row sampling. With None (the default) the rows are drawn
        from NumPy's global generator, so ``np.random.seed`` controls them.

    Returns
    -------
    X_bootstrap : pandas.DataFrame
        Resampled feature rows with a fresh 0..size-1 index.
    y_bootstrap : numpy.ndarray
        Integer 0/1 labels predicted by the model for those rows.

    Notes
    -----
    The labels are model predictions, not the observed outcomes, so any score
    computed on the returned data measures agreement with this model rather
    than with the original labels.
    """
    target_col = 'outcome'
    X_orig = original_data.drop(columns=[target_col])
    y_orig = original_data[target_col]

    # Fit base model to original data.
    # `use_label_encoder` is intentionally not passed: recent XGBoost releases
    # no longer accept it and only emit an "unused parameter" warning.
    model = xgb.XGBClassifier(eval_metric='logloss', objective='binary:logistic', random_state=42)
    model.fit(X_orig, y_orig)

    # Bootstrap sampling: a dedicated generator when a seed is supplied,
    # otherwise NumPy's global one (the historical behaviour).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = rng.choice(len(X_orig), size=size, replace=True)
    X_bootstrap = X_orig.iloc[indices].reset_index(drop=True)

    # Predict outcomes: column 1 of predict_proba is the positive class.
    y_pred_prob = model.predict_proba(X_bootstrap)[:, 1]
    y_bootstrap = (y_pred_prob > 0.5).astype(int)

    return X_bootstrap, y_bootstrap

# Evaluation function
def evaluate_model(X, y):
    """
    Score an XGBoost classifier on (X, y) with shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn's cross_val_score.
    y : labels aligned with the rows of X.

    Returns
    -------
    (accuracy, elapsed) : tuple
        Mean accuracy over the five folds, and the seconds spent on the whole
        cross-validation run (all five fits plus scoring, not a single fit).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; unlike time.time() it is not affected by wall-clock changes.
    start = time.perf_counter()
    # `use_label_encoder` is intentionally not passed: recent XGBoost releases
    # no longer accept it and only emit an "unused parameter" warning.
    model = xgb.XGBClassifier(eval_metric='logloss', objective='binary:logistic', random_state=42)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    elapsed = time.perf_counter() - start
    return fold_scores.mean(), elapsed

# Dataset sizes to test
dataset_sizes = [100, 1000, 10000, 100000, 1000000, 10000000]

# Label shared by every row of the results table.
method_label = "XGBoost in Python via scikit-learn and 5-fold CV"

# Seed NumPy's global generator so the bootstrap draws made inside
# generate_bootstrapped_data are identical on every Restart & Run All.
np.random.seed(42)

# Run evaluations
results = []

for size in dataset_sizes:
    try:
        Xb, yb = generate_bootstrapped_data(dfdata, size)
        acc, t = evaluate_model(Xb, yb)
        performance, elapsed = round(acc, 4), round(t, 2)
        print(f"✅ Size {size}: Acc={acc:.4f}, Time={t:.2f}s")
    except Exception as e:
        # Deliberate best-effort: a size that fails (for example by running
        # out of memory) is reported and recorded as NaN so the remaining
        # sizes still run and the table keeps one row per size.
        print(f"❌ Size {size} failed: {e}")
        performance, elapsed = np.nan, np.nan

    # Note: the time column covers the full 5-fold cross-validation run.
    results.append({
        "Method used": method_label,
        "Dataset size": size,
        "Testing-set predictive performance": performance,
        "Time taken for the model to be fit": elapsed
    })

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("xgboost_python_results.csv", index=False)

# Display table
print("\nFinal Results:")
print(results_df.to_string(index=False))


✅ Size 100: Acc=0.9200, Time=1.40s
✅ Size 1000: Acc=0.9480, Time=2.28s
✅ Size 10000: Acc=0.9772, Time=3.07s
✅ Size 100000: Acc=0.9869, Time=13119.65s
✅ Size 1000000: Acc=0.9915, Time=23.09s
✅ Size 10000000: Acc=0.9936, Time=117.95s

Final Results:
                                     Method used  Dataset size  Testing-set predictive performance  Time taken for the model to be fit
XGBoost in Python via scikit-learn and 5-fold CV           100                              0.9200                                1.40
XGBoost in Python via scikit-learn and 5-fold CV          1000                              0.9480                                2.28
XGBoost in Python via scikit-learn and 5-fold CV         10000                              0.9772                                3.07
XGBoost in Python via scikit-learn and 5-fold CV        100000                              0.9869                            13119.65
XGBoost in Python via scikit-learn and 5-fold CV       1000000               