In [5]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Read the full dataset from disk.
data = pd.read_csv("dfdataWeek11.csv")

# 'outcome' is the label column; every other column is used as a feature.
y = data['outcome']
X = data.drop(columns='outcome')

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training-set sizes to benchmark: 100, 1000, ..., 1000000 (powers of ten).
sizes = [10 ** exponent for exponent in range(2, 7)]
results = []

# Function to sample dataset and evaluate
def evaluate_model(size, random_state=42):
    """Benchmark an XGBoost classifier trained on a subsample of the training split.

    Draws ``size`` rows without replacement from the module-level
    ``X_train``/``y_train`` (or uses all of them when fewer are available),
    runs 5-fold cross-validation on that sample, refits on the whole sample,
    and scores the refit model on the module-level ``X_test``/``y_test``.

    Args:
        size: Requested number of training rows.
        random_state: Seed used for both the row subsample and the model, so
            repeated runs give the same sample and the same scores. Pass
            ``None`` for non-deterministic behaviour.

    Returns:
        dict with keys:
            ``size`` - the requested size (not necessarily the rows used),
            ``cv_score`` / ``cv_std`` - mean and standard deviation of the
                cross-validation accuracies,
            ``test_accuracy`` - accuracy of the refit model on the test split,
            ``fit_time`` / ``cv_time`` - elapsed seconds for the single fit
                and for the whole cross-validation,
            ``sample_size`` - number of rows actually used for training.
    """
    # Check if we have enough data for the requested size
    if size > len(X_train):
        print(f"Not enough data for size {size}, using all available data: {len(X_train)}")
        X_sample = X_train
        y_sample = y_train
    else:
        # Use a local, seeded generator rather than NumPy's global RNG so the
        # subsample does not depend on whatever random calls ran earlier.
        rng = np.random.default_rng(random_state)
        indices = rng.choice(len(X_train), size=size, replace=False)
        X_sample = X_train.iloc[indices]
        y_sample = y_train.iloc[indices]

    # Create and configure the model
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=100,
        max_depth=5,
        random_state=random_state
    )

    # Measure cross-validation time and score. perf_counter() is a monotonic,
    # high-resolution clock, so intervals are not distorted by system clock
    # adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    cv_scores = cross_val_score(model, X_sample, y_sample, cv=5, scoring='accuracy')
    cv_time = time.perf_counter() - start_time

    # Fit the model on the full sample
    start_time = time.perf_counter()
    model.fit(X_sample, y_sample)
    fit_time = time.perf_counter() - start_time

    # Evaluate on test set
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    return {
        'size': size,
        'cv_score': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'test_accuracy': test_accuracy,
        'fit_time': fit_time,
        'cv_time': cv_time,
        'sample_size': len(X_sample)
    }

# Run evaluation for each size
for size in sizes:
    # Safety valve: anything above 100 million rows is skipped by default.
    if size > 100_000_000:
        print(f"Size {size} is very large, skipping. Remove this check if you want to run it.")
        continue

    print(f"Evaluating model with dataset size: {size}")
    result = evaluate_model(size)
    results.append(result)

    # One metric per line, followed by a blank separator line.
    print(
        f"  Test accuracy: {result['test_accuracy']:.4f}",
        f"  CV score: {result['cv_score']:.4f} ± {result['cv_std']:.4f}",
        f"  Fit time: {result['fit_time']:.2f} seconds",
        f"  CV time: {result['cv_time']:.2f} seconds",
        sep="\n",
    )
    print()

# Collect every run into one table and show the headline columns.
results_df = pd.DataFrame(results)
summary_columns = ['size', 'test_accuracy', 'cv_score', 'fit_time', 'cv_time']
print(results_df[summary_columns])

Evaluating model with dataset size: 100
  Test accuracy: 0.9064
  CV score: 0.8900 ± 0.0374
  Fit time: 0.03 seconds
  CV time: 0.22 seconds

Evaluating model with dataset size: 1000
  Test accuracy: 0.9525
  CV score: 0.9400 ± 0.0164
  Fit time: 0.07 seconds
  CV time: 0.38 seconds

Evaluating model with dataset size: 10000
  Test accuracy: 0.9713
  CV score: 0.9735 ± 0.0023
  Fit time: 0.16 seconds
  CV time: 0.80 seconds

Evaluating model with dataset size: 100000
  Test accuracy: 0.9826
  CV score: 0.9822 ± 0.0009
  Fit time: 1.64 seconds
  CV time: 3.45 seconds

Evaluating model with dataset size: 1000000
  Test accuracy: 0.9846
  CV score: 0.9846 ± 0.0003
  Fit time: 9.51 seconds
  CV time: 39.41 seconds

      size  test_accuracy  cv_score  fit_time    cv_time
0      100       0.906384  0.890000  0.031617   0.220047
1     1000       0.952546  0.940000  0.065960   0.376083
2    10000       0.971261  0.973500  0.157059   0.804156
3   100000       0.982619  0.982180  1.635043   3.4