# Test notebook

In [1]:
!pip install xgboost



In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from collections import Counter

# Binary classification
The breast cancer dataset is a classic binary classification dataset in the sklearn dataset library.

In [3]:
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=94)

# Use "hist" for constructing the trees, with early stopping enabled.
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
# Fit the model, test sets are used for early stopping.
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
# Save model into JSON format.
clf.save_model("clf.json")

[0]	validation_0-logloss:0.45997
[1]	validation_0-logloss:0.34184
[2]	validation_0-logloss:0.27076
[3]	validation_0-logloss:0.22399
[4]	validation_0-logloss:0.19346
[5]	validation_0-logloss:0.16814
[6]	validation_0-logloss:0.15393
[7]	validation_0-logloss:0.14081
[8]	validation_0-logloss:0.13269
[9]	validation_0-logloss:0.12515
[10]	validation_0-logloss:0.11551
[11]	validation_0-logloss:0.11184
[12]	validation_0-logloss:0.10799
[13]	validation_0-logloss:0.10541
[14]	validation_0-logloss:0.10493
[15]	validation_0-logloss:0.10326
[16]	validation_0-logloss:0.10300
[17]	validation_0-logloss:0.10339
[18]	validation_0-logloss:0.10160
[19]	validation_0-logloss:0.09892
[20]	validation_0-logloss:0.09478
[21]	validation_0-logloss:0.09359
[22]	validation_0-logloss:0.09247
[23]	validation_0-logloss:0.09284


# Binary Classification 
The cleveland heart disease dataset is a binary classification dataset found on xgboosting.com. It has more features than the above dataset and uses a grid search for hyperparameter tuning.

In [4]:
from sklearn.datasets import fetch_openml

# Load the Cleveland Heart Disease dataset (classification dataset)
X, y = fetch_openml("heart-disease", return_X_y=True, target_column='target', as_frame=True)

# Mark missing as nan
X = X.fillna(value=np.nan)

# Convert target to integers
y = y.astype('int')

# Print key information about the dataset
print(f"Dataset shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")
print(f"Class distributions: {Counter(y)}")

# Retrieve values
X = X.values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define parameter grid
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Create XGBClassifier
model = XGBClassifier(objective='binary:logistic', random_state=42, n_jobs=1)

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best score and parameters
print(f"Best score: {grid_search.best_score_:.3f}")
print(f"Best parameters: {grid_search.best_params_}")

# Access best model
best_model = grid_search.best_estimator_

# Save best model
best_model.save_model('best_model_heart_disease.ubj')

# Load saved model
loaded_model = XGBClassifier()
loaded_model.load_model('best_model_heart_disease.ubj')

# Use loaded model for predictions
predictions = loaded_model.predict(X_test)

# Print accuracy score
accuracy = loaded_model.score(X_test, y_test)
print(f"Accuracy: {accuracy:.3f}")

Dataset shape: (303, 13)
Features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
Class distributions: Counter({1: 165, 0: 138})


  warn(


Best score: 0.839
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.836


# Multiclass classification
The fetch_covtype dataset (found on xgboosting.com) contains data on forest cover types from four wilderness areas in the Roosevelt National Forest of northern Colorado. Useful example in case we go beyond binary classification.

In [6]:
from sklearn.datasets import fetch_covtype

# Load the Covertype dataset
covtype = fetch_covtype()
X, y = covtype.data, covtype.target

# Ensure class numbers start at 0
y = y - 1

# Print key information about the dataset
print(f"Dataset shape: {X.shape}")
print(f"Classes: {np.unique(y)}")
print(f"Class Distributions: {Counter(y)}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define parameter grid
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Create XGBClassifier
model = XGBClassifier(objective='multi:softmax', random_state=42, n_jobs=1)

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best score and parameters
print(f"Best score: {grid_search.best_score_:.3f}")
print(f"Best parameters: {grid_search.best_params_}")

# Access best model
best_model = grid_search.best_estimator_

# Save best model
best_model.save_model('best_model_covtype.ubj')

# Load saved model
loaded_model = XGBClassifier()
loaded_model.load_model('best_model_covtype.ubj')

# Use loaded model for predictions
predictions = loaded_model.predict(X_test)

# Print accuracy score
accuracy = loaded_model.score(X_test, y_test)
print(f"Accuracy: {accuracy:.3f}")

Dataset shape: (581012, 54)
Classes: [0 1 2 3 4 5 6]
Class Distributions: Counter({1: 283301, 0: 211840, 2: 35754, 6: 20510, 5: 17367, 4: 9493, 3: 2747})




KeyboardInterrupt: 

# Creating a toy dataset

In [5]:
# Let's create a toy dataset
n_samples = 1000  # number of objects
n_features = 5   # H(mag), e, a, Peri., Incl.

# Generate dataset with make_classification
X, y = make_classification(
    n_samples=n_samples, 
    n_features=n_features,
    n_informative=5,      # all 5 features are informative
    n_redundant=0,        # no redundant features
    n_clusters_per_class=1,
    weights=[0.6, 0.4],   # balance between NEO (1) and non-NEO (0)
    class_sep=1.5,        # separation between classes
    random_state=42
)

# Convert to DataFrame for feature naming
df = pd.DataFrame(X, columns=['H', 'e', 'a', 'Peri.', 'Incl.'])
df['label'] = y

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[['H', 'e', 'a', 'Peri.', 'Incl.']], df['label'], test_size=0.3, random_state=42)

# Initialize XGBoost classifier
xgb_clf = XGBClassifier(eval_metric='logloss')

# Train the model
xgb_clf.fit(X_train, y_train)

# Make predictions
y_pred = xgb_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9866666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       190
           1       0.99      0.97      0.98       110

    accuracy                           0.99       300
   macro avg       0.99      0.98      0.99       300
weighted avg       0.99      0.99      0.99       300



# Add random noise and set ranges for features

In [6]:
# Parameters 
n_samples = 1000
H_range = (15, 25)         # Typical range for Absolute Magnitude (H)
e_range = (0.0, 0.9)       # Typical range for Eccentricity (e)
a_range = (0.5, 5.0)       # Typical range for Semi-major Axis (a), in AU
peri_range = (0.3, 2.0)    # Typical range for Perihelion Distance (Peri.), in AU
incl_range = (0.0, 30.0)   # Typical range for Inclination (Incl.), in degrees

# Generate random values within these ranges
np.random.seed(42)
H = np.random.uniform(*H_range, n_samples)
e = np.random.uniform(*e_range, n_samples)
a = np.random.uniform(*a_range, n_samples)
peri = np.random.uniform(*peri_range, n_samples)
incl = np.random.uniform(*incl_range, n_samples)

# Add Gaussian noise to simulate observational noise
noise_level = 0.05  # Adjust noise level as needed
H += np.random.normal(0, noise_level, n_samples)
e += np.random.normal(0, noise_level, n_samples)
a += np.random.normal(0, noise_level, n_samples)
peri += np.random.normal(0, noise_level, n_samples)
incl += np.random.normal(0, noise_level, n_samples)

# Create DataFrame and labels
df = pd.DataFrame({
    'H': H,
    'e': e,
    'a': a,
    'Peri.': peri,
    'Incl.': incl
})

# Assign labels: 1 for NEO and 0 for non-NEO, with a 50-50 distribution
labels = np.random.choice([0, 1], size=n_samples, p=[0.5, 0.5])
df['label'] = labels

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[['H', 'e', 'a', 'Peri.', 'Incl.']], df['label'], test_size=0.3, random_state=42)

# Train
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.4766666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.53      0.50       146
           1       0.49      0.42      0.45       154

    accuracy                           0.48       300
   macro avg       0.48      0.48      0.48       300
weighted avg       0.48      0.48      0.48       300



Parameters: { "use_label_encoder" } are not used.



# Add hyperparameter tuning using a grid search to improve accuracy

In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[['H', 'e', 'a', 'Peri.', 'Incl.']], df['label'], test_size=0.3, random_state=42)

# Set up the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],         # Controls the impact of each tree
    'max_depth': [3, 5, 7],                    # Limits the depth of each tree (to control overfitting)
    'n_estimators': [50, 100, 200],            # Number of trees (iterations)
    'subsample': [0.7, 0.8, 1.0],              # Fraction of samples used per tree
    'colsample_bytree': [0.6, 0.8, 1.0],       # Fraction of features used per tree
    'gamma': [0, 0.1, 0.2]                     # Minimum loss reduction to split (helps control overfitting)
}

# Initialize
xgb_clf = XGBClassifier(eval_metric='logloss')

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(
    estimator=xgb_clf, 
    param_grid=param_grid, 
    scoring='accuracy', 
    cv=5,                   # 5-fold cross-validation
    verbose=1,              # Print progress
    n_jobs=-1               # Use all available cores
)

# Perform hyperparameter tuning
grid_search.fit(X_train, y_train)

# Print the best parameters and the best accuracy score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Use the best estimator for final predictions
best_xgb = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Parameters: {'colsample_bytree': 0.6, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}
Best Cross-Validation Accuracy: 0.5371428571428571
Test Accuracy: 0.4766666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.54      0.50       146
           1       0.49      0.42      0.45       154

    accuracy                           0.48       300
   macro avg       0.48      0.48      0.48       300
weighted avg       0.48      0.48      0.47       300



In [8]:
print(df)

             H         e         a     Peri.      Incl.  label
0    18.715552  0.170761  1.677731  1.508594  17.087680      0
1    24.387628  0.451302  1.534183  1.702374  24.185213      0
2    22.299328  0.873058  4.602790  0.636086  22.825406      1
3    21.032259  0.711005  1.664638  1.416443   4.569927      0
4    16.587068  0.688916  1.693717  1.180042   4.533532      0
..         ...       ...       ...       ...        ...    ...
995  15.977320  0.512229  4.370838  1.172521  27.033966      0
996  24.222270  0.923181  1.186248  0.943255   3.605252      0
997  16.298183  0.173637  1.828990  0.956542   9.827622      1
998  24.531737 -0.014695  1.783756  1.102295  24.543122      0
999  19.449776  0.292538  4.469602  0.588726  18.011462      1

[1000 rows x 6 columns]
