In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# NOTE(review): machine-specific absolute path; point this at your local copy
# of the dataset before running the notebook on another machine.
file_path = "/Users/iranblanco/Desktop/Masters/ANA 680/Week 1/HW1/breast-cancer-wisconsin.data"

# The file is read with header=None, so the column names are supplied here.
column_names = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class"
]

# Treat the '?' placeholder as NaN while parsing, so every column is read as a
# numeric dtype straight away (no post-hoc replace / to_numeric on one column),
# then drop the rows that contain a missing value. Built as one expression
# instead of mutating the frame in place.
df = pd.read_csv(
    file_path,
    header=None,
    names=column_names,
    na_values="?",
).dropna()

# Defining features (X) and target variable (y):
# every column except "Sample code number" and "Class" is a feature.
X = df.drop(columns=["Sample code number", "Class"])
y = df["Class"]

# Splitting into training (75%) and testing (25%); fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Persist the splits as plain NumPy arrays (explicit conversion rather than
# relying on np.save to coerce the pandas objects).
np.save("X_train.npy", X_train.to_numpy())
np.save("X_test.npy", X_test.to_numpy())
np.save("y_train.npy", y_train.to_numpy())
np.save("y_test.npy", y_test.to_numpy())

print("Data preprocessing completed and saved.")

Data preprocessing completed and saved.


In [7]:
# Read back the train/test arrays from the .npy files in the working directory
X_train, X_test = np.load("X_train.npy"), np.load("X_test.npy")
y_train, y_test = np.load("y_train.npy"), np.load("y_test.npy")

# Binarise the labels: 2 (Benign) becomes 0, any other value (4, Malignant) becomes 1
y_train = (y_train != 2).astype(int)
y_test = (y_test != 2).astype(int)

# Fit an XGBoost classifier with a fixed seed
model = XGBClassifier(eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

# Score the held-out test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Write a one-row summary table to CSV
results = pd.DataFrame([{"Model": "XGBoost", "Accuracy": accuracy}])
results.to_csv("results_xgboost.csv", index=False)

# Report the metrics
print("XGBoost Results:")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

XGBoost Results:
Accuracy: 0.9532163742690059
Confusion Matrix:
 [[102   1]
 [  7  61]]
