In [26]:
from sklearn.datasets import load_breast_cancer

In [27]:
# Load the dataset
data = load_breast_cancer()
x = data.data # Feature matrix
y = data.target # Target variable (0 for malignant, 1 for benign; see data.target_names)

In [28]:
import pandas as pd
# Wrap the feature matrix in a labelled DataFrame so it can be inspected by column name
df = pd.DataFrame(data=x, columns=data.feature_names)
# Total count of missing cells across every column (isna is the alias of isnull)
print("Missing values :\n", df.isna().sum().sum())

Missing values :
 0


In [29]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used by the following cell to standardize the feature matrix
scaler = StandardScaler()

In [30]:
# Standardize the full feature matrix: learn the per-feature statistics, then apply them.
# NOTE(review): this fits on all rows (before any train/test split) and x_scaled is not
# referenced by the later cells visible in this notebook.
x_scaled = scaler.fit(x).transform(x)

In [31]:
# Single-column DataFrame holding the class labels under the name 'Target'
y_df = pd.DataFrame({'Target': y})

In [32]:
#To ensure the dataset is ready for modeling, let’s start by loading and preprocessing it:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


In [33]:
# Load the dataset again and unpack the feature matrix (x) and class labels (y)
data = load_breast_cancer()
x, y = data.data, data.target

In [34]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Standardize features for scale-sensitive algorithms.
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
# Logistic Regression: fit on the training split, then predict the held-out rows
model_lr = LogisticRegression().fit(X_train, Y_train)
y_pred_lr = model_lr.predict(X_test)

# Heading followed by the per-class precision / recall / F1 report
print("Logistic Regression Results:", classification_report(Y_test, y_pred_lr), sep="\n")

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [37]:
# Decision Tree Classifier
# This cell repeats the load / split / scale steps with the same test_size and seed
# as the earlier cells, and introduces the lowercase y_train / y_test names that the
# following model cells rely on.
data = load_breast_cancer()
X, y = data.data, data.target  # feature matrix, target variable

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: statistics come from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the tree (seeded for reproducibility) and predict the held-out rows
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

print("Decision Tree Classifier Results:", classification_report(y_test, y_pred_dt), sep="\n")

Decision Tree Classifier Results:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [38]:
# Random Forest Classifier
# Train on y_train, the labels produced by the same train_test_split call as the
# X_train / X_test / y_test used here. The previous code fitted on Y_train, which
# comes from a separate, earlier split and only matched because both splits share
# the same data and random_state.
model_rf = RandomForestClassifier(random_state = 42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Heading followed by the per-class precision / recall / F1 report
print("Random Forest Classifier Results:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classifier Results:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [39]:
# Support Vector Machine (SVM)
# Linear kernel is commonly effective for binary classification
model_svm = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)

# Heading followed by the per-class precision / recall / F1 report
print("Support Vector Machine Results:", classification_report(y_test, y_pred_svm), sep="\n")

Support Vector Machine Results:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        43
           1       0.97      0.96      0.96        71

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [41]:
# k-Nearest Neighbors (k-NN) with k = 5 neighbours
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

# Heading followed by the per-class precision / recall / F1 report
print("k-Nearest Neighbors Results:", classification_report(y_test, y_pred_knn), sep="\n")

k-Nearest Neighbors Results:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



1. Logistic Regression
How it Works: Logistic Regression is a linear model for binary classification that estimates the probability of an instance belonging to a particular class by applying a logistic (sigmoid) function to a linear combination of input features. The output is a probability between 0 and 1, typically using a threshold of 0.5 for classification.

Suitability: Logistic Regression is effective for binary classification problems like this dataset, where the goal is to classify tumors as benign or malignant. It’s a straightforward, interpretable model and performs well as a baseline when the classes are approximately linearly separable.

2. Decision Tree Classifier
How it Works: A Decision Tree Classifier recursively splits the data into subsets based on feature values, forming a tree structure. Each split is chosen to best separate classes, leading to branches and leaf nodes that represent class labels. It’s a non-parametric algorithm, meaning it doesn’t assume any particular data distribution.

Suitability: Decision Trees can capture complex, non-linear relationships in data and work well for datasets with mixed feature importance, such as this one. However, they can be prone to overfitting, especially with deep trees, so limiting the tree depth or using ensemble methods (like Random Forests) can improve performance.

3. Random Forest Classifier
How it Works: Random Forest is an ensemble method that builds multiple decision trees using random subsets of the data and features. Each tree’s output is aggregated (typically by majority vote) to make the final prediction. This approach reduces overfitting by averaging out the high variance of individual trees.

Suitability: Random Forests are ideal for datasets like this one, which contain complex patterns. They are more robust than single decision trees, usually yielding higher accuracy by averaging multiple predictions. Additionally, they provide feature importance metrics, which can help identify the most influential factors in classification.

4. Support Vector Machine (SVM)
How it Works: SVM finds a hyperplane in the feature space that maximizes the margin between the two classes. It uses support vectors (the data points closest to the hyperplane) to define this boundary. SVM can also be configured with different kernels (e.g., linear, polynomial, radial basis function) to capture non-linear relationships.

Suitability: SVM is highly effective for binary classification, especially on datasets where classes are well-separated, as in this case. With a linear kernel, SVM works well for high-dimensional data like the 30 features in this dataset, providing a robust boundary for the benign and malignant classes. However, it may be computationally expensive on larger datasets.

5. k-Nearest Neighbors (k-NN)
How it Works: k-NN is an instance-based algorithm that assigns a class label to a data point based on the majority label among its k nearest neighbors in the feature space. It relies on a distance metric (typically Euclidean distance) to identify the closest neighbors.

Suitability: k-NN is suitable for this dataset due to its simplicity and effectiveness in binary classification, particularly for smaller datasets. However, it is sensitive to feature scales, so standardization is essential. It may be less efficient for high-dimensional or noisy data, but with proper scaling, it can be effective here.

Each algorithm offers unique strengths: Logistic Regression provides simplicity, Decision Trees and Random Forests capture complex patterns, SVM effectively handles high-dimensional data, and k-NN offers straightforward, distance-based classification. Testing each one on this dataset allows us to evaluate which approach best suits the classification of benign and malignant tumors based on their features.


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score


In [44]:
# Display name -> unfitted estimator, in the order they will be trained and reported
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Support Vector Machine', SVC(kernel='linear', random_state=42)),
    ('k-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
])

In [45]:
# Collects one {'Accuracy': ..., 'F1 Score': ...} entry per model name; starts empty
performance = dict()


In [46]:
# Fit every candidate on the training split and score it on the held-out split
for model_name, model in models.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # Both metrics are computed with their default arguments
    accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
    performance[model_name] = {'Accuracy': accuracy, 'F1 Score': f1}


In [47]:
# One summary line per model, metrics rounded to four decimal places
for model_name, metrics in performance.items():
    print(
        f"{model_name} - Accuracy: {metrics['Accuracy']:.4f}, F1 Score: {metrics['F1 Score']:.4f}"
    )

Logistic Regression - Accuracy: 0.9737, F1 Score: 0.9790
Decision Tree - Accuracy: 0.9474, F1 Score: 0.9577
Random Forest - Accuracy: 0.9649, F1 Score: 0.9722
Support Vector Machine - Accuracy: 0.9561, F1 Score: 0.9645
k-Nearest Neighbors - Accuracy: 0.9474, F1 Score: 0.9577
