In [1]:
# Introduction: In this notebook, we apply several supervised classification techniques to the breast cancer dataset from the scikit-learn library.


In [2]:
#Loading and Preprocessing

# Importing required libraries
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Fetch the bundled breast cancer dataset and wrap it in pandas containers
data = load_breast_cancer()
feature_matrix, feature_labels = data["data"], data["feature_names"]
X = pd.DataFrame(data=feature_matrix, columns=feature_labels)  # one column per named feature
y = pd.Series(data["target"])  # target values exactly as shipped with the dataset

In [4]:
# Total number of null entries across every feature column (0 means nothing to impute)
n_missing = X.isna().to_numpy().sum()
print(n_missing)


0


In [5]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
# Classification Algorithm Implementation

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [16]:
# Logistic Regression

# Fit on the training split, then score the predictions made for the held-out split
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Logistic Regression Accuracy:", acc_lr)


Logistic Regression Accuracy: 0.9736842105263158


In [18]:
# Decision Tree
# random_state is pinned so repeated runs build the same tree
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print("Decision Tree Accuracy:", acc_dt)



Decision Tree Accuracy: 0.9473684210526315


In [20]:
# Random Forest

# random_state is pinned so repeated runs build the same ensemble
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Accuracy:", acc_rf)


Random Forest Accuracy: 0.9649122807017544


In [22]:
# Support Vector Machine

# SVC with library defaults, trained on the training split
svm = SVC().fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
acc_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("SVM Accuracy:", acc_svm)


SVM Accuracy: 0.9736842105263158


In [24]:
# k-Nearest Neighbors

# KNeighborsClassifier with library defaults, trained on the training split
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print("k-NN Accuracy:", acc_knn)


k-NN Accuracy: 0.9473684210526315


In [26]:
# Model Comparison

# Pair each display name with the accuracy computed in its own cell above
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'k-NN']
model_scores = [acc_lr, acc_dt, acc_rf, acc_svm, acc_knn]
results = dict(zip(model_names, model_scores))

# One "name: accuracy" line per model, accuracy shown to four decimals
for model, acc in results.items():
    line = f"{model}: {acc:.4f}"
    print(line)


Logistic Regression: 0.9737
Decision Tree: 0.9474
Random Forest: 0.9649
SVM: 0.9737
k-NN: 0.9474


In [None]:
# SUMMARY / Explanation

**Missing Values**: Checked using `isnull()`. The dataset has no missing values, i.e., the total count of null entries is 0.
**Feature Scaling**: Used `StandardScaler` to scale features. Scaling is important for algorithms like SVM and k-NN to perform well.
**Train-Test Split**: Used an 80-20 split to evaluate model performance on unseen data.

**Logistic Regression** is a linear model for binary classification. It is effective when classes are linearly separable. Since breast cancer is a binary problem (malignant/benign), it's a good starting point.
**Random Forest** is an ensemble method combining multiple decision trees to improve accuracy and prevent overfitting.
**SVM** finds the optimal hyperplane that separates classes. It performs well with high-dimensional data and is robust to overfitting.
**k-NN** classifies data points based on the majority label of the k-nearest neighbors. It's simple but sensitive to the choice of k and feature scaling.

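**Best Performing**: In the results above, Logistic Regression and SVM tie for the highest test accuracy (0.9737), followed by Random Forest (0.9649).
**Least Performing**: Decision Tree and k-NN score lowest (0.9474 each). A single decision tree is prone to overfitting, and k-NN is sensitive to the choice of k and to feature scaling.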
    