In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the scikit-learn breast-cancer dataset into a DataFrame: one column per
# feature, plus a 'target' column holding the class labels from data.target.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Check for missing values (prints the null count of every column)
print(df.isnull().sum())

# Split the data into training and testing sets BEFORE scaling, so the
# held-out 20% never contributes to the scaler's mean/std (no data leakage).
# The split depends only on the row count and random_state, so the same rows
# land in each partition as when the split was done after scaling.
features = df.drop('target', axis=1)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, df['target'], test_size=0.2, random_state=42
)

# Feature scaling: learn the statistics from the training rows only, then
# apply that same transformation to both partitions. The results are wrapped
# back into DataFrames so column names and the original row index survive.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=features.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=features.columns,
    index=X_test_raw.index,
)

# Kept for any code that still expects these names: the whole dataset scaled
# with the training-set statistics (not re-fitted on all rows).
scaled_features = scaler.transform(features)
df_scaled = pd.DataFrame(scaled_features, columns=features.columns)
df_scaled['target'] = df['target']


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [None]:
Explanation:

Missing values: Check for missing values in the dataset and handle them (if any) by imputing or removing them. In this case, the dataset does not have missing values.

Feature scaling: Standardize the features to have a mean of 0 and a standard deviation of 1, which is crucial for algorithms like SVM and k-NN that are sensitive to feature scales.

In [11]:
# 1. Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default hyperparameters on the
# training split (fit() returns the estimator itself), then measure its mean
# accuracy on the held-out test split.
lr = LogisticRegression().fit(X_train, y_train)
lr_score = lr.score(X_test, y_test)
print(f"Logistic Regression Accuracy: {lr_score:.2f}")


Logistic Regression Accuracy: 0.97


In [13]:
# 2. Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split and report its mean accuracy on
# the test split. random_state is pinned (same seed as the train/test split)
# because the tree builder uses randomness when choosing splits; without a
# seed the reported accuracy can change from one run to the next.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_score = dt.score(X_test, y_test)
print(f"Decision Tree Accuracy: {dt_score:.2f}")


Decision Tree Accuracy: 0.93


In [15]:
# 3. Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the training split and report its mean accuracy on
# the test split. random_state is pinned (same seed as the train/test split)
# because the forest is built from random resampling and random feature
# selection; without a seed the reported accuracy is not reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_score = rf.score(X_test, y_test)
print(f"Random Forest Accuracy: {rf_score:.2f}")


Random Forest Accuracy: 0.96


In [17]:
# 4. Support Vector Machine
from sklearn.svm import SVC

# Train a support-vector classifier with default hyperparameters on the
# training split (fit() returns the estimator itself), then measure its mean
# accuracy on the held-out test split.
svm = SVC().fit(X_train, y_train)
svm_score = svm.score(X_test, y_test)
print(f"SVM Accuracy: {svm_score:.2f}")


SVM Accuracy: 0.97


In [21]:
# 5. k-nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with default hyperparameters on the
# training split (fit() returns the estimator itself), then measure its mean
# accuracy on the held-out test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn_score = knn.score(X_test, y_test)
print(f"k-NN Accuracy: {knn_score:.2f}")

# Description: k-NN classifies a sample based on the majority class of its k nearest neighbors. It's simple but can be computationally expensive.

k-NN Accuracy: 0.95


In [23]:
# Collect each model's test accuracy under its display name. Insertion order
# matters: when two models tie, max()/min() below return the one listed first.
results = dict([
    ("Logistic Regression", lr_score),
    ("Decision Tree", dt_score),
    ("Random Forest", rf_score),
    ("SVM", svm_score),
    ("k-NN", knn_score),
])

# Pick the model names with the highest and lowest accuracy by looking each
# name's score up in the mapping.
best_model = max(results, key=results.__getitem__)
worst_model = min(results, key=results.__getitem__)

print(f"Best Model: {best_model} with Accuracy: {results[best_model]:.2f}")
print(f"Worst Model: {worst_model} with Accuracy: {results[worst_model]:.2f}")


Best Model: Logistic Regression with Accuracy: 0.97
Worst Model: Decision Tree with Accuracy: 0.93
