# Step 1: Load the dataset


In [1]:
from google.colab import files

# Upload the CSV file from your local computer.
# This cell only works inside Google Colab; the recorded output shows the chosen
# file being saved under its own name (breast_cancer_bd.csv), which is the
# path the loading cell reads from. The call's return value is kept in `uploaded`.
uploaded = files.upload()

Saving breast_cancer_bd.csv to breast_cancer_bd.csv


# Step 2: Inspect
Inspect the Breast Cancer dataset for any non-numeric values or missing entries.


In [18]:
import pandas as pd

# Read the uploaded CSV into a DataFrame; later cells refer to it as `data`.
data = pd.read_csv("breast_cancer_bd.csv")

# Preview the first five rows to get a feel for the columns and value ranges.
data.head(5)


Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Step 3a: Drop unnecessary column and check for non-numeric values
We remove the 'Sample code number' column because it is not needed for prediction.  
Then we check each column for non-numeric values or missing entries.


In [19]:
# Drop the sample code column (not needed for prediction).
# Re-binding `data` with errors="ignore" keeps this cell safe to re-run:
# an in-place drop raises KeyError on the second execution because the
# column has already been removed.
data = data.drop(columns="Sample code number", errors="ignore")

# -----------------------------
# Exploratory check for non-numeric values
# -----------------------------
# Coerce every column to numeric; anything that cannot be parsed becomes NaN,
# so the per-column NaN count covers both missing and non-numeric entries.
print("Non-numeric / NaN values per column:")
print(data.apply(pd.to_numeric, errors="coerce").isna().sum())


Non-numeric / NaN values per column:
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64


# Step 3b: Check specific column for non-numeric values
The check above shows that only the 'Bare Nuclei' column contains problematic entries,
so we scan this column specifically to identify the rows that cannot be converted to numeric values.


In [22]:
# Flag the rows whose 'Bare Nuclei' entry cannot be parsed as a number:
# coercion turns unparseable entries into NaN, and the NaN mask selects those rows.
bare_nuclei_as_number = pd.to_numeric(data["Bare Nuclei"], errors="coerce")
unparseable_mask = bare_nuclei_as_number.isna()
bad_rows = data[unparseable_mask]

print("\nRows in 'Bare Nuclei' with problematic values (if any):")
bad_rows



Rows in 'Bare Nuclei' with problematic values (if any):


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
23,8,4,5,1,2,?,7,3,1,4
40,6,6,6,9,6,?,7,8,1,2
139,1,1,1,1,1,?,2,1,1,2
145,1,1,3,1,2,?,2,1,1,2
158,1,1,2,1,3,?,1,1,1,2
164,5,1,1,1,2,?,3,1,1,2
235,3,1,4,1,2,?,3,1,1,2
249,3,1,1,1,2,?,3,1,1,2
275,3,1,3,1,2,?,2,1,1,2
292,8,8,8,1,2,?,6,10,1,4


# Step 4: Clean, train, and evaluate the Logistic Regression model
1. Clean the 'Bare Nuclei' column by replacing '?' with NaN and filling missing values with the mode.
2. Separate features (X) and target (y).
3. Scale the features using StandardScaler.
4. Split the dataset into training and test sets.
5. Train the Logistic Regression model.
6. Make predictions and evaluate accuracy, confusion matrix, and classification report.


In [32]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Clean 'Bare Nuclei' column
# -----------------------------
# Coerce to numeric first so '?' (or any other unparseable token) becomes NaN,
# then impute with the mode of the *valid* values only. Taking the mode of the
# raw column would count '?' as a candidate fill value and would leave the
# column as strings, relying on implicit string-to-float conversion later.
bare_nuclei = pd.to_numeric(data["Bare Nuclei"], errors="coerce")
data["Bare Nuclei"] = bare_nuclei.fillna(bare_nuclei.mode()[0])

# -----------------------------
# Separate features and target
# -----------------------------
X_raw = data.drop(columns="Class")
y = data["Class"]

# -----------------------------
# Split dataset
# -----------------------------
# Split BEFORE scaling so the held-out rows never influence preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.3, random_state=42
)

# -----------------------------
# Scale features
# -----------------------------
# Fit the scaler on the training rows only and reuse its statistics for the
# test rows; fitting on the full dataset leaks test-set mean/variance into
# training and makes the reported metrics optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Keep `X` bound to the scaled full feature matrix, as before, now scaled
# with the training-set statistics.
X = scaler.transform(X_raw)

# -----------------------------
# Train Logistic Regression
# -----------------------------
model = LogisticRegression()
model.fit(X_train, y_train)

# -----------------------------
# Make predictions and evaluate
# -----------------------------
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Model Accuracy: {accuracy*100:.2f}%\n")
print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# -----------------------------
# First 10 predictions
# -----------------------------
# y_test keeps the shuffled original row labels as its index, so take the
# first ten rows explicitly by position with .iloc.
print("\nFirst 10 Test set prediction: ")
for i, (pred, actual) in enumerate(zip(y_pred[:10], y_test.iloc[:10]), 1):
    pred_label = "Benign (2)" if pred == 2 else "Malignant (4)"
    actual_label = "Benign (2)" if actual == 2 else "Malignant (4)"
    correct = "Correct" if pred == actual else "Wrong"
    print(f"Sample {i}: Predicted → {pred_label}, Actual → {actual_label}, Result → {correct}")




Model Accuracy: 96.67%

Confusion Matrix:
[[141   2]
 [  5  62]]

Classification Report:
              precision    recall  f1-score   support

           2       0.97      0.99      0.98       143
           4       0.97      0.93      0.95        67

    accuracy                           0.97       210
   macro avg       0.97      0.96      0.96       210
weighted avg       0.97      0.97      0.97       210


First 10 Test set prediction: 
Sample 1: Predicted → Benign (2), Actual → Benign (2), Result → Correct
Sample 2: Predicted → Benign (2), Actual → Benign (2), Result → Correct
Sample 3: Predicted → Benign (2), Actual → Benign (2), Result → Correct
Sample 4: Predicted → Malignant (4), Actual → Malignant (4), Result → Correct
Sample 5: Predicted → Benign (2), Actual → Benign (2), Result → Correct
Sample 6: Predicted → Benign (2), Actual → Benign (2), Result → Correct
Sample 7: Predicted → Benign (2), Actual → Benign (2), Result → Correct
Sample 8: Predicted → Malignant (4), Actua