In [1]:
%pip install pandas numpy scikit-learn tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

# For splitting data
from sklearn.model_selection import train_test_split

# For scaling numerical features
from sklearn.preprocessing import StandardScaler, LabelEncoder

# For metrics
from sklearn.metrics import classification_report, confusion_matrix

# Deep learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [3]:
# 1. LOAD THE DATA
# ----------------------------------------------------------------------------
# Replace 'data.csv' with the actual path if your CSV is named differently 
# or located elsewhere.
df = pd.read_csv("data/cancer_dataset.csv")

In [4]:


# 2. QUICK DATA INSPECTION & CLEANING
# ----------------------------------------------------------------------------
print("Data shape:", df.shape)
print(df.head())




Data shape: (569, 32)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  per

In [5]:
# Check for missing values (if any)
print("\nMissing values:\n", df.isnull().sum())




Missing values:
 id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [6]:
# In many breast cancer datasets, 'id' is often just an identifier and
# is not predictive. We'll drop it if present.
if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)

In [7]:

# 3. PREPROCESS THE TARGET (diagnosis)
# ----------------------------------------------------------------------------
# Typically, 'B' = Benign = 0, 'M' = Malignant = 1
# Let's encode that:
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])  
# After encoding: B -> 0, M -> 1
df['diagnosis']

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [8]:


# 4. SPLIT INTO FEATURES (X) AND TARGET (y)
# ----------------------------------------------------------------------------
X = df.drop(columns=['diagnosis'])  # all columns except diagnosis
y = df['diagnosis']                 # target column


In [9]:


# 5. TRAIN-TEST SPLIT
# ----------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      # 20% test
    random_state=42,    # for reproducibility
    stratify=y          # keep the same class distribution
)


In [10]:

# 6. SCALE THE FEATURES
# ----------------------------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [11]:
X_train.shape

(455, 30)

In [14]:
X_train.shape[1]

30

In [15]:

# 7. BUILD THE ANN MODEL
# ----------------------------------------------------------------------------
# Here’s a simple 3-layer network with:
#   - input_dim = number of features
#   - hidden layer 1 with 16 neurons (relu)
#   - hidden layer 2 with 8 neurons (relu)
#   - output layer with 1 neuron (sigmoid for binary classification)

model = Sequential([
    Dense(units=16, activation='relu', input_dim=X_train.shape[1]),
    Dense(units=8, activation='relu'),
    Dense(units=1, activation='sigmoid')  # single output node for binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:


# 8. COMPILE THE MODEL
# ----------------------------------------------------------------------------
# Use 'binary_crossentropy' for a 2-class problem, 'adam' optimizer is typical
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [17]:
# 9. TRAIN (FIT) THE MODEL
# ----------------------------------------------------------------------------
history = model.fit(
    X_train_scaled, 
    y_train, 
    validation_split=0.2,   # further split the training data into train/val
    epochs=50,              # you can tune the epoch count
    batch_size=12,          # you can tune batch size
    verbose=2
)

Epoch 1/50
31/31 - 3s - 110ms/step - accuracy: 0.4478 - loss: 0.6809 - val_accuracy: 0.5165 - val_loss: 0.6551
Epoch 2/50
31/31 - 1s - 30ms/step - accuracy: 0.6978 - loss: 0.5501 - val_accuracy: 0.7033 - val_loss: 0.5457
Epoch 3/50
31/31 - 0s - 11ms/step - accuracy: 0.8654 - loss: 0.4568 - val_accuracy: 0.8242 - val_loss: 0.4516
Epoch 4/50
31/31 - 1s - 22ms/step - accuracy: 0.9258 - loss: 0.3664 - val_accuracy: 0.9121 - val_loss: 0.3550
Epoch 5/50
31/31 - 0s - 10ms/step - accuracy: 0.9423 - loss: 0.2762 - val_accuracy: 0.9341 - val_loss: 0.2784
Epoch 6/50
31/31 - 0s - 11ms/step - accuracy: 0.9615 - loss: 0.2063 - val_accuracy: 0.9341 - val_loss: 0.2255
Epoch 7/50
31/31 - 0s - 10ms/step - accuracy: 0.9725 - loss: 0.1615 - val_accuracy: 0.9451 - val_loss: 0.1931
Epoch 8/50
31/31 - 0s - 13ms/step - accuracy: 0.9753 - loss: 0.1324 - val_accuracy: 0.9451 - val_loss: 0.1745
Epoch 9/50
31/31 - 0s - 9ms/step - accuracy: 0.9753 - loss: 0.1123 - val_accuracy: 0.9341 - val_loss: 0.1606
Epoch 10/5

In [18]:

# 10. EVALUATE THE MODEL
# ----------------------------------------------------------------------------
# Evaluate on the test set
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print("\nTest Loss: {:.4f}".format(loss))
print("Test Accuracy: {:.4f}".format(accuracy))



Test Loss: 0.0459
Test Accuracy: 0.9825


In [19]:




# 11. MAKE PREDICTIONS & DISPLAY CLASSIFICATION REPORT
# ----------------------------------------------------------------------------
y_pred_prob = model.predict(X_test_scaled)
# Convert probabilities to binary predictions (0 or 1) using 0.5 threshold
y_pred = (y_pred_prob > 0.5).astype("int32").ravel()

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step

Confusion Matrix:
[[71  1]
 [ 1 41]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        72
           1       0.98      0.98      0.98        42

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

