# Breast Cancer Classification System

In [1]:
#  Import necessary libraries

from sklearn.datasets import load_breast_cancer  # Load the dataset
from sklearn.model_selection import train_test_split  # For splitting the data
import pandas as pd  # For creating and manipulating dataframes

# Fixed seed shared by both splits below so the partitions are reproducible
SEED = 42

# Fetch the breast cancer dataset bundled with scikit-learn
cancer_data = load_breast_cancer()

# Build a single frame: one named column per feature, with the label
# appended as the last column ("target")
df = pd.DataFrame(
    cancer_data.data,
    columns=cancer_data.feature_names,
).assign(target=cancer_data.target)

# Two-stage split: hold out 20% of the rows first, then cut that hold-out in
# half, giving 80% training / 10% validation / 10% test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# Report (rows, columns) for each partition
print("Training set size:", train_df.shape)
print("Validation set size:", val_df.shape)
print("Test set size:", test_df.shape)

Training set size: (455, 31)
Validation set size: (57, 31)
Test set size: (57, 31)


In [2]:
def silly_rule(df, threshold=14):
    """Naive baseline: predict 1 when ``mean radius`` exceeds ``threshold``, else 0.

    NOTE(review): scikit-learn documents this dataset's ``target`` as
    0 = malignant, 1 = benign. This rule assigns 1 to *large* radii, i.e. the
    opposite of that encoding, which is consistent with the ~16% validation
    accuracy recorded in this notebook. Confirm against
    ``cancer_data.target_names`` before reading a prediction of 1 as "cancer".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``"mean radius"`` column.
    threshold : float, default 14
        Radius cut-off; values strictly greater than it are predicted as 1.

    Returns
    -------
    pandas.Series
        Integer 0/1 predictions named ``"prediction"``, indexed like ``df``.
        ``df`` itself is not modified; the caller decides where to store them.
    """
    # Build the predictions as a standalone Series instead of writing a column
    # into the caller's frame (which may be a slice of another DataFrame).
    return (df["mean radius"] > threshold).astype(int).rename("prediction")

In [3]:
# Score the silly rule on the validation split: keep its predictions next to
# the labels, then measure the share of rows where the two columns agree.
val_df["prediction"] = silly_rule(val_df)
accuracy = val_df["prediction"].eq(val_df["target"]).mean()
print(f"--Silly rule accuracy on validation test : {accuracy:.2%}")

--Silly rule accuracy on validation test : 15.79%


In [4]:
# Score the silly rule on the held-out test split.
test_df["prediction"] = silly_rule(test_df)
# Accuracy = fraction of test rows whose prediction equals the label.
silly_test_accuracy = test_df["prediction"].eq(test_df["target"]).mean()
# NOTE: the format spec " 0.2%" begins with a space, which reserves a sign
# position and is why the printed percentage is preceded by two spaces.
print(f"--Silly rule accuracy on test data: {silly_test_accuracy : 0.2%}")

--Silly rule accuracy on test data:  12.28%


In [5]:
def sensible_rule(df, threshold=1000):
    """Predict 1 when ``mean area`` is below ``threshold``, else 0.

    NOTE(review): scikit-learn documents this dataset's ``target`` as
    0 = malignant, 1 = benign, so a prediction of 1 here means "small area,
    presumed benign" — not malignant. Confirm against
    ``cancer_data.target_names``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``"mean area"`` column.
    threshold : float, default 1000
        Area cut-off; values strictly less than it are predicted as 1.

    Returns
    -------
    pandas.Series
        Integer 0/1 predictions named ``"prediction"``, indexed like ``df``.
        ``df`` itself is not modified; the caller decides where to store them.
    """
    # Return a standalone Series rather than adding a column to the caller's
    # frame, so evaluating a rule never changes the data it is evaluated on.
    return (df["mean area"] < threshold).astype(int).rename("prediction")

In [6]:
# Score the sensible rule on the validation split: keep its predictions next
# to the labels, then measure the share of rows where the two columns agree.
val_df["prediction"] = sensible_rule(val_df)
accuracy = val_df["prediction"].eq(val_df["target"]).mean()
print(f"--Sensible rule accuracy on validation test : {accuracy:.2%}")

--Sensible rule accuracy on validation test : 68.42%


In [7]:
# Score the sensible rule on the held-out test split.
test_df["prediction"] = sensible_rule(test_df)
# Accuracy = fraction of test rows whose prediction equals the label.
test_accuracy = test_df["prediction"].eq(test_df["target"]).mean()
# NOTE: this cell prints three decimals (" 0.3%") whereas the other accuracy
# cells print two; the leading space in the spec adds an extra space of output.
print(f"--Sensible run accuracy on test data: {test_accuracy : 0.3%}")

--Sensible run accuracy on test data:  85.965%


In [8]:
def soph_rule(df, max_perimeter=100, min_texture=15):
    """Two-feature rule: predict 1 when perimeter is small AND texture is large.

    A row is predicted as 1 only if ``mean perimeter < max_perimeter`` and
    ``mean texture > min_texture``; otherwise 0. The earlier comment on this
    rule described the texture test as "< 15", but the implemented (and
    evaluated) condition is ``> 15``; that behavior is kept here.

    NOTE(review): scikit-learn documents this dataset's ``target`` as
    0 = malignant, 1 = benign — confirm against ``cancer_data.target_names``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric ``"mean perimeter"`` and ``"mean texture"``
        columns.
    max_perimeter : float, default 100
        Exclusive upper bound on ``mean perimeter``.
    min_texture : float, default 15
        Exclusive lower bound on ``mean texture``.

    Returns
    -------
    pandas.Series
        Integer 0/1 predictions named ``"prediction"``, indexed like ``df``.
        ``df`` itself is not modified; the caller decides where to store them.
    """
    small_perimeter = df["mean perimeter"] < max_perimeter
    coarse_texture = df["mean texture"] > min_texture
    # Combine element-wise and return a standalone Series instead of writing a
    # column into the caller's frame.
    return (small_perimeter & coarse_texture).astype(int).rename("prediction")
    

In [9]:
# Score the sophisticated rule on the validation split: keep its predictions
# next to the labels, then measure the share of rows where the columns agree.
val_df["prediction"] = soph_rule(val_df)
accuracy = val_df["prediction"].eq(val_df["target"]).mean()
print(f"--Sophisticated rule accuracy on validation test : {accuracy:0.2%}")

--Sophisticated rule accuracy on validation test : 71.93%


In [10]:
# Score the sophisticated rule on the held-out test split.
test_df["prediction"] = soph_rule(test_df)
# Accuracy = fraction of test rows whose prediction equals the label.
sophisticated_test_accuracy = test_df["prediction"].eq(test_df["target"]).mean()
# NOTE: the format spec " 0.2%" begins with a space, which reserves a sign
# position and is why the printed percentage is preceded by two spaces.
print(f"--Sophisticated run accuracy on test data: {sophisticated_test_accuracy : 0.2%}")

--Sophisticated run accuracy on test data:  78.95%
