In [1]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Step 1: Load the Breast Cancer dataset
print("[INFO] Loading the Breast Cancer dataset...")
data = load_breast_cancer()

# Convert the dataset to a pandas DataFrame for easier handling
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df['target'] = data.target  # Add the target column

# Display dataset overview
print("[INFO] Dataset overview:")
print(df.head())
print("\nDataset shape:", df.shape)
print("\nTarget variable counts:\n", df['target'].value_counts())

[INFO] Loading the Breast Cancer dataset...
[INFO] Dataset overview:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimensi

In [3]:
# Step 2: Handle missing values (if any)
# Check for missing values
missing_values = df.isnull().sum().sum()
print("\nMissing values in the dataset:", missing_values)


Missing values in the dataset: 0


In [4]:
# Step 3: Split data into features (X) and target (y)
X = df.drop('target', axis=1)  # Features
y = df['target']  # Target variable

# Step 4: Split the data into training and testing sets
print("\n[INFO] Splitting the dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# Step 5: Normalize/Scale the feature data
print("\n[INFO] Normalizing the feature data...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training data
X_test_scaled = scaler.transform(X_test)  # Only transform on testing data

# Convert scaled data back to DataFrame for easy handling (optional)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

print("\n[INFO] Data preparation complete. The dataset is ready for analysis.")



[INFO] Splitting the dataset into training and testing sets...
Training set shape: (455, 30)
Testing set shape: (114, 30)

[INFO] Normalizing the feature data...

[INFO] Data preparation complete. The dataset is ready for analysis.


In [5]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Feature Selection using SelectKBest
selector = SelectKBest(score_func=f_classif, k=10)  # Selecting the top 10 features
X_selected = selector.fit_transform(X, y)

# Get the names of the selected features
selected_features = X.columns[selector.get_support()]
print("[INFO] Selected Features:\n", selected_features)

# Split the dataset into training and testing sets with selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)
print("[INFO] Dataset split into training and testing sets.")


[INFO] Selected Features:
 Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concavity', 'worst concave points'],
      dtype='object')
[INFO] Dataset split into training and testing sets.


In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Define the base MLPClassifier model
mlp = MLPClassifier(max_iter=1000, random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Different layer configurations
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],  # Optimizers
    'alpha': [0.0001, 0.001],  # Regularization term
    'learning_rate': ['constant', 'adaptive'],  # Learning rate types
}

# Set up GridSearchCV
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit GridSearchCV on training data
print("[INFO] Running GridSearchCV...")
grid_search.fit(X_train, y_train)

# Output the best parameters and best score
print("[INFO] Best Parameters:\n", grid_search.best_params_)
print("[INFO] Best Score:\n", grid_search.best_score_)

# Retrieve the best model
best_model = grid_search.best_estimator_


[INFO] Running GridSearchCV...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[INFO] Best Parameters:
 {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
[INFO] Best Score:
 0.9340659340659341


In [7]:
from sklearn.metrics import classification_report, accuracy_score

# Train the best model
print("[INFO] Training the ANN model with best parameters...")
best_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)

# Print accuracy and classification report
print("[INFO] Evaluation Results:")
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[INFO] Training the ANN model with best parameters...
[INFO] Evaluation Results:
Test Accuracy: 0.8508771929824561

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82        42
           1       0.94      0.82      0.87        72

    accuracy                           0.85       114
   macro avg       0.84      0.86      0.85       114
weighted avg       0.87      0.85      0.85       114



In [None]:
import streamlit as st
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_breast_cancer
import joblib


# Load the Breast Cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Feature Selection
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# Load the pre-trained ANN model
model = joblib.load("best_ann_model.pkl")  # Save your model as 'best_ann_model.pkl'

# Streamlit App
st.title("Breast Cancer Prediction App")
st.write("This app predicts whether breast cancer is malignant or benign based on user inputs.")

# Sidebar for user input
st.sidebar.header("Input Features")

def user_input_features():
    input_data = {}
    for feature in selected_features:
        input_data[feature] = st.sidebar.slider(feature, float(X[feature].min()), float(X[feature].max()), float(X[feature].mean()))
    return pd.DataFrame([input_data])

user_data = user_input_features()
st.write("### User Input Features", user_data)

# Predict and Display Results
if st.button("Predict"):
    prediction = model.predict(user_data)
    prediction_proba = model.predict_proba(user_data)
    st.write("### Prediction:", "Malignant" if prediction[0] == 0 else "Benign")
    st.write("### Prediction Probability:", prediction_proba)


ImportError: cannot import name 'joblib' from 'sklearn.externals' (e:\Lambton Files\Term 3\Gupta Neural Network\Week 13\.venv\Lib\site-packages\sklearn\externals\__init__.py)