In [5]:
# Importing the Dependencies
import numpy as np
import pandas as pd
import pickle
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_breast_cancer


In [6]:
#Loard The DataSet
data_set = load_breast_cancer()
X = pd.DataFrame(data_set.data, columns=data_set.feature_names)
Y = pd.Series(data_set.target)


In [7]:
# loading the data to a data frame
data_frame = pd.DataFrame(data_set.data, columns = data_set.feature_names)

In [20]:
# print the first 5 rows of the dataframe
data_frame.head()
data_frame.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [9]:
# Splitting the Data into Training and Test Data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)


In [10]:
# Data Preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Hyperparameter Tuning using Grid Search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [100, 200, 300, 400, 500]}
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_scaled, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
# Get the Best hyperparameters from grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 0.1, 'max_iter': 100}


In [13]:
# Model Training with Best Hyperparameters
cancer_model = LogisticRegression(**best_params, random_state=42)
cancer_model.fit(X_train_scaled, Y_train)

In [14]:
# accuracy on training data
train_predictions = cancer_model.predict(X_train_scaled)
accuracy = accuracy_score(train_predictions, Y_train)
print('Accuracy on Training data:', accuracy)

Accuracy on Training data: 0.9868131868131869


In [15]:
# accuracy on test data
test_predictions = cancer_model.predict(X_test_scaled)
accuracy = accuracy_score(test_predictions, Y_test)
print('Accuracy on Test data:', accuracy)


Accuracy on Test data: 0.9736842105263158


In [16]:
# Evaluation Metrics
conf_matrix = confusion_matrix(Y_test, test_predictions)
class_report = classification_report(Y_test, test_predictions)

print('\nConfusion Matrix:\n', conf_matrix)
print('\nClassification Report:\n', class_report)


Confusion Matrix:
 [[40  2]
 [ 1 71]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [17]:
#test program with sample input from the dataset
sample_input = np.array([14.5, 21.2, 98.0, 654.3, 0.102, 0.107, 0.081, 0.066, 0.176, 0.059, 0.271, 0.792, 2.613, 26.5, 0.005, 0.022, 0.020, 0.007, 0.025, 0.004, 15.3, 28.8, 98.0, 708.8, 0.127, 0.345, 0.391, 0.109, 0.198, 0.06]).reshape(1, -1)

# Scale the sample input using the same scaler used for training
sample_input_scaled = scaler.transform(sample_input)

# Make prediction using cancer model
prediction = cancer_model.predict(sample_input_scaled)
print("Predicted class:", prediction[0])

# prediction result
if prediction[0] == 0:
    print('Benign (non-cancerous)')
else:
    print('Malignant (cancerous)')


Predicted class: 1
Malignant (cancerous)




In [18]:
#Save the created model
filename = 'breast_cancer_prediction_model.pkl'
joblib.dump(cancer_model, filename)

['breast_cancer_prediction_model.pkl']