## Load the Dataset

#### 1. Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


#### 2. Load the Dataset

In [3]:
# Load dataset
df = pd.read_csv("breast_cancer.csv")  # Replace with your actual dataset file name

# Display first 5 rows
df.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


#### 3. Check Column Names

In [10]:
print(df.columns)


Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')


## Data Preprocessing

#### 1. Check for Missing Values

In [11]:
print(df.isnull().sum())


diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed: 32                569
dtype: i

In [12]:
df.fillna(df.mean(), inplace=True)  # Replacing missing values with column mean


  df.fillna(df.mean(), inplace=True)  # Replacing missing values with column mean


#### 2. Convert Categorical Target Variable ('diagnosis') to Numeric

In [13]:
le = LabelEncoder()
df["diagnosis"] = le.fit_transform(df["diagnosis"])  # M → 1, B → 0
df.head()


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


#### 3. Split the Data into Features and Target

In [14]:
X = df.drop(columns=["diagnosis"])  # Features
y = df["diagnosis"]  # Target variable


#### 4. Normalize the Data (Feature Scaling)

In [15]:
scaler = StandardScaler()
X = scaler.fit_transform(X)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


#### 5. Split Data into Training and Testing Sets

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


##  Model Training

#### 1. Train the Model

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)  # Training the model


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

#### Handle Missing Values

In [18]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # Replace NaNs with column mean
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)




In [19]:
# Drop column 30 if it's entirely NaN
X_train = pd.DataFrame(X_train)  # Convert to DataFrame for easier handling
X_test = pd.DataFrame(X_test)

if X_train.isnull().all().any():
    X_train = X_train.dropna(axis=1, how='all')
    X_test = X_test.dropna(axis=1, how='all')

# Now handle missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


###  Train the Model Again

In [20]:
model = LogisticRegression()
model.fit(X_train, y_train)  # Training the model


In [21]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", round(accuracy, 4))


Model Accuracy: 0.9737


In [22]:
import joblib
joblib.dump(model, 'breast_cancer_model.pkl')



['breast_cancer_model.pkl']

In [23]:
import joblib

# Train your model (make sure it's named 'model')
joblib.dump(model, 'breast_cancer_model.pkl')  # Save the model properly


['breast_cancer_model.pkl']

In [24]:
import pickle

with open('breast_cancer_model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [25]:
model = joblib.load('breast_cancer_model.pkl')
print(model)  # Check if the model loads correctly


LogisticRegression()


In [28]:
import numpy as np

# Assuming user_input is a list or array containing only 5 features (incorrect)
user_input = np.array([10, 20, 15, 0.2, 0.4])  # Example incorrect input

# Reshape it to match the required number of features
if user_input.shape[0] != 30:
    st.error("Error: The input should have 30 features, but received only {}".format(user_input.shape[0]))
else:
    prediction = model.predict([user_input])


NameError: name 'st' is not defined

In [29]:
print(X_train.shape)  # It should be (rows, 30)


(455, 30)


In [30]:
import joblib
import numpy as np

model = joblib.load('breast_cancer_model.pkl')
test_input = np.random.rand(1, 30)  # Simulate 30 input features
print(model.predict(test_input))


[1]
