In [1]:
# Import necessary libraries
import pandas as pd

# The raw file has no header row, so supply the column names ourselves:
# an identifier, the diagnosis label, then feature_1 .. feature_30.
columns = ['ID', 'Diagnosis', *(f'feature_{i}' for i in range(1, 31))]

# Location of the dataset; edit this if the file lives somewhere else
data_path = 'wdbc.data'
data = pd.read_csv(data_path, names=columns, header=None)

# Preview the first rows as a quick sanity check on the load
data.head()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,ID,Diagnosis,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
from sklearn.preprocessing import LabelEncoder

# Turn the 'Diagnosis' letters into integer codes (M = 1, B = 0).
# LabelEncoder numbers the classes in sorted order, which is why 'B' gets 0.
le = LabelEncoder()
le.fit(data['Diagnosis'])
data['Diagnosis'] = le.transform(data['Diagnosis'])

# Preview the identifier next to the encoded label
data.loc[:, ['ID', 'Diagnosis']].head()

Unnamed: 0,ID,Diagnosis
0,842302,1
1,842517,1
2,84300903,1
3,84348301,1
4,84358402,1


In [8]:
# Separate the target from the predictors: every column other than the
# identifier and the label is treated as a feature.
y = data['Diagnosis']
X = data.drop(columns=['ID', 'Diagnosis'])

In [9]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)


In [10]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display the shape of the training and testing data
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((455, 30), (114, 30), (455,), (114,))

In [11]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression classifier and fit it on the training split
# in one step (fit() hands back the estimator itself).
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Show the learned weights, one per input feature
model.coef_

array([[ 0.43464701,  0.39719369,  0.39606985,  0.46999027,  0.06739432,
        -0.52671658,  0.80767618,  1.1077114 , -0.24153785, -0.07844689,
         1.25057276, -0.1889927 ,  0.58988574,  0.91987672,  0.31656977,
        -0.66906627, -0.17107672,  0.31431731, -0.50499861, -0.61176062,
         0.87394431,  1.35751246,  0.58604724,  0.83803013,  0.54737794,
        -0.00589749,  0.95168648,  0.78040785,  1.19559714,  0.16293183]])

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Compare predictions with the true labels: a per-class breakdown
# (rows = true class, columns = predicted class) and the overall hit rate
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report both metrics
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9736842105263158
Confusion Matrix:
 [[70  1]
 [ 2 41]]


In [13]:
from joblib import dump

# Save the model to a file
model_filename = 'breast_cancer_model.joblib'
dump(model, model_filename)

# The model was trained on standardized features, so it is only usable on new
# data together with the fitted scaler. Persist the scaler next to the model
# (in its own file, so the model file keeps its existing format).
scaler_filename = 'breast_cancer_scaler.joblib'
dump(scaler, scaler_filename)

print(f"Model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")

Model saved to breast_cancer_model.joblib
