In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
import os

# End-to-end training cell: load the CSV, encode the target, split, scale,
# fit a logistic regression, report metrics, persist the artifacts with joblib
# and demonstrate reloading them for a single prediction.

# 1. Load dataset
df = pd.read_csv("data.csv")  # or full path / URL: "https://raw.githubusercontent.com/.../data.csv"
print(df.head())
# DataFrame.info() prints its own summary and returns None, so wrapping it in
# print() would emit a stray "None". Inspect the non-null counts to confirm
# there are no missing values.
df.info()

# Keep only allowed columns. The explicit .copy() makes this an independent
# frame, so the column assignment below does not write into a slice of the
# original (avoids SettingWithCopyWarning on pandas versions that emit it).
df = df[['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'diagnosis']].copy()

# 2. Preprocessing
# Target encoding: LabelEncoder assigns integer codes in sorted label order,
# so 'B' (benign) -> 0 and 'M' (malignant) -> 1.
# NOTE(review): assumes the column contains exactly the labels 'B' and 'M'.
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])  # M → 1, B → 0

# Features & target
features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']
X = df[features]
y = df['diagnosis']

# 3. Split
# Split BEFORE scaling so the scaler never sees the held-out rows; fitting it
# on the full dataset would leak test-set statistics into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling (mandatory for Logistic Regression stability, KNN, SVM)
# The mean/std are learned from the training rows only and then applied to both
# partitions. Results are wrapped back into DataFrames so the model is fitted
# with named columns.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=features, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=features, index=X_test_raw.index)

# 4. Model - Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 5. Evaluate
# precision/recall/f1 use scikit-learn's default positive label (1), which is
# the malignant class under the encoding above.
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")

# Original author's note: expected ~0.94–0.97 accuracy with these features
# (TODO confirm — not re-measured after moving scaling behind the split).

# 6. Save model + scaler + label encoder (if needed)
os.makedirs("model", exist_ok=True)
joblib.dump(model, "model/breast_cancer_model.joblib")
joblib.dump(scaler, "model/scaler.joblib")
joblib.dump(le, "model/label_encoder.joblib")  # Optional but safe

print("Model saved!")

# 7. Demonstrate reload
loaded_model = joblib.load("model/breast_cancer_model.joblib")
loaded_scaler = joblib.load("model/scaler.joblib")

# Example new data (must scale!)
sample = pd.DataFrame({
    'radius_mean': [17.99],
    'texture_mean': [10.38],
    'perimeter_mean': [122.8],
    'area_mean': [1001.0],
    'smoothness_mean': [0.11840]
})
# transform() returns a bare ndarray; restore the column names so the model,
# which was fitted on a named DataFrame, does not warn about missing feature names.
sample_scaled = pd.DataFrame(loaded_scaler.transform(sample), columns=features)
pred = loaded_model.predict(sample_scaled)[0]
print("Prediction:", "Malignant" if pred == 1 else "Benign")

In [None]:
import os
import pickle

# Destination of the pickled model; it lives inside the model/ folder
# (matches assignment structure).
model_path = "model/breast_cancer_model.pkl"

# Make sure the parent folder exists before writing; a no-op when it is already there.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialize the trained estimator and write the bytes out in binary mode.
with open(model_path, "wb") as model_file:
    model_file.write(pickle.dumps(model))

# Report the absolute location so the artifact is easy to find.
print(f"Model saved successfully to: {os.path.abspath(model_path)}")

In [None]:
# Pickle the fitted scaler into the model/ folder. The context manager closes
# the file handle as soon as the dump finishes, so the bytes are flushed to
# disk deterministically instead of relying on garbage collection.
with open("model/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)
# or if using joblib: joblib.dump(scaler, "model/scaler.joblib")