In [5]:
import zipfile

zip_path = "breast_images.zip.zip"  # match the actual file name

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall("./data")

print("‚úÖ Dataset extracted to ./data folder")



‚úÖ Dataset extracted to ./data folder


In [2]:
import os
import cv2
import numpy as np

img_folder = "./data/Dataset_BUSI_with_GT"
image_data = []
labels = []

for label, folder in enumerate(["benign", "malignant", "normal"]):
    folder_path = os.path.join(img_folder, folder)
    for file in os.listdir(folder_path):
        img_path = os.path.join(folder_path, file)
        img = cv2.imread(img_path)

        # Skip if image is not loaded
        if img is None:
            print(f"‚ö†Ô∏è Skipping file (not an image or unreadable): {img_path}")
            continue

        img = cv2.resize(img, (64, 64))   # resize
        img_flat = img.flatten()          # flatten into 1D vector
        image_data.append(img_flat)
        labels.append(label)

X_image = np.array(image_data)
y_image = np.array(labels)

print("‚úÖ Image data shape:", X_image.shape)
print("‚úÖ Labels shape:", y_image.shape)



‚ö†Ô∏è Skipping file (not an image or unreadable): ./data/Dataset_BUSI_with_GT\benign\multimodal_classification.ipynb
‚úÖ Image data shape: (1578, 12288)
‚úÖ Labels shape: (1578,)


In [3]:
from sklearn.model_selection import train_test_split

# Train-test split for image data
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    X_image, y_image, test_size=0.2, random_state=42
)

print("Train set:", X_train_img.shape, y_train_img.shape)
print("Test set:", X_test_img.shape, y_test_img.shape)


Train set: (1262, 12288) (1262,)
Test set: (316, 12288) (316,)


In [5]:
# ‚úÖ Step 1: Import libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# ‚úÖ Step 2: Split data
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    X_image, y_image, test_size=0.2, random_state=42
)
 # Optional check for imbalance

# ‚úÖ Step 3: Apply SMOTE on training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_img, y_train_img)

 # Check that all classes are balanced

# ‚úÖ Step 4: Train Random Forest
rf_img = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf_img.fit(X_train_res, y_train_res)

# ‚úÖ Step 5: Predict and evaluate
y_pred_img = rf_img.predict(X_test_img)
print("üìä Image Model Accuracy:", accuracy_score(y_test_img, y_pred_img))
print(classification_report(y_test_img, y_pred_img))





üìä Image Model Accuracy: 0.7246835443037974
              precision    recall  f1-score   support

           0       0.75      0.82      0.79       191
           1       0.60      0.50      0.55        84
           2       0.79      0.73      0.76        41

    accuracy                           0.72       316
   macro avg       0.71      0.68      0.70       316
weighted avg       0.72      0.72      0.72       316



In [6]:
import joblib

# Save the trained image model
joblib.dump(rf_img, "image_model.pkl")

print("‚úÖ Image model saved successfully as image_model.pkl!")


‚úÖ Image model saved successfully as image_model.pkl!
